early-access version 2433

This commit is contained in:
pineappleEA
2022-01-25 01:41:08 +01:00
parent ae416cfe9f
commit f39c04fb30
7481 changed files with 1809449 additions and 26 deletions

externals/ffmpeg/ffmpeg/libswscale/Makefile vendored Executable file

@@ -0,0 +1,30 @@
NAME = swscale
DESC = FFmpeg image rescaling library

HEADERS = swscale.h                             \
          version.h                             \

OBJS = alphablend.o                             \
       hscale.o                                 \
       hscale_fast_bilinear.o                   \
       gamma.o                                  \
       input.o                                  \
       options.o                                \
       output.o                                 \
       rgb2rgb.o                                \
       slice.o                                  \
       swscale.o                                \
       swscale_unscaled.o                       \
       utils.o                                  \
       yuv2rgb.o                                \
       vscale.o                                 \

OBJS-$(CONFIG_SHARED)        += log2_tab.o

# Windows resource file
SLIBOBJS-$(HAVE_GNU_WINDRES) += swscaleres.o

TESTPROGS = colorspace                          \
            floatimg_cmp                        \
            pixdesc_query                       \
            swscale                             \

externals/ffmpeg/ffmpeg/libswscale/aarch64/Makefile vendored

@@ -0,0 +1,8 @@
OBJS        += aarch64/rgb2rgb.o                \
               aarch64/swscale.o                \
               aarch64/swscale_unscaled.o       \

NEON-OBJS   += aarch64/hscale.o                 \
               aarch64/output.o                 \
               aarch64/rgb2rgb_neon.o           \
               aarch64/yuv2rgb_neon.o           \

externals/ffmpeg/ffmpeg/libswscale/aarch64/hscale.S vendored

@@ -0,0 +1,80 @@
/*
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
function ff_hscale_8_to_15_neon, export=1
sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
1: ldr w8, [x5], #4 // filterPos[idx]
ldr w0, [x5], #4 // filterPos[idx + 1]
ldr w11, [x5], #4 // filterPos[idx + 2]
ldr w9, [x5], #4 // filterPos[idx + 3]
mov x16, x4 // filter0 = filter
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v0.4S // part0 horizontal pair adding
addp v1.4S, v1.4S, v1.4S // part1 horizontal pair adding
addp v2.4S, v2.4S, v2.4S // part2 horizontal pair adding
addp v3.4S, v3.4S, v3.4S // part3 horizontal pair adding
addp v0.4S, v0.4S, v0.4S // part0 horizontal pair adding
addp v1.4S, v1.4S, v1.4S // part1 horizontal pair adding
addp v2.4S, v2.4S, v2.4S // part2 horizontal pair adding
addp v3.4S, v3.4S, v3.4S // part3 horizontal pair adding
zip1 v0.4S, v0.4S, v1.4S // part01 = zip values from part0 and part1
zip1 v2.4S, v2.4S, v3.4S // part23 = zip values from part2 and part3
mov v0.d[1], v2.d[0] // part0123 = zip values from part01 and part23
subs w2, w2, #4 // dstW -= 4
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
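
For orientation: a scalar C sketch of what this kernel computes, mirroring swscale's generic 8-bit-to-15-bit horizontal scaler (not part of the diff). The NEON version produces four output pixels per outer iteration and consumes eight filter taps per inner iteration, which is why the dispatch code below requires the filter size to be a multiple of 8.

static void hscale_8_to_15_ref(int16_t *dst, int dstW, const uint8_t *src,
                               const int16_t *filter, const int32_t *filterPos,
                               int filterSize)
{
    for (int i = 0; i < dstW; i++) {
        int val = 0;
        for (int j = 0; j < filterSize; j++)            // weighted sum of taps
            val += src[filterPos[i] + j] * filter[i * filterSize + j];
        dst[i] = FFMIN(val >> 7, (1 << 15) - 1);        // NEON: sqshrn #7
    }
}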

externals/ffmpeg/ffmpeg/libswscale/aarch64/output.S vendored

@@ -0,0 +1,58 @@
/*
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
function ff_yuv2planeX_8_neon, export=1
ld1 {v0.8B}, [x5] // load 8x8-bit dither
cbz w6, 1f // check if offsetting present
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8H, v0.8B // extend dither to 16-bit
ushll v1.4S, v0.4H, #12 // extend dither to 32-bit with left shift by 12 (part 1)
ushll2 v2.4S, v0.8H, #12 // extend dither to 32-bit with left shift by 12 (part 2)
mov x7, #0 // i = 0
2: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter
3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1]
add x11, x11, x7, lsl #1 // &src[j ][i]
add x12, x12, x7, lsl #1 // &src[j+1][i]
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
ld1 {v6.8H}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
ld1r {v7.8H}, [x10], #2 // read 1x16-bit coeff X at filter[j ] and duplicate across lanes
ld1r {v16.8H}, [x10], #2 // read 1x16-bit coeff Y at filter[j+1] and duplicate across lanes
smlal v3.4S, v5.4H, v7.4H // val0 += {A,B,C,D} * X
smlal2 v4.4S, v5.8H, v7.8H // val1 += {E,F,G,H} * X
smlal v3.4S, v6.4H, v16.4H // val0 += {I,J,K,L} * Y
smlal2 v4.4S, v6.8H, v16.8H // val1 += {M,N,O,P} * Y
subs w8, w8, #2 // tmpfilterSize -= 2
b.gt 3b // loop until filterSize consumed
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
st1 {v3.8b}, [x3], #8 // write to destination
subs w4, w4, #8 // dstW -= 8
add x7, x7, #8 // i += 8
b.gt 2b // loop until width consumed
ret
endfunc
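
For orientation, the scalar equivalent of this vertical pass (a sketch mirroring swscale's generic yuv2planeX C path, not part of the diff):

static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;   // dither seeds the sum
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];           // accumulate input planes
        dest[i] = av_clip_uint8(val >> 19);         // NEON: sqshrun #16, uqshrn #3
    }
}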

externals/ffmpeg/ffmpeg/libswscale/aarch64/rgb2rgb.c vendored

@@ -0,0 +1,41 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
                              uint8_t *dest, int width, int height,
                              int src1Stride, int src2Stride, int dstStride);

av_cold void rgb2rgb_init_aarch64(void)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        interleaveBytes = ff_interleave_bytes_neon;
    }
}
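
The assembly below implements the interleaveBytes operation installed here; in scalar form it amounts to the following sketch (two 8-bit planes zipped byte-by-byte into one interleaved plane, honoring the three independent strides):

static void interleave_bytes_ref(const uint8_t *src1, const uint8_t *src2,
                                 uint8_t *dest, int width, int height,
                                 int src1Stride, int src2Stride, int dstStride)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            dest[2 * x]     = src1[x];   // even bytes from the first plane
            dest[2 * x + 1] = src2[x];   // odd bytes from the second plane
        }
        src1 += src1Stride;
        src2 += src2Stride;
        dest += dstStride;
    }
}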

externals/ffmpeg/ffmpeg/libswscale/aarch64/rgb2rgb_neon.S vendored

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2020 Martin Storsjo
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
// uint8_t *dest, int width, int height,
// int src1Stride, int src2Stride, int dstStride);
function ff_interleave_bytes_neon, export=1
sub w5, w5, w3
sub w6, w6, w3
sub w7, w7, w3, lsl #1
1:
ands w8, w3, #0xfffffff0 // & ~15
b.eq 3f
2:
ld1 {v0.16b}, [x0], #16
ld1 {v1.16b}, [x1], #16
subs w8, w8, #16
st2 {v0.16b, v1.16b}, [x2], #32
b.gt 2b
tst w3, #15
b.eq 9f
3:
tst w3, #8
b.eq 4f
ld1 {v0.8b}, [x0], #8
ld1 {v1.8b}, [x1], #8
st2 {v0.8b, v1.8b}, [x2], #16
4:
tst w3, #4
b.eq 5f
ld1 {v0.s}[0], [x0], #4
ld1 {v1.s}[0], [x1], #4
zip1 v0.8b, v0.8b, v1.8b
st1 {v0.8b}, [x2], #8
5:
ands w8, w3, #3
b.eq 9f
6:
ldrb w9, [x0], #1
ldrb w10, [x1], #1
subs w8, w8, #1
bfi w9, w10, #8, #8
strh w9, [x2], #2
b.gt 6b
9:
subs w4, w4, #1
b.eq 0f
add x0, x0, w5, sxtw
add x1, x1, w6, sxtw
add x2, x2, w7, sxtw
b 1b
0:
ret
endfunc

externals/ffmpeg/ffmpeg/libswscale/aarch64/swscale.c vendored

@@ -0,0 +1,47 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/aarch64/cpu.h"
void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
                            const uint8_t *src, const int16_t *filter,
                            const int32_t *filterPos, int filterSize);

void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset);

av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        if (c->srcBpc == 8 && c->dstBpc <= 14 &&
            (c->hLumFilterSize % 8) == 0 &&
            (c->hChrFilterSize % 8) == 0)
        {
            c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
        }
        if (c->dstBpc == 8) {
            c->yuv2planeX = ff_yuv2planeX_8_neon;
        }
    }
}

externals/ffmpeg/ffmpeg/libswscale/aarch64/swscale_unscaled.c vendored

@@ -0,0 +1,129 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/aarch64/cpu.h"
#define YUV_TO_RGB_TABLE        \
    c->yuv2rgb_v2r_coeff,       \
    c->yuv2rgb_u2g_coeff,       \
    c->yuv2rgb_v2g_coeff,       \
    c->yuv2rgb_u2b_coeff,       \

#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt)                                 \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                    \
                                 uint8_t *dst, int linesize,                      \
                                 const uint8_t *srcY, int linesizeY,              \
                                 const uint8_t *srcU, int linesizeU,              \
                                 const uint8_t *srcV, int linesizeV,              \
                                 const int16_t *table,                            \
                                 int y_offset,                                    \
                                 int y_coeff);                                    \
                                                                                  \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],  \
                                           int srcStride[], int srcSliceY, int srcSliceH, \
                                           uint8_t *dst[], int dstStride[]) {    \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                         \
                                                                                  \
    return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                       \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                        src[0], srcStride[0],                     \
                                        src[1], srcStride[1],                     \
                                        src[2], srcStride[2],                     \
                                        yuv2rgb_table,                            \
                                        c->yuv2rgb_y_offset >> 6,                 \
                                        c->yuv2rgb_y_coeff);                      \
}

#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)  \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb)    \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)    \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)    \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)    \

DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)

#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt)                                  \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                    \
                                 uint8_t *dst, int linesize,                      \
                                 const uint8_t *srcY, int linesizeY,              \
                                 const uint8_t *srcC, int linesizeC,              \
                                 const int16_t *table,                            \
                                 int y_offset,                                    \
                                 int y_coeff);                                    \
                                                                                  \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],  \
                                           int srcStride[], int srcSliceY, int srcSliceH, \
                                           uint8_t *dst[], int dstStride[]) {    \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                         \
                                                                                  \
    return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                       \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                        src[0], srcStride[0], src[1], srcStride[1], \
                                        yuv2rgb_table,                            \
                                        c->yuv2rgb_y_offset >> 6,                 \
                                        c->yuv2rgb_y_coeff);                      \
}

#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)  \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)    \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)    \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)    \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)    \

DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)

/* We need a 16 pixel width alignment. This constraint can easily be removed
 * for input reading but for the output which is 4-bytes per pixel (RGBA) the
 * assembly might be writing as much as 4*15=60 extra bytes at the end of the
 * line, which won't fit the 32-bytes buffer alignment. */
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do {        \
    if (c->srcFormat == AV_PIX_FMT_##IFMT                                         \
        && c->dstFormat == AV_PIX_FMT_##OFMT                                      \
        && !(c->srcH & 1)                                                         \
        && !(c->srcW & 15)                                                        \
        && !accurate_rnd)                                                         \
        c->swscale = ifmt##_to_##ofmt##_neon_wrapper;                             \
} while (0)

#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do {                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd);                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd);                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd);                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd);                  \
} while (0)

static void get_unscaled_swscale_neon(SwsContext *c) {
    int accurate_rnd = c->flags & SWS_ACCURATE_RND;

    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
}

void ff_get_unscaled_swscale_aarch64(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();
    if (have_neon(cpu_flags))
        get_unscaled_swscale_neon(c);
}

externals/ffmpeg/ffmpeg/libswscale/aarch64/yuv2rgb_neon.S vendored

@@ -0,0 +1,212 @@
/*
* Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
ldp w9, w10, [sp, #\yoff]
#else
ldr w9, [sp, #\yoff]
ldr w10, [sp, #\ycoeff]
#endif
.endm
.macro load_args_nv12
ldr x8, [sp] // table
load_yoff_ycoeff 8, 16 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
neg w11, w0
.endm
.macro load_args_nv21
load_args_nv12
.endm
.macro load_args_yuv420p
ldr x13, [sp] // srcV
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
lsr w11, w0, #1
neg w11, w11
.endm
.macro load_args_yuv422p
ldr x13, [sp] // srcV
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
.endm
.macro load_chroma_nv12
ld2 {v16.8B, v17.8B}, [x6], #16
ushll v18.8H, v16.8B, #3
ushll v19.8H, v17.8B, #3
.endm
.macro load_chroma_nv21
ld2 {v16.8B, v17.8B}, [x6], #16
ushll v19.8H, v16.8B, #3
ushll v18.8H, v17.8B, #3
.endm
.macro load_chroma_yuv420p
ld1 {v16.8B}, [ x6], #8
ld1 {v17.8B}, [x13], #8
ushll v18.8H, v16.8B, #3
ushll v19.8H, v17.8B, #3
.endm
.macro load_chroma_yuv422p
load_chroma_yuv420p
.endm
.macro increment_nv12
ands w15, w1, #1
csel w16, w7, w11, ne // incC = (h & 1) ? paddingC : -width
add x6, x6, w16, SXTW // srcC += incC
.endm
.macro increment_nv21
increment_nv12
.endm
.macro increment_yuv420p
ands w15, w1, #1
csel w16, w7, w11, ne // incU = (h & 1) ? paddingU : -width/2
csel w17, w14, w11, ne // incV = (h & 1) ? paddingV : -width/2
add x6, x6, w16, SXTW // srcU += incU
add x13, x13, w17, SXTW // srcV += incV
.endm
.macro increment_yuv422p
add x6, x6, w7, UXTW // srcU += incU
add x13, x13, w14, UXTW // srcV += incV
.endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
add v20.8H, v26.8H, v20.8H // Y1 + R1
add v21.8H, v27.8H, v21.8H // Y2 + R2
add v22.8H, v26.8H, v22.8H // Y1 + G1
add v23.8H, v27.8H, v23.8H // Y2 + G2
add v24.8H, v26.8H, v24.8H // Y1 + B1
add v25.8H, v27.8H, v25.8H // Y2 + B2
sqrshrun \r1, v20.8H, #1 // clip_u8((Y1 + R1) >> 1)
sqrshrun \r2, v21.8H, #1 // clip_u8((Y2 + R2) >> 1)
sqrshrun \g1, v22.8H, #1 // clip_u8((Y1 + G1) >> 1)
sqrshrun \g2, v23.8H, #1 // clip_u8((Y2 + G2) >> 1)
sqrshrun \b1, v24.8H, #1 // clip_u8((Y1 + B1) >> 1)
sqrshrun \b2, v25.8H, #1 // clip_u8((Y2 + B2) >> 1)
movi \a1, #255
movi \a2, #255
.endm
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
load_args_\ifmt
mov w9, w1
1:
mov w8, w0 // w8 = width
2:
movi v5.8H, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt
sub v18.8H, v18.8H, v5.8H // U*(1<<3) - 128*(1<<3)
sub v19.8H, v19.8H, v5.8H // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8H, v19.8H, v1.H[0] // V * v2r (R)
sqdmulh v22.8H, v18.8H, v1.H[1] // U * u2g
sqdmulh v19.8H, v19.8H, v1.H[2] // V * v2g
add v22.8H, v22.8H, v19.8H // U * u2g + V * v2g (G)
sqdmulh v24.8H, v18.8H, v1.H[3] // U * u2b (B)
zip2 v21.8H, v20.8H, v20.8H // R2
zip1 v20.8H, v20.8H, v20.8H // R1
zip2 v23.8H, v22.8H, v22.8H // G2
zip1 v22.8H, v22.8H, v22.8H // G1
zip2 v25.8H, v24.8H, v24.8H // B2
zip1 v24.8H, v24.8H, v24.8H // B1
ld1 {v2.16B}, [x4], #16 // load luma
ushll v26.8H, v2.8B, #3 // Y1*(1<<3)
ushll2 v27.8H, v2.16B, #3 // Y2*(1<<3)
sub v26.8H, v26.8H, v3.8H // Y1*(1<<3) - y_offset
sub v27.8H, v27.8H, v3.8H // Y2*(1<<3) - y_offset
sqdmulh v26.8H, v26.8H, v0.8H // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8H, v27.8H, v0.8H // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0
compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
.endif
.ifc \ofmt,rgba // 0 1 2 3
compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
.endif
.ifc \ofmt,abgr // 3 2 1 0
compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
.endif
.ifc \ofmt,bgra // 2 1 0 3
compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
.endif
st4 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32
st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32
subs w8, w8, #16 // width -= 16
b.gt 2b
add x2, x2, w3, UXTW // dst += padding
add x4, x4, w5, UXTW // srcY += paddingY
increment_\ifmt
subs w1, w1, #1 // height -= 1
b.gt 1b
mov w0, w9
ret
endfunc
.endm
.macro declare_rgb_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
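
Per pixel, the kernels generated above compute roughly the following (a scalar sketch, not part of the diff; sqdmulh(a, b) behaves as (a * b) >> 15 with saturation, which the sketch ignores, and the final sqrshrun #1 is a rounding halve with unsigned saturation; the four-entry table is the wrapper's YUV_TO_RGB_TABLE, {v2r, u2g, v2g, u2b}):

static void yuv2rgb_pixel_ref(int Y, int U, int V, const int16_t table[4],
                              int y_offset, int y_coeff,
                              uint8_t *r, uint8_t *g, uint8_t *b)
{
    int luma = (((Y << 3) - y_offset) * y_coeff) >> 15; // ushll #3, sub, sqdmulh
    int u    = (U << 3) - (128 << 3);                   // chroma re-centered
    int v    = (V << 3) - (128 << 3);
    int R    = ( v * table[0]) >> 15;                   // V * v2r
    int G    = ((u * table[1]) >> 15) + ((v * table[2]) >> 15); // U*u2g + V*v2g
    int B    = ( u * table[3]) >> 15;                   // U * u2b
    *r = av_clip_uint8((luma + R + 1) >> 1);            // sqrshrun #1
    *g = av_clip_uint8((luma + G + 1) >> 1);
    *b = av_clip_uint8((luma + B + 1) >> 1);
}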

externals/ffmpeg/ffmpeg/libswscale/alphablend.c vendored

@@ -0,0 +1,169 @@
/*
* Copyright (C) 2015 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
int ff_sws_alphablendaway(SwsContext *c, const uint8_t *src[],
                          int srcStride[], int srcSliceY, int srcSliceH,
                          uint8_t *dst[], int dstStride[])
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
    int nb_components = desc->nb_components;
    int plane, x, y;
    int plane_count = isGray(c->srcFormat) ? 1 : 3;
    int sixteen_bits = desc->comp[0].depth >= 9;
    unsigned off = 1<<(desc->comp[0].depth - 1);
    unsigned shift = desc->comp[0].depth;
    unsigned max = (1<<shift) - 1;
    int target_table[2][3];

    for (plane = 0; plane < plane_count; plane++) {
        int a = 0, b = 0;
        if (c->alphablend == SWS_ALPHA_BLEND_CHECKERBOARD) {
            a = (1<<(desc->comp[0].depth - 1))/2;
            b = 3*(1<<(desc->comp[0].depth-1))/2;
        }
        target_table[0][plane] = plane && !(desc->flags & AV_PIX_FMT_FLAG_RGB) ? 1<<(desc->comp[0].depth - 1) : a;
        target_table[1][plane] = plane && !(desc->flags & AV_PIX_FMT_FLAG_RGB) ? 1<<(desc->comp[0].depth - 1) : b;
    }

    av_assert0(plane_count == nb_components - 1);
    if (desc->flags & AV_PIX_FMT_FLAG_PLANAR) {
        for (plane = 0; plane < plane_count; plane++) {
            int w = plane ? c->chrSrcW : c->srcW;
            int x_subsample = plane ? desc->log2_chroma_w : 0;
            int y_subsample = plane ? desc->log2_chroma_h : 0;
            for (y = srcSliceY >> y_subsample; y < AV_CEIL_RSHIFT(srcSliceH, y_subsample); y++) {
                if (x_subsample || y_subsample) {
                    int alpha;
                    unsigned u;
                    if (sixteen_bits) {
                        ptrdiff_t alpha_step = srcStride[plane_count] >> 1;
                        const uint16_t *s = (const uint16_t *)(src[plane      ] + srcStride[plane      ] * y);
                        const uint16_t *a = (const uint16_t *)(src[plane_count] + (srcStride[plane_count] * y << y_subsample));
                              uint16_t *d = (      uint16_t *)(dst[plane      ] + dstStride[plane      ] * y);
                        if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
                            for (x = 0; x < w; x++) {
                                if (y_subsample) {
                                    alpha = (a[2*x]              + a[2*x + 1] + 2 +
                                             a[2*x + alpha_step] + a[2*x + alpha_step + 1]) >> 2;
                                } else
                                    alpha = (a[2*x] + a[2*x + 1]) >> 1;
                                u = s[x]*alpha + target_table[((x^y)>>5)&1][plane]*(max-alpha) + off;
                                d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
                            }
                        } else {
                            for (x = 0; x < w; x++) {
                                if (y_subsample) {
                                    alpha = (av_bswap16(a[2*x])              + av_bswap16(a[2*x + 1]) + 2 +
                                             av_bswap16(a[2*x + alpha_step]) + av_bswap16(a[2*x + alpha_step + 1])) >> 2;
                                } else
                                    alpha = (av_bswap16(a[2*x]) + av_bswap16(a[2*x + 1])) >> 1;
                                u = av_bswap16(s[x])*alpha + target_table[((x^y)>>5)&1][plane]*(max-alpha) + off;
                                d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
                            }
                        }
                    } else {
                        ptrdiff_t alpha_step = srcStride[plane_count];
                        const uint8_t *s = src[plane      ] + srcStride[plane] * y;
                        const uint8_t *a = src[plane_count] + (srcStride[plane_count] * y << y_subsample);
                              uint8_t *d = dst[plane      ] + dstStride[plane] * y;
                        for (x = 0; x < w; x++) {
                            if (y_subsample) {
                                alpha = (a[2*x]              + a[2*x + 1] + 2 +
                                         a[2*x + alpha_step] + a[2*x + alpha_step + 1]) >> 2;
                            } else
                                alpha = (a[2*x] + a[2*x + 1]) >> 1;
                            u = s[x]*alpha + target_table[((x^y)>>5)&1][plane]*(255-alpha) + 128;
                            d[x] = (257*u) >> 16;
                        }
                    }
                } else {
                    if (sixteen_bits) {
                        const uint16_t *s = (const uint16_t *)(src[plane      ] + srcStride[plane      ] * y);
                        const uint16_t *a = (const uint16_t *)(src[plane_count] + srcStride[plane_count] * y);
                              uint16_t *d = (      uint16_t *)(dst[plane      ] + dstStride[plane      ] * y);
                        if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
                            for (x = 0; x < w; x++) {
                                unsigned u = s[x]*a[x] + target_table[((x^y)>>5)&1][plane]*(max-a[x]) + off;
                                d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
                            }
                        } else {
                            for (x = 0; x < w; x++) {
                                unsigned aswap = av_bswap16(a[x]);
                                unsigned u = av_bswap16(s[x])*aswap + target_table[((x^y)>>5)&1][plane]*(max-aswap) + off;
                                d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
                            }
                        }
                    } else {
                        const uint8_t *s = src[plane      ] + srcStride[plane] * y;
                        const uint8_t *a = src[plane_count] + srcStride[plane_count] * y;
                              uint8_t *d = dst[plane      ] + dstStride[plane] * y;
                        for (x = 0; x < w; x++) {
                            unsigned u = s[x]*a[x] + target_table[((x^y)>>5)&1][plane]*(255-a[x]) + 128;
                            d[x] = (257*u) >> 16;
                        }
                    }
                }
            }
        }
    } else {
        int alpha_pos = desc->comp[plane_count].offset;
        int w = c->srcW;
        for (y = srcSliceY; y < srcSliceH; y++) {
            if (sixteen_bits) {
                const uint16_t *s = (const uint16_t *)(src[0] + srcStride[0] * y + 2*!alpha_pos);
                const uint16_t *a = (const uint16_t *)(src[0] + srcStride[0] * y + alpha_pos);
                      uint16_t *d = (      uint16_t *)(dst[0] + dstStride[0] * y);
                if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
                    for (x = 0; x < w; x++) {
                        for (plane = 0; plane < plane_count; plane++) {
                            int x_index = (plane_count + 1) * x;
                            unsigned u = s[x_index + plane]*a[x_index] + target_table[((x^y)>>5)&1][plane]*(max-a[x_index]) + off;
                            d[plane_count*x + plane] = av_clip((u + (u >> shift)) >> shift, 0, max);
                        }
                    }
                } else {
                    for (x = 0; x < w; x++) {
                        for (plane = 0; plane < plane_count; plane++) {
                            int x_index = (plane_count + 1) * x;
                            unsigned aswap = av_bswap16(a[x_index]);
                            unsigned u = av_bswap16(s[x_index + plane])*aswap + target_table[((x^y)>>5)&1][plane]*(max-aswap) + off;
                            d[plane_count*x + plane] = av_clip((u + (u >> shift)) >> shift, 0, max);
                        }
                    }
                }
            } else {
                const uint8_t *s = src[0] + srcStride[0] * y + !alpha_pos;
                const uint8_t *a = src[0] + srcStride[0] * y + alpha_pos;
                uint8_t *d = dst[0] + dstStride[0] * y;
                for (x = 0; x < w; x++) {
                    for (plane = 0; plane < plane_count; plane++) {
                        int x_index = (plane_count + 1) * x;
                        unsigned u = s[x_index + plane]*a[x_index] + target_table[((x^y)>>5)&1][plane]*(255-a[x_index]) + 128;
                        d[plane_count*x + plane] = (257*u) >> 16;
                    }
                }
            }
        }
    }
    return 0;
}
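
One detail worth calling out: the 8-bit paths above divide by 255 without a division. A sketch of the trick in isolation (names hypothetical): with the +128 bias already folded into the weighted sum, (257 * u) >> 16 gives the rounded quotient, because 255 * 257 = 65535 is one short of 1 << 16.

static inline uint8_t blend_one_u8(uint8_t s, uint8_t a, uint8_t target)
{
    unsigned u = s * a + target * (255 - a) + 128;  // biased weighted sum
    return (257 * u) >> 16;                         // round(u / 255), no divide
}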

externals/ffmpeg/ffmpeg/libswscale/arm/Makefile vendored

@@ -0,0 +1,8 @@
OBJS        += arm/swscale.o                    \
               arm/swscale_unscaled.o           \

NEON-OBJS   += arm/rgb2yuv_neon_32.o
NEON-OBJS   += arm/rgb2yuv_neon_16.o
NEON-OBJS   += arm/hscale.o                     \
               arm/output.o                     \
               arm/yuv2rgb_neon.o               \

externals/ffmpeg/ffmpeg/libswscale/arm/hscale.S vendored

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
* Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_hscale_8_to_15_neon, export=1
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ filter
ldr r5, [sp, #108] @ filterPos
ldr r6, [sp, #112] @ filterSize
add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2
1: ldr r8, [r5], #4 @ filterPos[0]
ldr r9, [r5], #4 @ filterPos[1]
vmov.s32 q4, #0 @ val accumulator
vmov.s32 q5, #0 @ val accumulator
mov r7, r6 @ tmpfilterSize = filterSize
mov r0, r3 @ srcp
2: add r11, r0, r8 @ srcp + filterPos[0]
add r12, r0, r9 @ srcp + filterPos[1]
vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}]
vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}]
vld1.16 {q2}, [r4]! @ load 8x16-bit filter values
vld1.16 {q3}, [r10]! @ load 8x16-bit filter values
vmovl.u8 q0, d0 @ unpack src values to 16-bit
vmovl.u8 q1, d2 @ unpack src values to 16-bit
vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
vadd.s32 q4, q8 @ update val accumulator
vadd.s32 q5, q10 @ update val accumulator
add r0, #8 @ srcp += 8
subs r7, #8 @ tmpfilterSize -= 8
bgt 2b @ loop until tmpfilterSize is consumed
mov r4, r10 @ filter = filter2
add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1)
vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2)
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit
vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values
vst1.32 {d8[0]},[r1]! @ write destination
subs r2, #2 @ dstW -= 2
bgt 1b @ loop until end of line
vpop {q4-q7}
pop {r4-r12, lr}
mov pc, lr
endfunc

externals/ffmpeg/ffmpeg/libswscale/arm/output.S vendored

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
* Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_yuv2planeX_8_neon, export=1
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ dstW
ldr r5, [sp, #108] @ dither
ldr r6, [sp, #112] @ offset
vld1.8 {d0}, [r5] @ load 8x8-bit dither values
cmp r6, #0 @ check offsetting which can be 0 or 3 only
beq 1f
vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only
1: vmovl.u8 q0, d0 @ extend dither to 16-bit
vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1)
vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2)
mov r7, #0 @ i = 0
2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1)
vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2)
mov r8, r1 @ tmpFilterSize = filterSize
mov r9, r2 @ srcp
mov r10, r0 @ filterp
3: ldr r11, [r9], #4 @ get pointer @ src[j]
ldr r12, [r9], #4 @ get pointer @ src[j+1]
add r11, r11, r7, lsl #1 @ &src[j][i]
add r12, r12, r7, lsl #1 @ &src[j+1][i]
vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction
vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction
vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y
vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y
vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y
vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y
vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y
vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y
vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y
vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y
vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y
vadd.s32 q3, q5 @ update val accumulator (part 1)
vadd.s32 q4, q6 @ update val accumulator (part 2)
subs r8, #2 @ tmpFilterSize -= 2
bgt 3b @ loop until filterSize is consumed
vshr.s32 q3, q3, #19 @ val>>19 (part 1)
vshr.s32 q4, q4, #19 @ val>>19 (part 2)
vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1)
vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2)
vqmovn.u16 d6, q3 @ merge part 1 and part 2
vst1.8 {d6}, [r3]! @ write destination
add r7, #8 @ i += 8
subs r4, r4, #8 @ dstW -= 8
bgt 2b @ loop until width is consumed
vpop {q4-q7}
pop {r4-r12, lr}
mov pc, lr
endfunc

externals/ffmpeg/ffmpeg/libswscale/arm/rgb2yuv_neon_16.S vendored

@@ -0,0 +1,83 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_AS_DN_DIRECTIVE
#include "rgb2yuv_neon_common.S"
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
alias n16x16_l, q11
alias n16x16_h, q12
alias y16x16_l, q13
alias y16x16_h, q14
alias_qw y8x16, q15
.macro init src
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vrshrn.i32 CO_R, q13, #7
vrshrn.i32 CO_G, q14, #7
vrshrn.i32 CO_B, q15, #7
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
.macro compute_y_16x1_step action, s8x16, coeff
vmovl.u8 n16x16_l, \s8x16\()_l
vmovl.u8 n16x16_h, \s8x16\()_h
\action y16x16_l, n16x16_l, \coeff
\action y16x16_h, n16x16_h, \coeff
.endm
.macro compute_y_16x1
compute_y_16x1_step vmul, r8x16, CO_RY
compute_y_16x1_step vmla, g8x16, CO_GY
compute_y_16x1_step vmla, b8x16, CO_BY
vrshrn.i16 y8x16_l, y16x16_l, #8
vrshrn.i16 y8x16_h, y16x16_h, #8
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c16x8, q15
alias_qw c8x8x2, q10
.macro compute_chroma_8x1 c, C
vmul c16x8, r16x8, CO_R\C
vmla c16x8, g16x8, CO_G\C
vmla c16x8, b16x8, CO_B\C
vrshrn.i16 \c\()8x8, c16x8, #8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
#endif

externals/ffmpeg/ffmpeg/libswscale/arm/rgb2yuv_neon_32.S vendored

@@ -0,0 +1,122 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_AS_DN_DIRECTIVE
#include "rgb2yuv_neon_common.S"
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
alias n16x16_o, q11
alias n16x16_ol, q11_l
alias n16x16_oh, q11_h
alias y32x16_el, q12
alias y32x16_eh, q13
alias y32x16_ol, q14
alias y32x16_oh, q15
alias y16x16_e, q12
alias y16x16_el, q12_l
alias y16x16_eh, q12_h
alias y16x16_o, q13
alias y16x16_ol, q13_l
alias y16x16_oh, q13_h
alias y8x16, y16x16_e
.macro init src
// load s32x3x3, narrow to s16x3x3
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vmovn.i32 CO_R, q13
vmovn.i32 CO_G, q14
vmovn.i32 CO_B, q15
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
.macro compute_y_16x1_step action, s8x16, coeff
vmov.u8 n16x16_o, #0
vtrn.u8 \s8x16, n16x16_o
\action y32x16_el, \s8x16\()_l, \coeff
\action y32x16_eh, \s8x16\()_h, \coeff
\action y32x16_ol, n16x16_ol, \coeff
\action y32x16_oh, n16x16_oh, \coeff
.endm
/*
* in: r8x16, g8x16, b8x16
* out: y8x16
* clobber: q11-q15, r8x16, g8x16, b8x16
*/
.macro compute_y_16x1
compute_y_16x1_step vmull, r8x16, CO_RY
compute_y_16x1_step vmlal, g8x16, CO_GY
compute_y_16x1_step vmlal, b8x16, CO_BY
vrshrn.i32 y16x16_el, y32x16_el, #15
vrshrn.i32 y16x16_eh, y32x16_eh, #15
vrshrn.i32 y16x16_ol, y32x16_ol, #15
vrshrn.i32 y16x16_oh, y32x16_oh, #15
vtrn.8 y16x16_e, y16x16_o
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c32x8_l, q14
alias c32x8_h, q15
alias_qw c16x8, q13
alias_qw c8x8x2, q10
.macro compute_chroma_8x1_step action, s16x8, coeff
\action c32x8_l, \s16x8\()_l, \coeff
\action c32x8_h, \s16x8\()_h, \coeff
.endm
/*
* in: r16x8, g16x8, b16x8
* out: c8x8
* clobber: q14-q15
*/
.macro compute_chroma_8x1 c, C
compute_chroma_8x1_step vmull, r16x8, CO_R\C
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
vrshrn.i32 c16x8_l, c32x8_l, #15
vrshrn.i32 c16x8_h, c32x8_h, #15
vmovn.i16 \c\()8x8, c16x8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
#endif

externals/ffmpeg/ffmpeg/libswscale/arm/rgb2yuv_neon_common.S vendored

@@ -0,0 +1,291 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro alias name, tgt, set=1
.if \set != 0
\name .req \tgt
.else
.unreq \name
.endif
.endm
.altmacro
.macro alias_dw_all qw, dw_l, dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
.if \qw < 15
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm
alias_dw_all 0, 0, 1
.noaltmacro
.macro alias_qw name, qw, set=1
alias \name\(), \qw, \set
alias \name\()_l, \qw\()_l, \set
alias \name\()_h, \qw\()_h, \set
.endm
.macro prologue
push {r4-r12, lr}
vpush {q4-q7}
.endm
.macro epilogue
vpop {q4-q7}
pop {r4-r12, pc}
.endm
.macro load_arg reg, ix
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm
/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma
* int width, int height,
* int y_stride, int c_stride, int src_stride,
* int32_t coeff_table[9]);
*/
.macro alias_loop_420sp set=1
alias src, r0, \set
alias src0, src, \set
alias y, r1, \set
alias y0, y, \set
alias chroma, r2, \set
alias width, r3, \set
alias header, width, \set
alias height, r4, \set
alias y_stride, r5, \set
alias c_stride, r6, \set
alias c_padding, c_stride, \set
alias src_stride, r7, \set
alias y0_end, r8, \set
alias src_padding,r9, \set
alias y_padding, r10, \set
alias src1, r11, \set
alias y1, r12, \set
alias coeff_table,r12, \set
.endm
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
prologue
alias_loop_420sp
load_arg height, 4
load_arg y_stride, 5
load_arg c_stride, 6
load_arg src_stride, 7
load_arg coeff_table, 8
\init coeff_table
sub y_padding, y_stride, width
sub c_padding, c_stride, width
sub src_padding, src_stride, width, LSL #2
add y0_end, y0, width
and header, width, #15
add y1, y0, y_stride
add src1, src0, src_stride
0:
cmp header, #0
beq 1f
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
1:
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
cmp y0, y0_end
blt 1b
2:
add y0, y1, y_padding
add y0_end, y1, y_stride
add chroma, chroma, c_padding
add src0, src1, src_padding
add y1, y0, y_stride
add src1, src0, src_stride
subs height, height, #2
bgt 0b
epilogue
alias_loop_420sp 0
endfunc
.endm
.macro downsample
vpaddl.u8 r16x8, r8x16
vpaddl.u8 g16x8, g8x16
vpaddl.u8 b16x8, b8x16
.endm
/* accumulate and right shift by 2 */
.macro downsample_ars2
vpadal.u8 r16x8, r8x16
vpadal.u8 g16x8, g8x16
vpadal.u8 b16x8, b8x16
vrshr.u16 r16x8, r16x8, #2
vrshr.u16 g16x8, g16x8, #2
vrshr.u16 b16x8, b16x8, #2
.endm
.macro store_y8_16x1 dst, count
.ifc "\count",""
vstmia \dst!, {y8x16}
.else
vstmia \dst, {y8x16}
add \dst, \dst, \count
.endif
.endm
.macro store_chroma_nv12_8x1 dst, count
.ifc "\count",""
vst2.i8 {u8x8, v8x8}, [\dst]!
.else
vst2.i8 {u8x8, v8x8}, [\dst], \count
.endif
.endm
.macro store_chroma_nv21_8x1 dst, count
.ifc "\count",""
vst2.i8 {v8x8, u8x8}, [\dst]!
.else
vst2.i8 {v8x8, u8x8}, [\dst], \count
.endif
.endm
.macro load_8888_16x1 a, b, c, d, src, count
.ifc "\count",""
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
sub \src, \src, #32
add \src, \src, \count, LSL #2
.endif
.endm
.macro load_rgbx_16x1 src, count
load_8888_16x1 r, g, b, x, \src, \count
.endm
.macro load_bgrx_16x1 src, count
load_8888_16x1 b, g, r, x, \src, \count
.endm
.macro alias_src_rgbx set=1
alias_src_8888 r, g, b, x, \set
.endm
.macro alias_src_bgrx set=1
alias_src_8888 b, g, r, x, \set
.endm
.macro alias_dst_nv12 set=1
alias u8x8, c8x8x2_l, \set
alias v8x8, c8x8x2_h, \set
.endm
.macro alias_dst_nv21 set=1
alias v8x8, c8x8x2_l, \set
alias u8x8, c8x8x2_h, \set
.endm
// common aliases
alias CO_R d0
CO_RY .dn d0.s16[0]
CO_RU .dn d0.s16[1]
CO_RV .dn d0.s16[2]
alias CO_G d1
CO_GY .dn d1.s16[0]
CO_GU .dn d1.s16[1]
CO_GV .dn d1.s16[2]
alias CO_B d2
CO_BY .dn d2.s16[0]
CO_BU .dn d2.s16[1]
CO_BV .dn d2.s16[2]
alias BIAS_U, d3
alias BIAS_V, BIAS_U
alias BIAS_Y, q2
/* q3-q6 R8G8B8X8 x16 */
.macro alias_src_8888 a, b, c, d, set
alias_qw \a\()8x16, q3, \set
alias_qw \b\()8x16, q4, \set
alias_qw \c\()8x16, q5, \set
alias_qw \d\()8x16, q6, \set
.endm
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
alias_src_\rgb_fmt
alias_dst_\yuv_fmt
load_\rgb_fmt\()_16x1 \rgb0, \count
downsample
compute_y_16x1
store_y8_16x1 \y0, \count
load_\rgb_fmt\()_16x1 \rgb1, \count
downsample_ars2
compute_y_16x1
store_y8_16x1 \y1, \count
compute_chroma_8x1 u, U
compute_chroma_8x1 v, V
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
alias_dst_\yuv_fmt 0
alias_src_\rgb_fmt 0
.endm
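
For orientation, a scalar sketch of what kernels built from these macros compute per pixel in the 32-bit-precision build (an assumption-laden sketch, not part of the diff: it assumes the 9-entry coefficient table is laid out {RY, GY, BY, RU, GU, BU, RV, GV, BV} in Q15, which is what the vld3 deinterleave in the init macros implies; for chroma, r/g/b are additionally the rounded 2x2 averages produced by downsample/downsample_ars2):

static void rgb2yuv_pixel_ref(int r, int g, int b, const int32_t coeff[9],
                              uint8_t *y, uint8_t *u, uint8_t *v)
{
    /* vmull/vmlal accumulate, vrshrn #15 rounds, vadd.u8 applies the bias */
    *y = ((r * coeff[0] + g * coeff[1] + b * coeff[2] + (1 << 14)) >> 15) + 16;
    *u = ((r * coeff[3] + g * coeff[4] + b * coeff[5] + (1 << 14)) >> 15) + 128;
    *v = ((r * coeff[6] + g * coeff[7] + b * coeff[8] + (1 << 14)) >> 15) + 128;
}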

externals/ffmpeg/ffmpeg/libswscale/arm/swscale.c vendored

@@ -0,0 +1,47 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"
void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
                            const uint8_t *src, const int16_t *filter,
                            const int32_t *filterPos, int filterSize);

void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset);

av_cold void ff_sws_init_swscale_arm(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        if (c->srcBpc == 8 && c->dstBpc <= 14 &&
            (c->hLumFilterSize % 8) == 0 &&
            (c->hChrFilterSize % 8) == 0)
        {
            c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
        }
        if (c->dstBpc == 8) {
            c->yuv2planeX = ff_yuv2planeX_8_neon;
        }
    }
}

externals/ffmpeg/ffmpeg/libswscale/arm/swscale_unscaled.c vendored

@@ -0,0 +1,186 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"
#if HAVE_AS_DN_DIRECTIVE
extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
                                 int width, int height,
                                 int y_stride, int c_stride, int src_stride,
                                 int32_t coeff_tbl[9]);

extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
                                 int width, int height,
                                 int y_stride, int c_stride, int src_stride,
                                 int32_t coeff_tbl[9]);

static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
                                        int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[]) {
    rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
                         dst[0] + srcSliceY * dstStride[0],
                         dst[1] + (srcSliceY / 2) * dstStride[1],
                         context->srcW, srcSliceH,
                         dstStride[0], dstStride[1], srcStride[0],
                         context->input_rgb2yuv_table);

    return 0;
}

static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
                                        int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[]) {
    rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
                         dst[0] + srcSliceY * dstStride[0],
                         dst[1] + (srcSliceY / 2) * dstStride[1],
                         context->srcW, srcSliceH,
                         dstStride[0], dstStride[1], srcStride[0],
                         context->input_rgb2yuv_table);

    return 0;
}

#define YUV_TO_RGB_TABLE        \
    c->yuv2rgb_v2r_coeff,       \
    c->yuv2rgb_u2g_coeff,       \
    c->yuv2rgb_v2g_coeff,       \
    c->yuv2rgb_u2b_coeff,       \

#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt)                                 \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                    \
                                 uint8_t *dst, int linesize,                      \
                                 const uint8_t *srcY, int linesizeY,              \
                                 const uint8_t *srcU, int linesizeU,              \
                                 const uint8_t *srcV, int linesizeV,              \
                                 const int16_t *table,                            \
                                 int y_offset,                                    \
                                 int y_coeff);                                    \
                                                                                  \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],  \
                                           int srcStride[], int srcSliceY, int srcSliceH, \
                                           uint8_t *dst[], int dstStride[]) {    \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                         \
                                                                                  \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                              \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                 src[0], srcStride[0],                            \
                                 src[1], srcStride[1],                            \
                                 src[2], srcStride[2],                            \
                                 yuv2rgb_table,                                   \
                                 c->yuv2rgb_y_offset >> 6,                        \
                                 c->yuv2rgb_y_coeff);                             \
                                                                                  \
    return 0;                                                                     \
}

#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)  \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb)    \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)    \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)    \
    DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)    \

DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)

#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt)                                  \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                    \
                                 uint8_t *dst, int linesize,                      \
                                 const uint8_t *srcY, int linesizeY,              \
                                 const uint8_t *srcC, int linesizeC,              \
                                 const int16_t *table,                            \
                                 int y_offset,                                    \
                                 int y_coeff);                                    \
                                                                                  \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],  \
                                           int srcStride[], int srcSliceY, int srcSliceH, \
                                           uint8_t *dst[], int dstStride[]) {    \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                         \
                                                                                  \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                              \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                 src[0], srcStride[0], src[1], srcStride[1],      \
                                 yuv2rgb_table,                                   \
                                 c->yuv2rgb_y_offset >> 6,                        \
                                 c->yuv2rgb_y_coeff);                             \
                                                                                  \
    return 0;                                                                     \
}

#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)  \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)    \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)    \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)    \
    DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)    \

DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)

/* We need a 16 pixel width alignment. This constraint can easily be removed
 * for input reading but for the output which is 4-bytes per pixel (RGBA) the
 * assembly might be writing as much as 4*15=60 extra bytes at the end of the
 * line, which won't fit the 32-bytes buffer alignment. */
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do {        \
    if (c->srcFormat == AV_PIX_FMT_##IFMT                                         \
        && c->dstFormat == AV_PIX_FMT_##OFMT                                      \
        && !(c->srcH & 1)                                                         \
        && !(c->srcW & 15)                                                        \
        && !accurate_rnd) {                                                       \
        c->swscale = ifmt##_to_##ofmt##_neon_wrapper;                             \
    }                                                                             \
} while (0)

#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do {                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd);                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd);                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd);                  \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd);                  \
} while (0)

static void get_unscaled_swscale_neon(SwsContext *c) {
    int accurate_rnd = c->flags & SWS_ACCURATE_RND;

    if (c->srcFormat == AV_PIX_FMT_RGBA
        && c->dstFormat == AV_PIX_FMT_NV12
        && (c->srcW >= 16)) {
        c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
                                  : rgbx_to_nv12_neon_16_wrapper;
    }

    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
}

void ff_get_unscaled_swscale_arm(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();
    if (have_neon(cpu_flags))
        get_unscaled_swscale_neon(c);
}
#else
void ff_get_unscaled_swscale_arm(SwsContext *c)
{
}
#endif

externals/ffmpeg/ffmpeg/libswscale/arm/yuv2rgb_neon.S vendored

@@ -0,0 +1,280 @@
/*
* Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
* Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro compute_premult
vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3)
vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3)
vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r
vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g
vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g
vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g
vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b
.endm
.macro compute_color dst_comp1 dst_comp2 pre
vadd.s16 q1, q14, \pre
vadd.s16 q2, q15, \pre
vqrshrun.s16 \dst_comp1, q1, #1
vqrshrun.s16 \dst_comp2, q2, #1
.endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
compute_color \r1, \r2, q8
compute_color \g1, \g2, q9
compute_color \b1, \b2, q10
vmov.u8 \a1, #255
vmov.u8 \a2, #255
.endm
.macro compute dst ofmt
vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3)
vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3)
vsub.s16 q14, q12 @ q14 = (Y - y_offset)
vsub.s16 q15, q12 @ q15 = (Y - y_offset)
vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff
vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff
.ifc \ofmt,argb
compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
.endif
.ifc \ofmt,rgba
compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
.endif
.ifc \ofmt,abgr
compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
.endif
.ifc \ofmt,bgra
compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
.endif
vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
vst4.8 {q3, q4}, [\dst,:128]!
vst4.8 {q5, q6}, [\dst,:128]!
.endm
.macro process_1l_internal dst src ofmt
vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved)
compute \dst, \ofmt
.endm
.macro process_1l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
.endm
.macro process_2l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
process_1l_internal r11,r12,\ofmt
.endm
.macro load_args_nv12
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcC
ldr r7, [sp, #116] @ r7 = linesizeC
ldr r8, [sp, #120] @ r8 = table
ldr r9, [sp, #124] @ r9 = y_offset
ldr r10,[sp, #128] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
.endm
.macro load_args_nv21
load_args_nv12
.endm
.macro load_args_yuv420p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
ldr r10,[sp, #120] @ r10 = srcV
.endm
.macro load_args_yuv422p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r7, [sp, #116] @ r7 = linesizeU
ldr r12,[sp, #124] @ r12 = linesizeV
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
ldr r10,[sp, #120] @ r10 = srcV
.endm
.macro load_chroma_nv12
pld [r12, #64*3]
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
.macro load_chroma_nv21
pld [r12, #64*3]
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3)
.endm
.macro load_chroma_yuv420p
pld [r10, #64*3]
pld [r12, #64*3]
vld1.8 d2, [r6]! @ d2: chroma red line
vld1.8 d3, [r10]! @ d3: chroma blue line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
.macro load_chroma_yuv422p
pld [r10, #64*3]
vld1.8 d2, [r6]! @ d2: chroma red line
vld1.8 d3, [r10]! @ d3: chroma blue line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
.macro increment_and_test_nv12
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
add r6, r6, r7 @ srcC += paddingC
subs r1, r1, #2 @ height -= 2
.endm
.macro increment_and_test_nv21
increment_and_test_nv12
.endm
.macro increment_and_test_yuv420p
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
ldr r7, [sp, #116] @ r7 = linesizeU
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
add r6, r6, r7 @ srcU += paddingU
ldr r7, [sp, #124] @ r7 = linesizeV
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
add r10, r10, r7 @ srcV += paddingV
subs r1, r1, #2 @ height -= 2
.endm
.macro increment_and_test_yuv422p
add r6, r6, r7 @ srcU += paddingU
add r10,r10,r12 @ srcV += paddingV
subs r1, r1, #1 @ height -= 1
.endm
.macro process_nv12 ofmt
process_2l \ofmt
.endm
.macro process_nv21 ofmt
process_2l \ofmt
.endm
.macro process_yuv420p ofmt
process_2l \ofmt
.endm
.macro process_yuv422p ofmt
process_1l \ofmt
.endm
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
load_args_\ifmt
vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3)
vdup.16 q12, r9 @ q12 = y_offset
vmov d26, d0 @ q13 = y_coeff
vmov d27, d0 @ q13 = y_coeff
1:
mov r8, r0 @ r8 = width
2:
pld [r6, #64*3]
pld [r4, #64*3]
vmov.i8 d10, #128
load_chroma_\ifmt
process_\ifmt \ofmt
subs r8, r8, #16 @ width -= 16
bgt 2b
add r2, r2, r3 @ dst += padding
add r4, r4, r5 @ srcY += paddingY
increment_and_test_\ifmt
bgt 1b
vpop {q4-q7}
pop {r4-r12, lr}
mov pc, lr
endfunc
.endm
.macro declare_rgb_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
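
For reference, the fixed-point pipeline implemented by the compute/compute_premult/compute_color macros above can be modeled in scalar C. The sketch below is illustrative only: the coefficient names mirror the comments in the assembly (y_offset, y_coeff, and the four table entries v2r, u2g, v2g, u2b), and vqdmulh16() models VQDMULH as (2*a*b) >> 16 with saturation omitted.

#include <stdint.h>

static inline int vqdmulh16(int a, int b) { return (2 * a * b) >> 16; }
static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Illustrative scalar model of one pixel of the NEON path above. */
static void yuv_to_rgb_pixel(uint8_t Y, uint8_t U, uint8_t V,
                             int y_offset, int y_coeff,
                             int v2r, int u2g, int v2g, int u2b,
                             uint8_t *r, uint8_t *g, uint8_t *b)
{
    int y = vqdmulh16((Y << 3) - y_offset, y_coeff); /* vshll #3, vsub, vqdmulh */
    int u = (U << 3) - (128 << 3);                   /* compute_premult */
    int v = (V << 3) - (128 << 3);

    /* compute_color: add premultiplied chroma, then vqrshrun #1
     * (round, shift right by 1, saturate to unsigned 8 bit) */
    *r = clip_u8((y + vqdmulh16(v, v2r) + 1) >> 1);
    *g = clip_u8((y + vqdmulh16(u, u2g) + vqdmulh16(v, v2g) + 1) >> 1);
    *b = clip_u8((y + vqdmulh16(u, u2b) + 1) >> 1);
}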

@@ -0,0 +1,338 @@
/*
* Bayer-to-RGB/YV12 template
* Copyright (c) 2011-2014 Peter Ross <pross@xvid.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#if defined(BAYER_BGGR) || defined(BAYER_GBRG)
#define BAYER_R 0
#define BAYER_G 1
#define BAYER_B 2
#endif
#if defined(BAYER_RGGB) || defined(BAYER_GRBG)
#define BAYER_R 2
#define BAYER_G 1
#define BAYER_B 0
#endif
#if defined(BAYER_8)
#define BAYER_READ(x) (x)
#define BAYER_SIZEOF 1
#define BAYER_SHIFT 0
#endif
#if defined(BAYER_16LE)
#define BAYER_READ(x) AV_RL16(&(x))
#define BAYER_SIZEOF 2
#define BAYER_SHIFT 8
#endif
#if defined(BAYER_16BE)
#define BAYER_READ(x) AV_RB16(&(x))
#define BAYER_SIZEOF 2
#define BAYER_SHIFT 8
#endif
#define S(y, x) BAYER_READ(src[(y)*src_stride + BAYER_SIZEOF*(x)])
#define T(y, x) (unsigned int)S(y, x)
#define R(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_R]
#define G(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_G]
#define B(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_B]
#if defined(BAYER_BGGR) || defined(BAYER_RGGB)
#define BAYER_TO_RGB24_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 1) >> BAYER_SHIFT; \
\
G(0, 1) = S(0, 1) >> BAYER_SHIFT; \
G(0, 0) = \
G(1, 1) = (T(0, 1) + T(1, 0)) >> (1 + BAYER_SHIFT); \
G(1, 0) = S(1, 0) >> BAYER_SHIFT; \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 0) >> BAYER_SHIFT;
#define BAYER_TO_RGB24_INTERPOLATE \
R(0, 0) = (T(-1, -1) + T(-1, 1) + T(1, -1) + T(1, 1)) >> (2 + BAYER_SHIFT); \
G(0, 0) = (T(-1, 0) + T( 0, -1) + T(0, 1) + T(1, 0)) >> (2 + BAYER_SHIFT); \
B(0, 0) = S(0, 0) >> BAYER_SHIFT; \
\
R(0, 1) = (T(-1, 1) + T(1, 1)) >> (1 + BAYER_SHIFT); \
G(0, 1) = S(0, 1) >> BAYER_SHIFT; \
B(0, 1) = (T(0, 0) + T(0, 2)) >> (1 + BAYER_SHIFT); \
\
R(1, 0) = (T(1, -1) + T(1, 1)) >> (1 + BAYER_SHIFT); \
G(1, 0) = S(1, 0) >> BAYER_SHIFT; \
B(1, 0) = (T(0, 0) + T(2, 0)) >> (1 + BAYER_SHIFT); \
\
R(1, 1) = S(1, 1) >> BAYER_SHIFT; \
G(1, 1) = (T(0, 1) + T(1, 0) + T(1, 2) + T(2, 1)) >> (2 + BAYER_SHIFT); \
B(1, 1) = (T(0, 0) + T(0, 2) + T(2, 0) + T(2, 2)) >> (2 + BAYER_SHIFT);
#else
#define BAYER_TO_RGB24_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 0) >> BAYER_SHIFT; \
\
G(0, 0) = S(0, 0) >> BAYER_SHIFT; \
G(1, 1) = S(1, 1) >> BAYER_SHIFT; \
G(0, 1) = \
G(1, 0) = (T(0, 0) + T(1, 1)) >> (1 + BAYER_SHIFT); \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 1) >> BAYER_SHIFT;
#define BAYER_TO_RGB24_INTERPOLATE \
R(0, 0) = (T(-1, 0) + T(1, 0)) >> (1 + BAYER_SHIFT); \
G(0, 0) = S(0, 0) >> BAYER_SHIFT; \
B(0, 0) = (T(0, -1) + T(0, 1)) >> (1 + BAYER_SHIFT); \
\
R(0, 1) = (T(-1, 0) + T(-1, 2) + T(1, 0) + T(1, 2)) >> (2 + BAYER_SHIFT); \
G(0, 1) = (T(-1, 1) + T(0, 0) + T(0, 2) + T(1, 1)) >> (2 + BAYER_SHIFT); \
B(0, 1) = S(0, 1) >> BAYER_SHIFT; \
\
R(1, 0) = S(1, 0) >> BAYER_SHIFT; \
G(1, 0) = (T(0, 0) + T(1, -1) + T(1, 1) + T(2, 0)) >> (2 + BAYER_SHIFT); \
B(1, 0) = (T(0, -1) + T(0, 1) + T(2, -1) + T(2, 1)) >> (2 + BAYER_SHIFT); \
\
R(1, 1) = (T(1, 0) + T(1, 2)) >> (1 + BAYER_SHIFT); \
G(1, 1) = S(1, 1) >> BAYER_SHIFT; \
B(1, 1) = (T(0, 1) + T(2, 1)) >> (1 + BAYER_SHIFT);
#endif
#if defined(BAYER_BGGR) || defined(BAYER_RGGB)
#define BAYER_TO_RGB48_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 1); \
\
G(0, 1) = S(0, 1); \
G(0, 0) = \
G(1, 1) = (T(0, 1) + T(1, 0)) >> 1; \
G(1, 0) = S(1, 0); \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 0);
#define BAYER_TO_RGB48_INTERPOLATE \
R(0, 0) = (T(-1, -1) + T(-1, 1) + T(1, -1) + T(1, 1)) >> 2; \
G(0, 0) = (T(-1, 0) + T( 0, -1) + T(0, 1) + T(1, 0)) >> 2; \
B(0, 0) = S(0, 0); \
\
R(0, 1) = (T(-1, 1) + T(1, 1)) >> 1; \
G(0, 1) = S(0, 1); \
B(0, 1) = (T(0, 0) + T(0, 2)) >> 1; \
\
R(1, 0) = (T(1, -1) + T(1, 1)) >> 1; \
G(1, 0) = S(1, 0); \
B(1, 0) = (T(0, 0) + T(2, 0)) >> 1; \
\
R(1, 1) = S(1, 1); \
G(1, 1) = (T(0, 1) + T(1, 0) + T(1, 2) + T(2, 1)) >> 2; \
B(1, 1) = (T(0, 0) + T(0, 2) + T(2, 0) + T(2, 2)) >> 2;
#else
#define BAYER_TO_RGB48_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 0); \
\
G(0, 0) = S(0, 0); \
G(1, 1) = S(1, 1); \
G(0, 1) = \
G(1, 0) = (T(0, 0) + T(1, 1)) >> 1; \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 1);
#define BAYER_TO_RGB48_INTERPOLATE \
R(0, 0) = (T(-1, 0) + T(1, 0)) >> 1; \
G(0, 0) = S(0, 0); \
B(0, 0) = (T(0, -1) + T(0, 1)) >> 1; \
\
R(0, 1) = (T(-1, 0) + T(-1, 2) + T(1, 0) + T(1, 2)) >> 2; \
G(0, 1) = (T(-1, 1) + T(0, 0) + T(0, 2) + T(1, 1)) >> 2; \
B(0, 1) = S(0, 1); \
\
R(1, 0) = S(1, 0); \
G(1, 0) = (T(0, 0) + T(1, -1) + T(1, 1) + T(2, 0)) >> 2; \
B(1, 0) = (T(0, -1) + T(0, 1) + T(2, -1) + T(2, 1)) >> 2; \
\
R(1, 1) = (T(1, 0) + T(1, 2)) >> 1; \
G(1, 1) = S(1, 1); \
B(1, 1) = (T(0, 1) + T(2, 1)) >> 1;
#endif
/**
* invoke ff_rgb24toyv12 for 2x2 pixels
*/
#define rgb24toyv12_2x2(src, dstY, dstU, dstV, luma_stride, src_stride, rgb2yuv) \
ff_rgb24toyv12(src, dstY, dstV, dstU, 2, 2, luma_stride, 0, src_stride, rgb2yuv)
static void BAYER_RENAME(rgb24_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width)
{
int i;
for (i = 0 ; i < width; i+= 2) {
BAYER_TO_RGB24_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
}
}
static void BAYER_RENAME(rgb24_interpolate)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width)
{
int i;
BAYER_TO_RGB24_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
for (i = 2 ; i < width - 2; i+= 2) {
BAYER_TO_RGB24_INTERPOLATE
src += 2 * BAYER_SIZEOF;
dst += 6;
}
if (width > 2) {
BAYER_TO_RGB24_COPY
}
}
static void BAYER_RENAME(rgb48_copy)(const uint8_t *src, int src_stride, uint8_t *ddst, int dst_stride, int width)
{
uint16_t *dst = (uint16_t *)ddst;
int i;
dst_stride /= 2;
for (i = 0 ; i < width; i+= 2) {
BAYER_TO_RGB48_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
}
}
static void BAYER_RENAME(rgb48_interpolate)(const uint8_t *src, int src_stride, uint8_t *ddst, int dst_stride, int width)
{
uint16_t *dst = (uint16_t *)ddst;
int i;
dst_stride /= 2;
BAYER_TO_RGB48_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
for (i = 2 ; i < width - 2; i+= 2) {
BAYER_TO_RGB48_INTERPOLATE
src += 2 * BAYER_SIZEOF;
dst += 6;
}
if (width > 2) {
BAYER_TO_RGB48_COPY
}
}
static void BAYER_RENAME(yv12_copy)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv)
{
uint8_t dst[12];
const int dst_stride = 6;
int i;
for (i = 0 ; i < width; i+= 2) {
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
}
}
static void BAYER_RENAME(yv12_interpolate)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv)
{
uint8_t dst[12];
const int dst_stride = 6;
int i;
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
for (i = 2 ; i < width - 2; i+= 2) {
BAYER_TO_RGB24_INTERPOLATE
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
}
if (width > 2) {
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
}
}
#undef S
#undef T
#undef R
#undef G
#undef B
#undef BAYER_TO_RGB24_COPY
#undef BAYER_TO_RGB24_INTERPOLATE
#undef BAYER_TO_RGB48_COPY
#undef BAYER_TO_RGB48_INTERPOLATE
#undef BAYER_RENAME
#undef BAYER_R
#undef BAYER_G
#undef BAYER_B
#undef BAYER_READ
#undef BAYER_SIZEOF
#undef BAYER_SHIFT
#if defined(BAYER_BGGR)
#undef BAYER_BGGR
#endif
#if defined(BAYER_RGGB)
#undef BAYER_RGGB
#endif
#if defined(BAYER_GBRG)
#undef BAYER_GBRG
#endif
#if defined(BAYER_GRBG)
#undef BAYER_GRBG
#endif
#if defined(BAYER_8)
#undef BAYER_8
#endif
#if defined(BAYER_16LE)
#undef BAYER_16LE
#endif
#if defined(BAYER_16BE)
#undef BAYER_16BE
#endif
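
This template is instantiated once per Bayer pattern and bit depth: the including file defines the pattern, the sample-size macro, and BAYER_RENAME before each #include, and the #undef block above resets everything for the next instantiation. A sketch of a typical instantiation (the renamed function names here are illustrative, not copied from the including file):

/* Illustrative instantiation, as done by the file that includes this template. */
#define BAYER_BGGR
#define BAYER_8
#define BAYER_RENAME(x) bayer_bggr8_ ## x
#include "bayer_template.c"   /* emits bayer_bggr8_rgb24_copy(), etc. */

#define BAYER_BGGR
#define BAYER_16LE
#define BAYER_RENAME(x) bayer_bggr16le_ ## x
#include "bayer_template.c"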

72
externals/ffmpeg/ffmpeg/libswscale/gamma.c vendored Executable file
@@ -0,0 +1,72 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
typedef struct GammaContext
{
uint16_t *table;
} GammaContext;
// gamma_convert expects a 16-bit RGB format.
// It writes directly into the src slice, so the slice must be modifiable
// (this is ensured through the cascade context).
static int gamma_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
GammaContext *instance = desc->instance;
uint16_t *table = instance->table;
int srcW = desc->src->width;
int i;
for (i = 0; i < sliceH; ++i) {
uint8_t ** src = desc->src->plane[0].line;
int src_pos = sliceY+i - desc->src->plane[0].sliceY;
uint16_t *src1 = (uint16_t*)*(src+src_pos);
int j;
for (j = 0; j < srcW; ++j) {
uint16_t r = AV_RL16(src1 + j*4 + 0);
uint16_t g = AV_RL16(src1 + j*4 + 1);
uint16_t b = AV_RL16(src1 + j*4 + 2);
AV_WL16(src1 + j*4 + 0, table[r]);
AV_WL16(src1 + j*4 + 1, table[g]);
AV_WL16(src1 + j*4 + 2, table[b]);
}
}
return sliceH;
}
int ff_init_gamma_convert(SwsFilterDescriptor *desc, SwsSlice * src, uint16_t *table)
{
GammaContext *li = av_malloc(sizeof(GammaContext));
if (!li)
return AVERROR(ENOMEM);
li->table = table;
desc->instance = li;
desc->src = src;
desc->dst = NULL;
desc->process = &gamma_convert;
return 0;
}
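
ff_init_gamma_convert() only stores a caller-supplied table; building the 65536-entry lookup is the caller's job. A minimal sketch of how such a table could be filled (the power-curve formula is an assumption here, not copied from swscale):

#include <math.h>
#include <stdint.h>

/* Hypothetical helper: map each 16-bit value through a power curve. */
static void fill_gamma_table(uint16_t *table, double gamma)
{
    for (int i = 0; i < 65536; i++)
        table[i] = (uint16_t)lrint(65535.0 * pow(i / 65535.0, gamma));
}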

289
externals/ffmpeg/ffmpeg/libswscale/hscale.c vendored Executable file
@@ -0,0 +1,289 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
/// Scaler instance data
typedef struct FilterContext
{
uint16_t *filter;
int *filter_pos;
int filter_size;
int xInc;
} FilterContext;
/// Color conversion instance data
typedef struct ColorContext
{
uint32_t *pal;
} ColorContext;
static int lum_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
FilterContext *instance = desc->instance;
int srcW = desc->src->width;
int dstW = desc->dst->width;
int xInc = instance->xInc;
int i;
for (i = 0; i < sliceH; ++i) {
uint8_t ** src = desc->src->plane[0].line;
uint8_t ** dst = desc->dst->plane[0].line;
int src_pos = sliceY+i - desc->src->plane[0].sliceY;
int dst_pos = sliceY+i - desc->dst->plane[0].sliceY;
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
if (c->lumConvertRange)
c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
desc->dst->plane[0].sliceH += 1;
if (desc->alpha) {
src = desc->src->plane[3].line;
dst = desc->dst->plane[3].line;
src_pos = sliceY+i - desc->src->plane[3].sliceY;
dst_pos = sliceY+i - desc->dst->plane[3].sliceY;
desc->dst->plane[3].sliceH += 1;
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
}
}
return sliceH;
}
static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
int srcW = desc->src->width;
ColorContext * instance = desc->instance;
uint32_t * pal = instance->pal;
int i;
desc->dst->plane[0].sliceY = sliceY;
desc->dst->plane[0].sliceH = sliceH;
desc->dst->plane[3].sliceY = sliceY;
desc->dst->plane[3].sliceH = sliceH;
for (i = 0; i < sliceH; ++i) {
int sp0 = sliceY+i - desc->src->plane[0].sliceY;
int sp1 = ((sliceY+i) >> desc->src->v_chr_sub_sample) - desc->src->plane[1].sliceY;
const uint8_t * src[4] = { desc->src->plane[0].line[sp0],
desc->src->plane[1].line[sp1],
desc->src->plane[2].line[sp1],
desc->src->plane[3].line[sp0]};
uint8_t * dst = desc->dst->plane[0].line[i];
if (c->lumToYV12) {
c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
} else if (c->readLumPlanar) {
c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
}
if (desc->alpha) {
dst = desc->dst->plane[3].line[i];
if (c->alpToYV12) {
c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
} else if (c->readAlpPlanar) {
c->readAlpPlanar(dst, src, srcW, NULL);
}
}
}
return sliceH;
}
int ff_init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
ColorContext * li = av_malloc(sizeof(ColorContext));
if (!li)
return AVERROR(ENOMEM);
li->pal = pal;
desc->instance = li;
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src =src;
desc->dst = dst;
desc->process = &lum_convert;
return 0;
}
int ff_init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
FilterContext *li = av_malloc(sizeof(FilterContext));
if (!li)
return AVERROR(ENOMEM);
li->filter = filter;
li->filter_pos = filter_pos;
li->filter_size = filter_size;
li->xInc = xInc;
desc->instance = li;
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
desc->process = &lum_h_scale;
return 0;
}
static int chr_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
FilterContext *instance = desc->instance;
int srcW = AV_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
int dstW = AV_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
int xInc = instance->xInc;
uint8_t ** src1 = desc->src->plane[1].line;
uint8_t ** dst1 = desc->dst->plane[1].line;
uint8_t ** src2 = desc->src->plane[2].line;
uint8_t ** dst2 = desc->dst->plane[2].line;
int src_pos1 = sliceY - desc->src->plane[1].sliceY;
int dst_pos1 = sliceY - desc->dst->plane[1].sliceY;
int src_pos2 = sliceY - desc->src->plane[2].sliceY;
int dst_pos2 = sliceY - desc->dst->plane[2].sliceY;
int i;
for (i = 0; i < sliceH; ++i) {
if (c->hcscale_fast) {
c->hcscale_fast(c, (uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW, src1[src_pos1+i], src2[src_pos2+i], srcW, xInc);
} else {
c->hcScale(c, (uint16_t*)dst1[dst_pos1+i], dstW, src1[src_pos1+i], instance->filter, instance->filter_pos, instance->filter_size);
c->hcScale(c, (uint16_t*)dst2[dst_pos2+i], dstW, src2[src_pos2+i], instance->filter, instance->filter_pos, instance->filter_size);
}
if (c->chrConvertRange)
c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
desc->dst->plane[1].sliceH += 1;
desc->dst->plane[2].sliceH += 1;
}
return sliceH;
}
static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
int srcW = AV_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
ColorContext * instance = desc->instance;
uint32_t * pal = instance->pal;
int sp0 = (sliceY - (desc->src->plane[0].sliceY >> desc->src->v_chr_sub_sample)) << desc->src->v_chr_sub_sample;
int sp1 = sliceY - desc->src->plane[1].sliceY;
int i;
desc->dst->plane[1].sliceY = sliceY;
desc->dst->plane[1].sliceH = sliceH;
desc->dst->plane[2].sliceY = sliceY;
desc->dst->plane[2].sliceH = sliceH;
for (i = 0; i < sliceH; ++i) {
const uint8_t * src[4] = { desc->src->plane[0].line[sp0+i],
desc->src->plane[1].line[sp1+i],
desc->src->plane[2].line[sp1+i],
desc->src->plane[3].line[sp0+i]};
uint8_t * dst1 = desc->dst->plane[1].line[i];
uint8_t * dst2 = desc->dst->plane[2].line[i];
if (c->chrToYV12) {
c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
} else if (c->readChrPlanar) {
c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
}
}
return sliceH;
}
int ff_init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
ColorContext * li = av_malloc(sizeof(ColorContext));
if (!li)
return AVERROR(ENOMEM);
li->pal = pal;
desc->instance = li;
desc->src =src;
desc->dst = dst;
desc->process = &chr_convert;
return 0;
}
int ff_init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
FilterContext *li = av_malloc(sizeof(FilterContext));
if (!li)
return AVERROR(ENOMEM);
li->filter = filter;
li->filter_pos = filter_pos;
li->filter_size = filter_size;
li->xInc = xInc;
desc->instance = li;
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
desc->process = &chr_h_scale;
return 0;
}
static int no_chr_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
desc->dst->plane[1].sliceY = sliceY + sliceH - desc->dst->plane[1].available_lines;
desc->dst->plane[1].sliceH = desc->dst->plane[1].available_lines;
desc->dst->plane[2].sliceY = sliceY + sliceH - desc->dst->plane[2].available_lines;
desc->dst->plane[2].sliceH = desc->dst->plane[2].available_lines;
return 0;
}
int ff_init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst)
{
desc->src = src;
desc->dst = dst;
desc->alpha = 0;
desc->instance = NULL;
desc->process = &no_chr_scale;
return 0;
}
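
Each ff_init_desc_* function above fills in one SwsFilterDescriptor; the scaler core then runs the descriptors in order over every slice through the process callback. A simplified sketch of that dispatch (the chain layout is an assumption based on the descriptor interface shown here):

/* Illustrative only: run a descriptor chain over one slice of sliceH lines. */
static void run_descriptors(SwsContext *c, SwsFilterDescriptor *desc,
                            int num_desc, int sliceY, int sliceH)
{
    for (int i = 0; i < num_desc; i++)
        desc[i].process(c, &desc[i], sliceY, sliceH);
}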

@@ -0,0 +1,55 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc)
{
int i;
unsigned int xpos = 0;
for (i = 0; i < dstWidth; i++) {
register unsigned int xx = xpos >> 16;
register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
xpos += xInc;
}
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
dst[i] = src[srcW-1]*128;
}
void ff_hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
int i;
unsigned int xpos = 0;
for (i = 0; i < dstWidth; i++) {
register unsigned int xx = xpos >> 16;
register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
xpos += xInc;
}
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
dst1[i] = src1[srcW-1]*128;
dst2[i] = src2[srcW-1]*128;
}
}
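
Both loops walk the source with a 16.16 fixed-point position: xx is the integer sample index and xalpha keeps the top 7 fractional bits as the blend weight, so dst holds 15-bit values. A small worked example (the xInc formula is the usual srcW/dstW approximation, not taken from this file):

#include <stdio.h>

int main(void)
{
    int srcW = 640, dstW = 160;                     /* 4:1 downscale */
    unsigned xInc = ((unsigned)srcW << 16) / dstW;  /* 0x40000 in 16.16 */
    unsigned xpos = 3 * xInc;                       /* position of dst[3] */

    /* integer index 12, fractional blend weight 0 (exactly on a sample) */
    printf("xx=%u xalpha=%u\n", xpos >> 16, (xpos & 0xFFFF) >> 9);
    return 0;
}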

1750
externals/ffmpeg/ffmpeg/libswscale/input.c vendored Executable file

File diff suppressed because it is too large

@@ -0,0 +1,7 @@
LIBSWSCALE_MAJOR {
global:
swscale_*;
sws_*;
local:
*;
};

@@ -0,0 +1 @@
#include "libavutil/log2_tab.c"

98
externals/ffmpeg/ffmpeg/libswscale/options.c vendored Executable file
@@ -0,0 +1,98 @@
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/avutil.h"
#include "libavutil/opt.h"
#include "libavutil/pixfmt.h"
#include "swscale.h"
#include "swscale_internal.h"
static const char *sws_context_to_name(void *ptr)
{
return "swscaler";
}
#define OFFSET(x) offsetof(SwsContext, x)
#define DEFAULT 0
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
static const AVOption swscale_options[] = {
{ "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = SWS_BICUBIC }, 0, UINT_MAX, VE, "sws_flags" },
{ "fast_bilinear", "fast bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FAST_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBIC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "experimental", "experimental", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_X }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_POINT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "area", "averaging area", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_AREA }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicublin", "luma bicubic, chroma bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBLIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "gauss", "Gaussian", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_GAUSS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "sinc", "sinc", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SINC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "lanczos", "Lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_LANCZOS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "spline", "natural bicubic spline", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "print_info", "print info", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "accurate_rnd", "accurate rounding", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_int", "full chroma interpolation", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_inp", "full chroma input", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bitexact", "", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "error_diffusion", "error diffusion dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ERROR_DIFFUSION}, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "srcw", "source width", OFFSET(srcW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "srch", "source height", OFFSET(srcH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dstw", "destination width", OFFSET(dstW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dsth", "destination height", OFFSET(dstH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "src_format", "source format", OFFSET(srcFormat), AV_OPT_TYPE_PIXEL_FMT,{ .i64 = DEFAULT }, 0, INT_MAX, VE },
{ "dst_format", "destination format", OFFSET(dstFormat), AV_OPT_TYPE_PIXEL_FMT,{ .i64 = DEFAULT }, 0, INT_MAX, VE },
{ "src_range", "source is full range", OFFSET(srcRange), AV_OPT_TYPE_BOOL, { .i64 = DEFAULT }, 0, 1, VE },
{ "dst_range", "destination is full range", OFFSET(dstRange), AV_OPT_TYPE_BOOL, { .i64 = DEFAULT }, 0, 1, VE },
{ "param0", "scaler param 0", OFFSET(param[0]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "param1", "scaler param 1", OFFSET(param[1]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "src_v_chr_pos", "source vertical chroma position in luma grid/256" , OFFSET(src_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "src_h_chr_pos", "source horizontal chroma position in luma grid/256", OFFSET(src_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "dst_v_chr_pos", "destination vertical chroma position in luma grid/256" , OFFSET(dst_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "dst_h_chr_pos", "destination horizontal chroma position in luma grid/256", OFFSET(dst_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "sws_dither", "set dithering algorithm", OFFSET(dither), AV_OPT_TYPE_INT, { .i64 = SWS_DITHER_AUTO }, 0, NB_SWS_DITHER, VE, "sws_dither" },
{ "auto", "leave choice to sws", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_AUTO }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "bayer", "bayer dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_BAYER }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "ed", "error diffusion", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_ED }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "a_dither", "arithmetic addition dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_A_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "x_dither", "arithmetic xor dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_X_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "gamma", "gamma correct scaling", OFFSET(gamma_flag),AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
{ "alphablend", "mode for alpha -> non alpha", OFFSET(alphablend),AV_OPT_TYPE_INT, { .i64 = SWS_ALPHA_BLEND_NONE}, 0, SWS_ALPHA_BLEND_NB-1, VE, "alphablend" },
{ "none", "ignore alpha", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_NONE}, INT_MIN, INT_MAX, VE, "alphablend" },
{ "uniform_color", "blend onto a uniform color", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_UNIFORM},INT_MIN, INT_MAX, VE, "alphablend" },
{ "checkerboard", "blend onto a checkerboard", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_CHECKERBOARD},INT_MIN, INT_MAX, VE, "alphablend" },
{ NULL }
};
const AVClass ff_sws_context_class = {
.class_name = "SWScaler",
.item_name = sws_context_to_name,
.option = swscale_options,
.category = AV_CLASS_CATEGORY_SWSCALER,
.version = LIBAVUTIL_VERSION_INT,
};
const AVClass *sws_get_class(void)
{
return &ff_sws_context_class;
}
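
These options are set through the AVOptions API on a context from sws_alloc_context(), then activated with sws_init_context(). A minimal usage sketch (error checks on the av_opt_set* calls omitted):

#include <libavutil/opt.h>
#include <libavutil/pixfmt.h>
#include <libswscale/swscale.h>

struct SwsContext *make_scaler(void)
{
    struct SwsContext *ctx = sws_alloc_context();
    if (!ctx)
        return NULL;

    av_opt_set_int(ctx, "srcw", 1920, 0);
    av_opt_set_int(ctx, "srch", 1080, 0);
    av_opt_set_int(ctx, "dstw", 1280, 0);
    av_opt_set_int(ctx, "dsth", 720, 0);
    av_opt_set_pixel_fmt(ctx, "src_format", AV_PIX_FMT_YUV420P, 0);
    av_opt_set_pixel_fmt(ctx, "dst_format", AV_PIX_FMT_RGB24, 0);
    av_opt_set(ctx, "sws_flags", "bicubic+accurate_rnd", 0);

    if (sws_init_context(ctx, NULL, NULL) < 0) {
        sws_freeContext(ctx);
        return NULL;
    }
    return ctx;
}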

3040
externals/ffmpeg/ffmpeg/libswscale/output.c vendored Executable file

File diff suppressed because it is too large

@@ -0,0 +1,4 @@
OBJS += ppc/swscale_altivec.o \
ppc/yuv2rgb_altivec.o \
ppc/yuv2yuv_altivec.o \
ppc/swscale_vsx.o

@@ -0,0 +1,290 @@
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#if HAVE_ALTIVEC
#if HAVE_BIGENDIAN
#define vzero vec_splat_s32(0)
#define GET_LS(a,b,c,s) {\
vector signed short l2 = vec_ld(((b) << 1) + 16, s);\
ls = vec_perm(a, l2, c);\
a = l2;\
}
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
vector signed short ls;\
vector signed int vf1, vf2, i1, i2;\
GET_LS(l1, x, perm, src);\
i1 = vec_mule(filter, ls);\
i2 = vec_mulo(filter, ls);\
vf1 = vec_mergeh(i1, i2);\
vf2 = vec_mergel(i1, i2);\
d1 = vec_add(d1, vf1);\
d2 = vec_add(d2, vf2);\
} while (0)
#define LOAD_FILTER(vf,f) {\
vector unsigned char perm0 = vec_lvsl(joffset, f);\
vf = vec_ld(joffset, f);\
vf = vec_perm(vf, vf, perm0);\
}
#define LOAD_L1(ll1,s,p){\
p = vec_lvsl(xoffset, s);\
ll1 = vec_ld(xoffset, s);\
}
// The 3 in the "a << 3" below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
// The neat trick: we only care about half the elements,
// high or low depending on (i << 3) % 16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(a, vf, f) {\
vf = vec_ld(a<< 3, f);\
if ((a << 3) % 16)\
vf = vec_mergel(vf, (vector signed short)vzero);\
else\
vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {\
sv = vec_ld(pos, s);\
per = vec_lvsl(pos, s);\
}
#define UPDATE_PTR(s0, d0, s1, d1) {\
d0 = s0;\
d1 = s1;\
}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
v1 = vec_ld(pos + a + 16, s);\
vf = vec_perm(v0, v1, per);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
if ((((uintptr_t)s + pos) % 16) > 8) {\
v1 = vec_ld(pos + a + 16, s);\
}\
vf = vec_perm(v0, src_v1, per);\
}
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
vf = vec_perm(vf0, vf1, per);\
}
#define FUNC(name) name ## _altivec
#include "swscale_ppc_template.c"
#undef FUNC
#undef vzero
#endif /* HAVE_BIGENDIAN */
#define output_pixel(pos, val, bias, signedness) \
if (big_endian) { \
AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
} else { \
AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
}
static void
yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)
{
static const int big_endian = HAVE_BIGENDIAN;
static const int shift = 3;
static const float float_mult = 1.0f / 65535.0f;
int i, val;
uint16_t val_uint;
for (i = start; i < dstW; ++i){
val = src[i] + (1 << (shift - 1));
output_pixel(&val_uint, val, 0, uint);
dest[i] = float_mult * (float)val_uint;
}
}
static void
yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start)
{
static const int big_endian = HAVE_BIGENDIAN;
static const int shift = 3;
static const float float_mult = 1.0f / 65535.0f;
int i, val;
uint16_t val_uint;
for (i = start; i < dstW; ++i){
val = src[i] + (1 << (shift - 1));
output_pixel(&val_uint, val, 0, uint);
dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));
}
}
static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)
{
const int dst_u = -(uintptr_t)dest & 3;
const int shift = 3;
const int add = (1 << (shift - 1));
const int clip = (1 << 16) - 1;
const float fmult = 1.0f / 65535.0f;
const vec_u32 vadd = (vec_u32) {add, add, add, add};
const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
const vec_f vzero = (vec_f) {0, 0, 0, 0};
vec_u32 v;
vec_f vd;
int i;
yuv2plane1_float_u(src, dest, dst_u, 0);
for (i = dst_u; i < dstW - 3; i += 4) {
v = vec_ld(0, (const uint32_t *) &src[i]);
v = vec_add(v, vadd);
v = vec_sr(v, vshift);
v = vec_min(v, vlargest);
vd = vec_ctf(v, 0);
vd = vec_madd(vd, vmul, vzero);
vec_st(vd, 0, &dest[i]);
}
yuv2plane1_float_u(src, dest, dstW, i);
}
static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW)
{
const int dst_u = -(uintptr_t)dest & 3;
const int shift = 3;
const int add = (1 << (shift - 1));
const int clip = (1 << 16) - 1;
const float fmult = 1.0f / 65535.0f;
const vec_u32 vadd = (vec_u32) {add, add, add, add};
const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
const vec_f vzero = (vec_f) {0, 0, 0, 0};
const vec_u32 vswapbig = (vec_u32) {16, 16, 16, 16};
const vec_u16 vswapsmall = vec_splat_u16(8);
vec_u32 v;
vec_f vd;
int i;
yuv2plane1_float_bswap_u(src, dest, dst_u, 0);
for (i = dst_u; i < dstW - 3; i += 4) {
v = vec_ld(0, (const uint32_t *) &src[i]);
v = vec_add(v, vadd);
v = vec_sr(v, vshift);
v = vec_min(v, vlargest);
vd = vec_ctf(v, 0);
vd = vec_madd(vd, vmul, vzero);
vd = (vec_f) vec_rl((vec_u32) vd, vswapbig);
vd = (vec_f) vec_rl((vec_u16) vd, vswapsmall);
vec_st(vd, 0, (float *) &dest[i]);
}
yuv2plane1_float_bswap_u(src, dest, dstW, i);
}
#define yuv2plane1_float(template, dest_type, BE_LE) \
static void yuv2plane1_float ## BE_LE ## _altivec(const int16_t *src, uint8_t *dest, \
int dstW, \
const uint8_t *dither, int offset) \
{ \
template((const int32_t *)src, (dest_type *)dest, dstW); \
}
#if HAVE_BIGENDIAN
yuv2plane1_float(yuv2plane1_float_altivec, float, BE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, LE)
#else
yuv2plane1_float(yuv2plane1_float_altivec, float, LE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, BE)
#endif
#endif /* HAVE_ALTIVEC */
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
enum AVPixelFormat dstFormat = c->dstFormat;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
#if HAVE_BIGENDIAN
if (c->srcBpc == 8 && c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale_real_altivec;
}
if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
!c->needAlpha) {
c->yuv2planeX = yuv2planeX_altivec;
}
#endif
if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
c->yuv2plane1 = yuv2plane1_floatBE_altivec;
} else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
c->yuv2plane1 = yuv2plane1_floatLE_altivec;
}
/* The following list of supported dstFormat values should
* match what's found in the body of ff_yuv2packedX_altivec() */
if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
switch (c->dstFormat) {
case AV_PIX_FMT_ABGR:
c->yuv2packedX = ff_yuv2abgr_X_altivec;
break;
case AV_PIX_FMT_BGRA:
c->yuv2packedX = ff_yuv2bgra_X_altivec;
break;
case AV_PIX_FMT_ARGB:
c->yuv2packedX = ff_yuv2argb_X_altivec;
break;
case AV_PIX_FMT_RGBA:
c->yuv2packedX = ff_yuv2rgba_X_altivec;
break;
case AV_PIX_FMT_BGR24:
c->yuv2packedX = ff_yuv2bgr24_X_altivec;
break;
case AV_PIX_FMT_RGB24:
c->yuv2packedX = ff_yuv2rgb24_X_altivec;
break;
}
}
#endif /* HAVE_ALTIVEC */
ff_sws_init_swscale_vsx(c);
}

@@ -0,0 +1,221 @@
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem_internal.h"
static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest,
const uint8_t *dither, int offset, int x)
{
register int i, j;
LOCAL_ALIGNED(16, int, val, [16]);
vector signed int vo1, vo2, vo3, vo4;
vector unsigned short vs1, vs2;
vector unsigned char vf;
vector unsigned int altivec_vectorShiftInt19 =
vec_add(vec_splat_u32(10), vec_splat_u32(9));
for (i = 0; i < 16; i++)
val[i] = dither[(x + i + offset) & 7] << 12;
vo1 = vec_ld(0, val);
vo2 = vec_ld(16, val);
vo3 = vec_ld(32, val);
vo4 = vec_ld(48, val);
for (j = 0; j < filterSize; j++) {
unsigned int joffset=j<<1;
unsigned int xoffset=x<<1;
vector unsigned char av_unused perm;
vector signed short l1,vLumFilter;
LOAD_FILTER(vLumFilter,filter);
vLumFilter = vec_splat(vLumFilter, 0);
LOAD_L1(l1,src[j],perm);
yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
}
vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
vs1 = vec_packsu(vo1, vo2);
vs2 = vec_packsu(vo3, vo4);
vf = vec_packsu(vs1, vs2);
VEC_ST(vf, 0, dest);
}
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset, int x)
{
int i, j;
for (i = x; i < dstW; i++) {
int t = dither[(i + offset) & 7] << 12;
for (j = 0; j < filterSize; j++)
t += src[j][i] * filter[j];
dest[i] = av_clip_uint8(t >> 19);
}
}
static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
int dst_u = -(uintptr_t)dest & 15;
int i;
yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
for (i = dst_u; i < dstW - 15; i += 16)
FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
offset, i);
yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}
static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
register int i;
LOCAL_ALIGNED(16, int, tempo, [4]);
if (filterSize % 4) {
for (i = 0; i < dstW; i++) {
register int j;
register int srcPos = filterPos[i];
register int val = 0;
for (j = 0; j < filterSize; j++)
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
}
} else
switch (filterSize) {
case 4:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_vF = unaligned_load(srcPos, src);
vector signed short src_v, filter_v;
vector signed int val_vEven, val_s;
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
// now put our elements in the even slots
src_v = vec_mergeh(src_v, (vector signed short)vzero);
GET_VF4(i, filter_v, filter);
val_vEven = vec_mule(src_v, filter_v);
val_s = vec_sums(val_vEven, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 8:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_vF, av_unused src_v0, av_unused src_v1;
vector unsigned char av_unused permS;
vector signed short src_v, filter_v;
vector signed int val_v, val_s;
FIRST_LOAD(src_v0, srcPos, src, permS);
LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
filter_v = vec_ld(i << 4, filter);
val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 16:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_vF = unaligned_load(srcPos, src);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
vector signed short filter_v0 = vec_ld(i << 5, filter);
vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
vector signed int val_s = vec_sums(val_v, vzero);
VEC_ST(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
default:
for (i = 0; i < dstW; i++) {
register int j, av_unused offset = i * 2 * filterSize;
register int srcPos = filterPos[i];
vector signed int val_s, val_v = (vector signed int)vzero;
vector signed short av_unused filter_v0R;
vector unsigned char av_unused permF, av_unused src_v0, av_unused permS;
FIRST_LOAD(filter_v0R, offset, filter, permF);
FIRST_LOAD(src_v0, srcPos, src, permS);
for (j = 0; j < filterSize - 15; j += 16) {
vector unsigned char av_unused src_v1, src_vF;
vector signed short av_unused filter_v1R, av_unused filter_v2R,
filter_v0, filter_v1, src_vA, src_vB;
vector signed int val_acc;
LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
src_vA = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
src_vB = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
val_acc = vec_msums(src_vA, filter_v0, val_v);
val_v = vec_msums(src_vB, filter_v1, val_acc);
UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
}
if (j < filterSize - 7) {
// loading src_v0 is useless, it's already done above
vector unsigned char av_unused src_v1, src_vF;
vector signed short src_v, av_unused filter_v1R, filter_v;
LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
val_v = vec_msums(src_v, filter_v, val_v);
}
val_s = vec_sums(val_v, vzero);
VEC_ST(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
}
}

File diff suppressed because it is too large

@@ -0,0 +1,866 @@
/*
* AltiVec acceleration for colorspace conversion
*
* copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
 * Convert I420/YV12 to RGB in various formats.
 * It rejects images that are not in a 4:2:0 format,
 * images whose width is not a multiple of 16,
 * and images whose height is not a multiple of 2.
 * Rejected cases fall back to the C simulation code.
*
* Lots of optimizations to be done here.
*
* 1. Need to fix saturation code. I just couldn't get it to fly with packs
* and adds, so we currently use max/min to clip.
*
* 2. The inefficient use of chroma loading needs a bit of brushing up.
*
* 3. Analysis of pipeline stalls needs to be done. Use shark to identify
* pipeline stalls.
*
*
 * MODIFIED to calculate coefficients from the currently selected color space.
 * MODIFIED core to be a macro where you specify the output format.
 * ADDED UYVY conversion, which is never called due to something in swscale.
 * CORRECTED algorithm selection to be strict on input formats.
 * ADDED runtime detection of AltiVec.
 *
 * ADDED altivec_yuv2packedX vertical scale + RGB converter
*
 * March 27, 2004
 * PERFORMANCE ANALYSIS
 *
 * The C version uses 25% of the processor, or ~250 MIPS, for D1 rawvideo
 * used as the test.
 * The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
 * same sequence.
 *
 * 720 * 480 * 30 = ~10 Mpixels/s
 *
 * so we have roughly 10 clocks per pixel. This is too high; something has
 * to be wrong.
*
* OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
* need for vec_min.
*
* OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
* have the input video frame, it was just decompressed so it probably resides
* in L1 caches. However, we are creating the output video stream. This needs
* to use the DSTST instruction to optimize for the cache. We couple this with
* the fact that we are not going to be visiting the input buffer again so we
* mark it Least Recently Used. This shaves 25% of the processor cycles off.
*
* Now memcpy is the largest mips consumer in the system, probably due
* to the inefficient X11 stuff.
*
 * GL libraries seem to be very slow on this machine, a 1.33 GHz PB running
 * Jaguar; this is not the case for my 1 GHz PB. I thought it might be
 * a versioning issue, but I have libGL.1.2.dylib on both
 * machines. (We need to figure this out now.)
*
* GL2 libraries work now with patch for RGB32.
*
* NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
*
* Integrated luma prescaling adjustment for saturation/contrast/brightness
* adjustment.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"
#if HAVE_ALTIVEC
#undef PROFILE_THE_BEAST
#undef INC_SCALING
typedef unsigned char ubyte;
typedef signed char sbyte;
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
* homogeneous vector registers x0,x1,x2 are interleaved with the
* following technique:
*
* o0 = vec_mergeh(x0, x1);
* o1 = vec_perm(o0, x2, perm_rgb_0);
* o2 = vec_perm(o0, x2, perm_rgb_1);
* o3 = vec_mergel(x0, x1);
* o4 = vec_perm(o3, o2, perm_rgb_2);
* o5 = vec_perm(o3, o2, perm_rgb_3);
*
* perm_rgb_0: o0(RG).h v1(B) --> o1*
* 0 1 2 3 4
* rgbr|gbrg|brgb|rgbr
* 0010 0100 1001 0010
* 0102 3145 2673 894A
*
* perm_rgb_1: o0(RG).h v1(B) --> o2
* 0 1 2 3 4
* gbrg|brgb|bbbb|bbbb
* 0100 1001 1111 1111
* B5CD 6EF7 89AB CDEF
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
* 0 1 2 3 4
* gbrg|brgb|rgbr|gbrg
* 1111 1111 0010 0100
* 89AB CDEF 0182 3945
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
* 0 1 2 3 4
* brgb|rgbr|gbrg|brgb
* 1001 0010 0100 1001
* a67b 89cA BdCD eEFf
*/
static const vector unsigned char
perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
#define vec_merge3(x2, x1, x0, y0, y1, y2) \
do { \
__typeof__(x0) o0, o2, o3; \
o0 = vec_mergeh(x0, x1); \
y0 = vec_perm(o0, x2, perm_rgb_0); \
o2 = vec_perm(o0, x2, perm_rgb_1); \
o3 = vec_mergel(x0, x1); \
y1 = vec_perm(o3, o2, perm_rgb_2); \
y2 = vec_perm(o3, o2, perm_rgb_3); \
} while (0)
#define vec_mstbgr24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x0, x1, x2, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
#define vec_mstrgb24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x2, x1, x0, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
/* pack the pixels in rgb0 format
* msb R
* lsb 0
*/
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
do { \
T _0, _1, _2, _3; \
_0 = vec_mergeh(x0, x1); \
_1 = vec_mergeh(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 0 * 16, (T *) ptr); \
vec_st(_3, 1 * 16, (T *) ptr); \
_0 = vec_mergel(x0, x1); \
_1 = vec_mergel(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 2 * 16, (T *) ptr); \
vec_st(_3, 3 * 16, (T *) ptr); \
ptr += 4; \
} while (0)
/*
 * | R |   | 1    0        1.4021 |   | Y  |
 * | G | = | 1   -0.3441  -0.7142 | x | Cb |
 * | B |   | 1    1.7718   0      |   | Cr |
 *
 * Y:     [-128, 127]
 * Cb/Cr: [-128, 127]
 *
 * Typical YUV conversion works on Y in [0, 255]; this version has been
 * optimized for JPEG decoding.
 */
#if HAVE_BIGENDIAN
#define vec_unh(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
#define vec_unl(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#else
#define vec_unh(x)(vector signed short) vec_mergeh(x,(__typeof__(x)) { 0 })
#define vec_unl(x)(vector signed short) vec_mergel(x,(__typeof__(x)) { 0 })
#endif
#define vec_clip_s16(x) \
vec_max(vec_min(x, ((vector signed short) { \
235, 235, 235, 235, 235, 235, 235, 235 })), \
((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
#define vec_packclp(x, y) \
(vector unsigned char) \
vec_packs((vector unsigned short) \
vec_max(x, ((vector signed short) { 0 })), \
(vector unsigned short) \
vec_max(y, ((vector signed short) { 0 })))
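/* Scalar model of vec_mradds as used below (illustrative; per-element
 * saturation omitted):
 *     mradds(a, b, c) = ((a * b + 0x4000) >> 15) + c
 * so cvtyuvtoRGB computes, per component:
 *     y = mradds(Y, CY, OY)
 *     R = mradds(V << CSHIFT, CRV, y)
 *     B = mradds(U << CSHIFT, CBU, y)
 *     G = mradds(V, CGV, mradds(U, CGU, y))
 */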
static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
vector signed short U, vector signed short V,
vector signed short *R, vector signed short *G,
vector signed short *B)
{
vector signed short vx, ux, uvx;
Y = vec_mradds(Y, c->CY, c->OY);
U = vec_sub(U, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
V = vec_sub(V, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
// ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
ux = vec_sl(U, c->CSHIFT);
*B = vec_mradds(ux, c->CBU, Y);
// vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
vx = vec_sl(V, c->CSHIFT);
*R = vec_mradds(vx, c->CRV, Y);
// uvx = ((CGU * u) + (CGV * v)) >> 15;
uvx = vec_mradds(U, c->CGU, Y);
*G = vec_mradds(V, c->CGV, uvx);
}
/*
* ------------------------------------------------------------------------------
* CS converters
* ------------------------------------------------------------------------------
*/
#if !HAVE_VSX
static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr)
{
const vector unsigned char *v_addr = (const vector unsigned char *) (addr + offset);
vector unsigned char align_perm = vec_lvsl(offset, addr);
return (vector unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
}
#endif /* !HAVE_VSX */
#define DEFCSP420_CVT(name, out_pixels) \
static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
int *instrides, int srcSliceY, int srcSliceH, \
unsigned char **oplanes, int *outstrides) \
{ \
int w = c->srcW; \
int h = srcSliceH; \
int i, j; \
int instrides_scl[3]; \
vector unsigned char y0, y1; \
\
vector signed char u, v; \
\
vector signed short Y0, Y1, Y2, Y3; \
vector signed short U, V; \
vector signed short vx, ux, uvx; \
vector signed short vx0, ux0, uvx0; \
vector signed short vx1, ux1, uvx1; \
vector signed short R0, G0, B0; \
vector signed short R1, G1, B1; \
vector unsigned char R, G, B; \
\
vector signed short lCY = c->CY; \
vector signed short lOY = c->OY; \
vector signed short lCRV = c->CRV; \
vector signed short lCBU = c->CBU; \
vector signed short lCGU = c->CGU; \
vector signed short lCGV = c->CGV; \
vector unsigned short lCSHIFT = c->CSHIFT; \
\
const ubyte *y1i = in[0]; \
const ubyte *y2i = in[0] + instrides[0]; \
const ubyte *ui = in[1]; \
const ubyte *vi = in[2]; \
\
vector unsigned char *oute, *outo; \
\
/* loop moves y{1, 2}i by w */ \
instrides_scl[0] = instrides[0] * 2 - w; \
/* loop moves ui by w / 2 */ \
instrides_scl[1] = instrides[1] - w / 2; \
/* loop moves vi by w / 2 */ \
instrides_scl[2] = instrides[2] - w / 2; \
\
for (i = 0; i < h / 2; i++) { \
oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
(srcSliceY + i * 2)); \
outo = oute + (outstrides[0] >> 4); \
vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
\
for (j = 0; j < w / 16; j++) { \
y0 = vec_xl(0, y1i); \
\
y1 = vec_xl(0, y2i); \
\
u = (vector signed char) vec_xl(0, ui); \
\
v = (vector signed char) vec_xl(0, vi); \
\
u = (vector signed char) \
vec_sub(u, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
v = (vector signed char) \
vec_sub(v, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
\
U = vec_unpackh(u); \
V = vec_unpackh(v); \
\
Y0 = vec_unh(y0); \
Y1 = vec_unl(y0); \
Y2 = vec_unh(y1); \
Y3 = vec_unl(y1); \
\
Y0 = vec_mradds(Y0, lCY, lOY); \
Y1 = vec_mradds(Y1, lCY, lOY); \
Y2 = vec_mradds(Y2, lCY, lOY); \
Y3 = vec_mradds(Y3, lCY, lOY); \
\
/* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
ux = vec_sl(U, lCSHIFT); \
ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
ux0 = vec_mergeh(ux, ux); \
ux1 = vec_mergel(ux, ux); \
\
/* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
vx = vec_sl(V, lCSHIFT); \
vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
vx0 = vec_mergeh(vx, vx); \
vx1 = vec_mergel(vx, vx); \
\
/* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
uvx = vec_mradds(V, lCGV, uvx); \
uvx0 = vec_mergeh(uvx, uvx); \
uvx1 = vec_mergel(uvx, uvx); \
\
R0 = vec_add(Y0, vx0); \
G0 = vec_add(Y0, uvx0); \
B0 = vec_add(Y0, ux0); \
R1 = vec_add(Y1, vx1); \
G1 = vec_add(Y1, uvx1); \
B1 = vec_add(Y1, ux1); \
\
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
out_pixels(R, G, B, oute); \
\
R0 = vec_add(Y2, vx0); \
G0 = vec_add(Y2, uvx0); \
B0 = vec_add(Y2, ux0); \
R1 = vec_add(Y3, vx1); \
G1 = vec_add(Y3, uvx1); \
B1 = vec_add(Y3, ux1); \
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
\
out_pixels(R, G, B, outo); \
\
y1i += 16; \
y2i += 16; \
ui += 8; \
vi += 8; \
} \
\
ui += instrides_scl[1]; \
vi += instrides_scl[2]; \
y1i += instrides_scl[0]; \
y2i += instrides_scl[0]; \
} \
return srcSliceH; \
}
#define out_abgr(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), ptr)
#define out_rgba(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), ptr)
#define out_argb(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
DEFCSP420_CVT(yuv2_abgr, out_abgr)
DEFCSP420_CVT(yuv2_bgra, out_bgra)
DEFCSP420_CVT(yuv2_rgba, out_rgba)
DEFCSP420_CVT(yuv2_argb, out_argb)
DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static const vector unsigned char
demux_u = { 0x10, 0x00, 0x10, 0x00,
0x10, 0x04, 0x10, 0x04,
0x10, 0x08, 0x10, 0x08,
0x10, 0x0c, 0x10, 0x0c },
demux_v = { 0x10, 0x02, 0x10, 0x02,
0x10, 0x06, 0x10, 0x06,
0x10, 0x0A, 0x10, 0x0A,
0x10, 0x0E, 0x10, 0x0E },
demux_y = { 0x10, 0x01, 0x10, 0x03,
0x10, 0x05, 0x10, 0x07,
0x10, 0x09, 0x10, 0x0B,
0x10, 0x0D, 0x10, 0x0F };
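/*
 * Scalar view of the tables above (illustrative): index 0x10 selects the
 * zero byte of vec_perm()'s second operand, so each 16-bit lane becomes
 * 0x00NN. For input bytes u0 y0 v0 y1 u1 y2 v1 y3 ... this yields
 *   demux_u -> u0 u0 u1 u1 u2 u2 u3 u3
 *   demux_v -> v0 v0 v1 v1 v2 v2 v3 v3
 *   demux_y -> y0 y1 y2 y3 y4 y5 y6 y7
 * i.e. chroma is duplicated horizontally while luma stays per-pixel.
 */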
/*
 * This is so I can play live CCIR raw video.
 */
static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
int *instrides, int srcSliceY, int srcSliceH,
unsigned char **oplanes, int *outstrides)
{
int w = c->srcW;
int h = srcSliceH;
int i, j;
vector unsigned char uyvy;
vector signed short Y, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out;
const ubyte *img;
img = in[0];
out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
for (i = 0; i < h; i++)
for (j = 0; j < w / 16; j++) {
uyvy = vec_ld(0, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
uyvy = vec_ld(16, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
// vec_mstbgr24 (R,G,B, out);
out_rgba(R, G, B, out);
img += 32;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
/* Currently the acceleration routines only support
 * inputs whose width is a multiple of 16
 * and whose height is a multiple of 2,
 *
 * so we just fall back to the C code for everything else.
 */
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return NULL;
/*
 * This seems not to matter too much in practice: I tried a bunch of
 * videos with abnormal widths and MPlayer crashed elsewhere anyway, e.g.
 *     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
 * goes boom with an X11 bad match.
 */
if ((c->srcW & 0xf) != 0)
return NULL;
switch (c->srcFormat) {
case AV_PIX_FMT_YUV410P:
case AV_PIX_FMT_YUV420P:
/*case IMGFMT_CLPL: ??? */
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_NV12:
case AV_PIX_FMT_NV21:
if ((c->srcH & 0x1) != 0)
return NULL;
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
return altivec_yuv2_rgb24;
case AV_PIX_FMT_BGR24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
return altivec_yuv2_bgr24;
case AV_PIX_FMT_ARGB:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
return altivec_yuv2_argb;
case AV_PIX_FMT_ABGR:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
return altivec_yuv2_abgr;
case AV_PIX_FMT_RGBA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
return altivec_yuv2_rgba;
case AV_PIX_FMT_BGRA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
return altivec_yuv2_bgra;
default: return NULL;
}
break;
case AV_PIX_FMT_UYVY422:
switch (c->dstFormat) {
case AV_PIX_FMT_BGR32:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
return altivec_uyvy_rgb32;
default: return NULL;
}
break;
}
#endif /* HAVE_ALTIVEC */
return NULL;
}
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c,
const int inv_table[4],
int brightness,
int contrast,
int saturation)
{
#if HAVE_ALTIVEC
union {
DECLARE_ALIGNED(16, signed short, tmp)[8];
vector signed short vec;
} buf;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
buf.tmp[1] = -256 * brightness; // oy
buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
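/* Worked example with the default parameters (brightness = 0,
 * contrast = saturation = 1 << 16): cy = ((0xffff * 65536) >> 8) >> 9
 * = 32767, i.e. ~1.0 in the Q15 format consumed by vec_mradds(), and
 * oy = 0. */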
c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
c->CY = vec_splat((vector signed short) buf.vec, 0);
c->OY = vec_splat((vector signed short) buf.vec, 1);
c->CRV = vec_splat((vector signed short) buf.vec, 2);
c->CBU = vec_splat((vector signed short) buf.vec, 3);
c->CGU = vec_splat((vector signed short) buf.vec, 4);
c->CGV = vec_splat((vector signed short) buf.vec, 5);
return;
#endif /* HAVE_ALTIVEC */
}
#if HAVE_ALTIVEC
static av_always_inline void yuv2packedX_altivec(SwsContext *c,
const int16_t *lumFilter,
const int16_t **lumSrc,
int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize,
const int16_t **alpSrc,
uint8_t *dest,
int dstW, int dstY,
enum AVPixelFormat target)
{
int i, j;
vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out, *nout;
vector signed short RND = vec_splat_s16(1 << 3);
vector unsigned short SCL = vec_splat_u16(4);
DECLARE_ALIGNED(16, unsigned int, scratch)[16];
vector signed short *YCoeffs, *CCoeffs;
YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
out = (vector unsigned char *) dest;
for (i = 0; i < dstW; i += 16) {
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, out);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, out);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, out);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, out);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, out);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, out);
break;
default:
{
/* If this is reached, the caller should have called yuv2packedXinC
* instead. */
static int printed_error_message;
if (!printed_error_message) {
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
printed_error_message = 1;
}
return;
}
}
}
if (i < dstW) {
i -= 16;
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
nout = (vector unsigned char *) scratch;
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, nout);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, nout);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, nout);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, nout);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, nout);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, nout);
break;
default:
/* Unreachable, I think. */
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
return;
}
/* copy only the leftover pixels; 4 bytes per pixel for the 32-bit formats */
memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) * 4);
}
}
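/*
 * Tail handling above: if dstW were not a multiple of 16, the final block
 * steps i back to the last full vector, recomputes it into the aligned
 * scratch buffer, and copies only the leftover pixels into dest, so the
 * 16-pixel vector stores never write past the end of the destination.
 */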
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, int dstW, int dstY) \
{ \
yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, \
chrFilterSize, alpSrc, \
dest, dstW, dstY, pixfmt); \
}
YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
#endif /* HAVE_ALTIVEC */

51
externals/ffmpeg/ffmpeg/libswscale/ppc/yuv2rgb_altivec.h vendored Executable file

@@ -0,0 +1,51 @@
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#include <stdint.h>
#include "libswscale/swscale_internal.h"
#define YUV2PACKEDX_HEADER(suffix) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, \
int dstW, int dstY);
YUV2PACKEDX_HEADER(abgr);
YUV2PACKEDX_HEADER(bgra);
YUV2PACKEDX_HEADER(argb);
YUV2PACKEDX_HEADER(rgba);
YUV2PACKEDX_HEADER(rgb24);
YUV2PACKEDX_HEADER(bgr24);
#endif /* SWSCALE_PPC_YUV2RGB_ALTIVEC_H */

204
externals/ffmpeg/ffmpeg/libswscale/ppc/yuv2yuv_altivec.c vendored Executable file

@@ -0,0 +1,204 @@
/*
* AltiVec-enhanced yuv-to-yuv conversion routines.
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_ALTIVEC
static int yv12toyuy2_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const vector unsigned char yperm = vec_lvsl(0, ysrc);
const int vertLumPerChroma = 2;
register unsigned int y;
/* This code assumes:
 *
 * 1) dst is 16-byte aligned
 * 2) dstStride is a multiple of 16
 * 3) width is a multiple of 16
 * 4) lum & chrom strides are multiples of 8
 */
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
vec_st(v_yuy2_2, (i << 1) + 32, dst);
vec_st(v_yuy2_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
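/*
 * The vec_mergeh/vec_mergel pairs above implement the byte interleave in
 * two stages: first U and V are zipped into UV pairs (u0 v0 u1 v1 ...),
 * then Y is zipped with the UV stream, so each iteration turns 32 luma
 * bytes plus 16+16 chroma bytes into 64 bytes of Y U Y V (YUY2 order).
 */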
static int yv12touyvy_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const int vertLumPerChroma = 2;
const vector unsigned char yperm = vec_lvsl(0, ysrc);
register unsigned int y;
/* This code assumes:
 *
 * 1) dst is 16-byte aligned
 * 2) dstStride is a multiple of 16
 * 3) width is a multiple of 16
 * 4) lum & chrom strides are multiples of 8
 */
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
vec_st(v_uyvy_2, (i << 1) + 32, dst);
vec_st(v_uyvy_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_get_unscaled_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
if (!(c->srcW & 15) && !(c->flags & SWS_BITEXACT) &&
c->srcFormat == AV_PIX_FMT_YUV420P) {
enum AVPixelFormat dstFormat = c->dstFormat;
// unscaled YV12 -> packed YUV, we want speed
if (dstFormat == AV_PIX_FMT_YUYV422)
c->swscale = yv12toyuy2_unscaled_altivec;
else if (dstFormat == AV_PIX_FMT_UYVY422)
c->swscale = yv12touyvy_unscaled_altivec;
}
#endif /* HAVE_ALTIVEC */
}

418
externals/ffmpeg/ffmpeg/libswscale/rgb2rgb.c vendored Executable file

@@ -0,0 +1,418 @@
/*
* software RGB to RGB converter
* plus a software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "libavutil/attributes.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_1230)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_3012)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_3210)(const uint8_t *src, uint8_t *dst, int src_size);
void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
void (*deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
#define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ((int)( 0.504 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV ((int)(-0.368 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU ((int)(-0.291 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ((int)( 0.257 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU ((int)(-0.148 * (1 << RGB2YUV_SHIFT) + 0.5))
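/* Worked example: assuming RGB2YUV_SHIFT == 15 (its value in
 * swscale_internal.h), RY = (int)(0.257 * 32768 + 0.5) = 8421, so a
 * pure-red pixel contributes (8421 * 255) >> 15 = 65 to Y before the
 * +16 bias is added. */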
//plain C versions
#include "rgb2rgb_template.c"
/*
 * RGB15->RGB16: original by Strepto/Astral,
 * ported to gcc & bugfixed: A'rpi,
 * MMXEXT, 3DNOW optimization by Nick Kurshev,
 * 32-bit C version, and the and&add trick, by Michael Niedermayer
 */
av_cold void ff_sws_rgb2rgb_init(void)
{
rgb2rgb_init_c();
if (ARCH_AARCH64)
rgb2rgb_init_aarch64();
if (ARCH_X86)
rgb2rgb_init_x86();
}
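/* Minimal usage sketch (illustrative only; the buffers below are made up
 * and this helper is not part of FFmpeg): initialize the dispatch table
 * once, then call any converter through the function pointers above. */
static inline void rgb2rgb_usage_sketch(void)
{
    uint8_t rgb[12] = { 0 }; /* 4 RGB24 pixels in */
    uint8_t bgr[16];         /* 4 BGR32 pixels out */

    ff_sws_rgb2rgb_init();   /* picks the C or SIMD implementations */
    rgb24tobgr32(rgb, bgr, sizeof(rgb));
    (void)bgr;
}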
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 2;
for (i = 0; i < num_pixels; i++) {
#if HAVE_BIGENDIAN
/* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
dst[3 * i + 0] = src[4 * i + 1];
dst[3 * i + 1] = src[4 * i + 2];
dst[3 * i + 2] = src[4 * i + 3];
#else
dst[3 * i + 0] = src[4 * i + 2];
dst[3 * i + 1] = src[4 * i + 1];
dst[3 * i + 2] = src[4 * i + 0];
#endif
}
}
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size)
{
int i;
for (i = 0; 3 * i < src_size; i++) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> BGR32 (= A, R, G, B) */
dst[4 * i + 0] = 255;
dst[4 * i + 1] = src[3 * i + 0];
dst[4 * i + 2] = src[3 * i + 1];
dst[4 * i + 3] = src[3 * i + 2];
#else
dst[4 * i + 0] = src[3 * i + 2];
dst[4 * i + 1] = src[3 * i + 1];
dst[4 * i + 2] = src[3 * i + 0];
dst[4 * i + 3] = 255;
#endif
}
}
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
#else
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
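/*
 * The ((bgr & mask) << a) | ((bgr & mask) >> b) pattern above replicates
 * a channel's top bits into its low bits so 5- and 6-bit channels expand
 * to the full 8-bit range, e.g. blue = 0x1F becomes
 * (0x1F << 3) | (0x1F >> 2) = 0xF8 | 0x07 = 0xFF rather than just 0xF8.
 */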
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t rgb, r, g, b;
uint16_t *d = (uint16_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
rgb = *s++;
r = rgb & 0xF00;
g = rgb & 0x0F0;
b = rgb & 0x00F;
r = (r << 3) | ((r & 0x800) >> 1);
g = (g << 2) | ((g & 0x080) >> 2);
b = (b << 1) | ( b >> 3);
*d++ = r | g | b;
}
}
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | (rgb & 0x7E0) | (rgb << 11);
}
}
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | ((rgb & 0x7C0) >> 1) | ((rgb & 0x1F) << 10);
}
}
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
#else
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = ((rgb & 0x7C00) >> 10) | ((rgb & 0x3E0) << 1) | (rgb << 11);
}
}
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
unsigned br = rgb & 0x7C1F;
((uint16_t *)dst)[i] = (br >> 10) | (rgb & 0x3E0) | (br << 10);
}
}
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
uint16_t *s = (uint16_t *)src;
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = s[i];
d[i] = (rgb << 8 | rgb & 0xF0 | rgb >> 8) & 0xFFF;
}
}
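/* Worked example for the nibble swap above: rgb = 0x123 (R=1, G=2, B=3)
 * gives ((0x123 << 8) | (0x123 & 0xF0) | (0x123 >> 8)) & 0xFFF = 0x321. */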
#define DEFINE_RGB48TOBGR48(need_bswap, swap) \
void rgb48tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 1; \
\
for (i = 0; i < num_pixels; i += 3) { \
d[i ] = swap ? av_bswap16(s[i + 2]) : s[i + 2]; \
d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1]; \
d[i + 2] = swap ? av_bswap16(s[i ]) : s[i ]; \
} \
}
DEFINE_RGB48TOBGR48(nobswap, 0)
DEFINE_RGB48TOBGR48(bswap, 1)
#define DEFINE_RGB64TOBGR48(need_bswap, swap) \
void rgb64tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
} \
}
DEFINE_RGB64TOBGR48(nobswap, 0)
DEFINE_RGB64TOBGR48(bswap, 1)
#define DEFINE_RGB64TO48(need_bswap, swap) \
void rgb64to48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
} \
}
DEFINE_RGB64TO48(nobswap, 0)
DEFINE_RGB64TO48(bswap, 1)
#define DEFINE_RGB48TOBGR64(need_bswap, swap) \
void rgb48tobgr64_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size / 6; \
\
for (i = 0; i < num_pixels; i++) { \
d[4 * i ] = swap ? av_bswap16(s[3 * i + 2]) : s[3 * i + 2]; \
d[4 * i + 1] = swap ? av_bswap16(s[3 * i + 1]) : s[3 * i + 1]; \
d[4 * i + 2] = swap ? av_bswap16(s[3 * i ]) : s[3 * i ]; \
d[4 * i + 3] = 0xFFFF; \
} \
}
DEFINE_RGB48TOBGR64(nobswap, 0)
DEFINE_RGB48TOBGR64(bswap, 1)
#define DEFINE_RGB48TO64(need_bswap, swap) \
void rgb48to64_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size / 6; \
\
for (i = 0; i < num_pixels; i++) { \
d[4 * i ] = swap ? av_bswap16(s[3 * i ]) : s[3 * i ]; \
d[4 * i + 1] = swap ? av_bswap16(s[3 * i + 1]) : s[3 * i + 1]; \
d[4 * i + 2] = swap ? av_bswap16(s[3 * i + 2]) : s[3 * i + 2]; \
d[4 * i + 3] = 0xFFFF; \
} \
}
DEFINE_RGB48TO64(nobswap, 0)
DEFINE_RGB48TO64(bswap, 1)

175
externals/ffmpeg/ffmpeg/libswscale/rgb2rgb.h vendored Executable file

@@ -0,0 +1,175 @@
/*
* software RGB to RGB converter
* plus a software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_RGB2RGB_H
#define SWSCALE_RGB2RGB_H
#include <inttypes.h>
#include "libavutil/avutil.h"
#include "swscale.h"
/* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_1230)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_3012)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_3210)(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr64_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr64_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48to64_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48to64_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write high quality version.
*/
extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
extern void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
extern void (*deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
extern void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
void ff_sws_rgb2rgb_init(void);
void rgb2rgb_init_aarch64(void);
void rgb2rgb_init_x86(void);
#endif /* SWSCALE_RGB2RGB_H */

992
externals/ffmpeg/ffmpeg/libswscale/rgb2rgb_template.c vendored Executable file

@@ -0,0 +1,992 @@
/*
* software RGB to RGB converter
* plus a software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "libavutil/attributes.h"
static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> RGB32 (= A, B, G, R) */
*dest++ = 255;
*dest++ = s[2];
*dest++ = s[1];
*dest++ = s[0];
s += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 255;
#endif
}
}
static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB32 (= A, B, G, R) -> RGB24 (= R, G, B) */
s++;
dest[2] = *s++;
dest[1] = *s++;
dest[0] = *s++;
dest += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
#endif
}
}
/*
 * original by Strepto/Astral
 * ported to gcc & bugfixed: A'rpi
 * MMXEXT, 3DNOW optimization by Nick Kurshev
 * 32-bit C version, and the and&add trick, by Michael Niedermayer
 */
static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register unsigned x = *((const uint32_t *)s);
*((uint32_t *)d) = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
d += 4;
s += 4;
}
if (s < end) {
register unsigned short x = *((const uint16_t *)s);
*((uint16_t *)d) = (x & 0x7FFF) + (x & 0x7FE0);
}
}
static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register uint32_t x = *((const uint32_t *)s);
*((uint32_t *)d) = ((x >> 1) & 0x7FE07FE0) | (x & 0x001F001F);
s += 4;
d += 4;
}
if (s < end) {
register uint16_t x = *((const uint16_t *)s);
*((uint16_t *)d) = ((x >> 1) & 0x7FE0) | (x & 0x001F);
}
}
static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 8);
}
}
static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 8) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 9);
}
}
static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 7) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
#endif
}
}
static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
#endif
}
}
static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
for (; idx < 15; idx += 4) {
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
v &= 0xff00ff;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
static inline void shuffle_bytes_0321_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
for (; idx < 15; idx += 4) {
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0x00ff00ff;
v &= 0xff00ff00;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
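/*
 * Note on the two shuffle loops above: starting idx at 15 - src_size and
 * biasing both pointers by -idx makes each loop run ceil(src_size / 4)
 * times against the fixed bound of 15; e.g. src_size = 16 iterates with
 * idx = -1, 3, 7, 11 and touches exactly bytes 0..15.
 */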
#define DEFINE_SHUFFLE_BYTES(name, a, b, c, d) \
static void shuffle_bytes_##name (const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
int i; \
\
for (i = 0; i < src_size; i += 4) { \
dst[i + 0] = src[i + a]; \
dst[i + 1] = src[i + b]; \
dst[i + 2] = src[i + c]; \
dst[i + 3] = src[i + d]; \
} \
}
DEFINE_SHUFFLE_BYTES(1230_c, 1, 2, 3, 0)
DEFINE_SHUFFLE_BYTES(3012_c, 3, 0, 1, 2)
DEFINE_SHUFFLE_BYTES(3210_c, 3, 2, 1, 0)
static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
{
unsigned i;
for (i = 0; i < src_size; i += 3) {
register uint8_t x = src[i + 2];
dst[i + 1] = src[i + 1];
dst[i + 2] = src[i + 0];
dst[i + 0] = x;
}
}
static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + ((unsigned) vc[0] << 24);
uint64_t l = yc[2] + (uc[1] << 8) +
(yc[3] << 16) + ((unsigned) vc[1] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (yc[0] << 24) + (uc[0] << 16) +
(yc[1] << 8) + (vc[0] << 0);
#else
*idst++ = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (vc[0] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
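/*
 * Worked example of the packing above (little-endian path): with
 * yc = {Y0, Y1} and uc[0] = U0, vc[0] = V0, the 32-bit word is
 * Y0 | (U0 << 8) | (Y1 << 16) | (V0 << 24), which lands in memory as the
 * byte sequence Y0 U0 Y1 V0, i.e. one YUY2 macropixel.
 */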
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12toyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + ((unsigned) yc[1] << 24);
uint64_t l = uc[1] + (yc[2] << 8) +
(vc[1] << 16) + ((unsigned) yc[3] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (uc[0] << 24) + (yc[0] << 16) +
(vc[0] << 8) + (yc[1] << 0);
#else
*idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12touyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yuy2toyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
udst[i] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 2];
vdst[i] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
ydst[2 * i + 1] = src[4 * i + 2];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
static inline void planar2x_c(const uint8_t *src, uint8_t *dst, int srcWidth,
int srcHeight, int srcStride, int dstStride)
{
int x, y;
dst[0] = src[0];
// first line
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (3 * src[x] + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
dst += dstStride;
for (y = 1; y < srcHeight; y++) {
const int mmxSize = 1;
dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
for (x = mmxSize - 1; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x + 0] * 3 + src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 2] = (src[x + 0] + 3 * src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 1] = (src[x + 1] + 3 * src[x + srcStride]) >> 2;
dst[2 * x + 2] = (src[x + 1] * 3 + src[x + srcStride]) >> 2;
}
dst[srcWidth * 2 - 1] = (src[srcWidth - 1] * 3 + src[srcWidth - 1 + srcStride]) >> 2;
dst[srcWidth * 2 - 1 + dstStride] = (src[srcWidth - 1] + 3 * src[srcWidth - 1 + srcStride]) >> 2;
dst += dstStride * 2;
src += srcStride;
}
// last line
dst[0] = src[0];
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x] * 3 + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
}
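/* Illustrative note: planar2x_c doubles a plane in both directions with
 * fixed 3:1 bilinear weights, i.e. the output sample nearer to a is
 * (3 * a + b) >> 2; for a = 100, b = 200 that gives 125. */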
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
udst[i] = src[4 * i + 0];
ydst[2 * i + 0] = src[4 * i + 1];
vdst[i] = src[4 * i + 2];
ydst[2 * i + 1] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 3];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv)
{
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
udst[i] = U;
vdst[i] = V;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
ydst += lumStride;
src += srcStride;
if (y+1 == height)
break;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
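/* Illustrative sketch (not part of the library): the fixed-point luma step
 * used above, with hypothetical Q15 coefficients derived from BT.601
 * (0.299/0.587/0.114 scaled by 219/255). White input lands near the top of
 * the limited range. */
#if 0
static void example_rgb_to_luma(void)
{
    const int ry = 8414, gy = 16519, by = 3208; /* assumed Q15 values */
    unsigned r = 255, g = 255, b = 255;
    unsigned Y = ((ry * r + gy * g + by * b) >> 15) + 16; /* == 234 here */
    (void)Y;
}
#endif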
static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride)
{
int h;
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dest[2 * w + 0] = src1[w];
dest[2 * w + 1] = src2[w];
}
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
}
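/* Illustrative note: interleaveBytes_c is what builds semi-planar chroma,
 * e.g. producing NV12's interleaved UV plane from separate U (src1) and
 * V (src2) planes, one U/V byte pair per output position. */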
static void deinterleaveBytes_c(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride)
{
int h;
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dst1[w] = src[2 * w + 0];
dst2[w] = src[2 * w + 1];
}
src += srcStride;
dst1 += dst1Stride;
dst2 += dst2Stride;
}
}
static inline void vu9_to_vu12_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2)
{
int x, y;
int w = width / 2;
int h = height / 2;
for (y = 0; y < h; y++) {
const uint8_t *s1 = src1 + srcStride1 * (y >> 1);
uint8_t *d = dst1 + dstStride1 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s1[x];
}
for (y = 0; y < h; y++) {
const uint8_t *s2 = src2 + srcStride2 * (y >> 1);
uint8_t *d = dst2 + dstStride2 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s2[x];
}
}
static inline void yvu9_to_yuy2_c(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride)
{
int x, y;
int w = width / 2;
int h = height;
for (y = 0; y < h; y++) {
const uint8_t *yp = src1 + srcStride1 * y;
const uint8_t *up = src2 + srcStride2 * (y >> 2);
const uint8_t *vp = src3 + srcStride3 * (y >> 2);
uint8_t *d = dst + dstStride * y;
for (x = 0; x < w; x++) {
const int x2 = x << 2;
d[8 * x + 0] = yp[x2];
d[8 * x + 1] = up[x];
d[8 * x + 2] = yp[x2 + 1];
d[8 * x + 3] = vp[x];
d[8 * x + 4] = yp[x2 + 2];
d[8 * x + 5] = up[x];
d[8 * x + 6] = yp[x2 + 3];
d[8 * x + 7] = vp[x];
}
}
}
static void extract_even_c(const uint8_t *src, uint8_t *dst, int count)
{
dst += count;
src += count * 2;
count = -count;
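/* Index from the end with negative offsets so the loop condition is a
 * simple sign test and a single counter advances all pointers. */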
while (count < 0) {
dst[count] = src[2 * count];
count++;
}
}
static void extract_even2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_even2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void extract_odd2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
src++;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_odd2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
src0++;
src1++;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void yuyvtoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = AV_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
if (y & 1) {
extract_odd2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void yuyvtoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = AV_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
extract_odd2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static void uyvytoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = AV_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
if (y & 1) {
extract_even2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void uyvytoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = AV_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
extract_even2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static av_cold void rgb2rgb_init_c(void)
{
rgb15to16 = rgb15to16_c;
rgb15tobgr24 = rgb15tobgr24_c;
rgb15to32 = rgb15to32_c;
rgb16tobgr24 = rgb16tobgr24_c;
rgb16to32 = rgb16to32_c;
rgb16to15 = rgb16to15_c;
rgb24tobgr16 = rgb24tobgr16_c;
rgb24tobgr15 = rgb24tobgr15_c;
rgb24tobgr32 = rgb24tobgr32_c;
rgb32to16 = rgb32to16_c;
rgb32to15 = rgb32to15_c;
rgb32tobgr24 = rgb32tobgr24_c;
rgb24to15 = rgb24to15_c;
rgb24to16 = rgb24to16_c;
rgb24tobgr24 = rgb24tobgr24_c;
#if HAVE_BIGENDIAN
shuffle_bytes_0321 = shuffle_bytes_2103_c;
shuffle_bytes_2103 = shuffle_bytes_0321_c;
#else
shuffle_bytes_0321 = shuffle_bytes_0321_c;
shuffle_bytes_2103 = shuffle_bytes_2103_c;
#endif
shuffle_bytes_1230 = shuffle_bytes_1230_c;
shuffle_bytes_3012 = shuffle_bytes_3012_c;
shuffle_bytes_3210 = shuffle_bytes_3210_c;
rgb32tobgr16 = rgb32tobgr16_c;
rgb32tobgr15 = rgb32tobgr15_c;
yv12toyuy2 = yv12toyuy2_c;
yv12touyvy = yv12touyvy_c;
yuv422ptoyuy2 = yuv422ptoyuy2_c;
yuv422ptouyvy = yuv422ptouyvy_c;
yuy2toyv12 = yuy2toyv12_c;
planar2x = planar2x_c;
ff_rgb24toyv12 = ff_rgb24toyv12_c;
interleaveBytes = interleaveBytes_c;
deinterleaveBytes = deinterleaveBytes_c;
vu9_to_vu12 = vu9_to_vu12_c;
yvu9_to_yuy2 = yvu9_to_yuy2_c;
uyvytoyuv420 = uyvytoyuv420_c;
uyvytoyuv422 = uyvytoyuv422_c;
yuyvtoyuv420 = yuyvtoyuv420_c;
yuyvtoyuv422 = yuyvtoyuv422_c;
}

394
externals/ffmpeg/ffmpeg/libswscale/slice.c vendored Executable file
View File

@@ -0,0 +1,394 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
static void free_lines(SwsSlice *s)
{
int i;
for (i = 0; i < 2; ++i) {
int n = s->plane[i].available_lines;
int j;
for (j = 0; j < n; ++j) {
av_freep(&s->plane[i].line[j]);
if (s->is_ring)
s->plane[i].line[j+n] = NULL;
}
}
for (i = 0; i < 4; ++i)
memset(s->plane[i].line, 0, sizeof(uint8_t*) * s->plane[i].available_lines * (s->is_ring ? 3 : 1));
s->should_free_lines = 0;
}
/*
 Slice lines contain extra bytes for vector code, so @size is the allocated
 memory size and @width is the number of pixels.
 */
static int alloc_lines(SwsSlice *s, int size, int width)
{
int i;
int idx[2] = {3, 2};
s->should_free_lines = 1;
s->width = width;
for (i = 0; i < 2; ++i) {
int n = s->plane[i].available_lines;
int j;
int ii = idx[i];
av_assert0(n == s->plane[ii].available_lines);
for (j = 0; j < n; ++j) {
// chroma plane lines U and V are expected to be contiguous in memory
// by the MMX vertical scaler code
s->plane[i].line[j] = av_malloc(size * 2 + 32);
if (!s->plane[i].line[j]) {
free_lines(s);
return AVERROR(ENOMEM);
}
s->plane[ii].line[j] = s->plane[i].line[j] + size + 16;
if (s->is_ring) {
s->plane[i].line[j+n] = s->plane[i].line[j];
s->plane[ii].line[j+n] = s->plane[ii].line[j];
}
}
}
return 0;
}
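/* Illustrative note: for ring slices the second half of the pointer array
 * aliases the first (line[j + n] == line[j]), so consumers can walk a
 * contiguous window of n line pointers without wrapping the index. */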
static int alloc_slice(SwsSlice *s, enum AVPixelFormat fmt, int lumLines, int chrLines, int h_sub_sample, int v_sub_sample, int ring)
{
int i;
int size[4] = { lumLines,
chrLines,
chrLines,
lumLines };
s->h_chr_sub_sample = h_sub_sample;
s->v_chr_sub_sample = v_sub_sample;
s->fmt = fmt;
s->is_ring = ring;
s->should_free_lines = 0;
for (i = 0; i < 4; ++i) {
int n = size[i] * ( ring == 0 ? 1 : 3);
s->plane[i].line = av_mallocz_array(sizeof(uint8_t*), n);
if (!s->plane[i].line)
return AVERROR(ENOMEM);
s->plane[i].tmp = ring ? s->plane[i].line + size[i] * 2 : NULL;
s->plane[i].available_lines = size[i];
s->plane[i].sliceY = 0;
s->plane[i].sliceH = 0;
}
return 0;
}
static void free_slice(SwsSlice *s)
{
int i;
if (s) {
if (s->should_free_lines)
free_lines(s);
for (i = 0; i < 4; ++i) {
av_freep(&s->plane[i].line);
s->plane[i].tmp = NULL;
}
}
}
int ff_rotate_slice(SwsSlice *s, int lum, int chr)
{
int i;
if (lum) {
for (i = 0; i < 4; i+=3) {
int n = s->plane[i].available_lines;
int l = lum - s->plane[i].sliceY;
if (l >= n * 2) {
s->plane[i].sliceY += n;
s->plane[i].sliceH -= n;
}
}
}
if (chr) {
for (i = 1; i < 3; ++i) {
int n = s->plane[i].available_lines;
int l = chr - s->plane[i].sliceY;
if (l >= n * 2) {
s->plane[i].sliceY += n;
s->plane[i].sliceH -= n;
}
}
}
return 0;
}
int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH, int relative)
{
int i = 0;
const int start[4] = {lumY,
chrY,
chrY,
lumY};
const int end[4] = {lumY +lumH,
chrY + chrH,
chrY + chrH,
lumY + lumH};
s->width = srcW;
for (i = 0; i < 4 && src[i] != NULL; ++i) {
uint8_t *const src_i = src[i] + (relative ? 0 : start[i]) * stride[i];
int j;
int first = s->plane[i].sliceY;
int n = s->plane[i].available_lines;
int lines = end[i] - start[i];
int tot_lines = end[i] - first;
if (start[i] >= first && n >= tot_lines) {
s->plane[i].sliceH = FFMAX(tot_lines, s->plane[i].sliceH);
for (j = 0; j < lines; j+= 1)
s->plane[i].line[start[i] - first + j] = src_i + j * stride[i];
} else {
s->plane[i].sliceY = start[i];
lines = lines > n ? n : lines;
s->plane[i].sliceH = lines;
for (j = 0; j < lines; j+= 1)
s->plane[i].line[j] = src_i + j * stride[i];
}
}
return 0;
}
static void fill_ones(SwsSlice *s, int n, int bpc)
{
int i, j, k, size, end;
for (i = 0; i < 4; ++i) {
size = s->plane[i].available_lines;
for (j = 0; j < size; ++j) {
if (bpc == 16) {
end = (n>>1) + 1;
for (k = 0; k < end; ++k)
((int32_t*)(s->plane[i].line[j]))[k] = 1<<18;
} else if (bpc == 32) {
end = (n>>2) + 1;
for (k = 0; k < end; ++k)
((int64_t*)(s->plane[i].line[j]))[k] = 1LL<<34;
} else {
end = n + 1;
for (k = 0; k < end; ++k)
((int16_t*)(s->plane[i].line[j]))[k] = 1<<14;
}
}
}
}
/*
 Calculate the minimum ring buffer size; it must be able to store vFilterSize
 plus n more lines, where n is the maximum difference between adjacent slices
 that each output a line.
 The extra n lines are needed only when there are not enough source lines to
 output a single destination line; in that case we buffer those lines and
 process them on the next call to scale().
 */
static void get_min_buffer_size(SwsContext *c, int *out_lum_size, int *out_chr_size)
{
int lumY;
int dstH = c->dstH;
int chrDstH = c->chrDstH;
int *lumFilterPos = c->vLumFilterPos;
int *chrFilterPos = c->vChrFilterPos;
int lumFilterSize = c->vLumFilterSize;
int chrFilterSize = c->vChrFilterSize;
int chrSubSample = c->chrSrcVSubSample;
*out_lum_size = lumFilterSize;
*out_chr_size = chrFilterSize;
for (lumY = 0; lumY < dstH; lumY++) {
int chrY = (int64_t)lumY * chrDstH / dstH;
int nextSlice = FFMAX(lumFilterPos[lumY] + lumFilterSize - 1,
((chrFilterPos[chrY] + chrFilterSize - 1)
<< chrSubSample));
nextSlice >>= chrSubSample;
nextSlice <<= chrSubSample;
(*out_lum_size) = FFMAX((*out_lum_size), nextSlice - lumFilterPos[lumY]);
(*out_chr_size) = FFMAX((*out_chr_size), (nextSlice >> chrSubSample) - chrFilterPos[chrY]);
}
}
int ff_init_filters(SwsContext * c)
{
int i;
int index;
int num_ydesc;
int num_cdesc;
int num_vdesc = isPlanarYUV(c->dstFormat) && !isGray(c->dstFormat) ? 2 : 1;
int need_lum_conv = c->lumToYV12 || c->readLumPlanar || c->alpToYV12 || c->readAlpPlanar;
int need_chr_conv = c->chrToYV12 || c->readChrPlanar;
int need_gamma = c->is_internal_gamma;
int srcIdx, dstIdx;
int dst_stride = FFALIGN(c->dstW * sizeof(int16_t) + 66, 16);
uint32_t * pal = usePal(c->srcFormat) ? c->pal_yuv : (uint32_t*)c->input_rgb2yuv_table;
int res = 0;
int lumBufSize;
int chrBufSize;
get_min_buffer_size(c, &lumBufSize, &chrBufSize);
lumBufSize = FFMAX(lumBufSize, c->vLumFilterSize + MAX_LINES_AHEAD);
chrBufSize = FFMAX(chrBufSize, c->vChrFilterSize + MAX_LINES_AHEAD);
if (c->dstBpc == 16)
dst_stride <<= 1;
if (c->dstBpc == 32)
dst_stride <<= 2;
num_ydesc = need_lum_conv ? 2 : 1;
num_cdesc = need_chr_conv ? 2 : 1;
c->numSlice = FFMAX(num_ydesc, num_cdesc) + 2;
c->numDesc = num_ydesc + num_cdesc + num_vdesc + (need_gamma ? 2 : 0);
c->descIndex[0] = num_ydesc + (need_gamma ? 1 : 0);
c->descIndex[1] = num_ydesc + num_cdesc + (need_gamma ? 1 : 0);
c->desc = av_mallocz_array(sizeof(SwsFilterDescriptor), c->numDesc);
if (!c->desc)
return AVERROR(ENOMEM);
c->slice = av_mallocz_array(sizeof(SwsSlice), c->numSlice);
if (!c->slice) {
res = AVERROR(ENOMEM);
goto cleanup;
}
res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
if (res < 0) goto cleanup;
for (i = 1; i < c->numSlice-2; ++i) {
res = alloc_slice(&c->slice[i], c->srcFormat, lumBufSize, chrBufSize, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
if (res < 0) goto cleanup;
res = alloc_lines(&c->slice[i], FFALIGN(c->srcW*2+78, 16), c->srcW);
if (res < 0) goto cleanup;
}
// horizontal scaler output
res = alloc_slice(&c->slice[i], c->srcFormat, lumBufSize, chrBufSize, c->chrDstHSubSample, c->chrDstVSubSample, 1);
if (res < 0) goto cleanup;
res = alloc_lines(&c->slice[i], dst_stride, c->dstW);
if (res < 0) goto cleanup;
fill_ones(&c->slice[i], dst_stride>>1, c->dstBpc);
// vertical scaler output
++i;
res = alloc_slice(&c->slice[i], c->dstFormat, c->dstH, c->chrDstH, c->chrDstHSubSample, c->chrDstVSubSample, 0);
if (res < 0) goto cleanup;
index = 0;
srcIdx = 0;
dstIdx = 1;
if (need_gamma) {
res = ff_init_gamma_convert(c->desc + index, c->slice + srcIdx, c->inv_gamma);
if (res < 0) goto cleanup;
++index;
}
if (need_lum_conv) {
res = ff_init_desc_fmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
if (res < 0) goto cleanup;
c->desc[index].alpha = c->needAlpha;
++index;
srcIdx = dstIdx;
}
dstIdx = FFMAX(num_ydesc, num_cdesc);
res = ff_init_desc_hscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hLumFilter, c->hLumFilterPos, c->hLumFilterSize, c->lumXInc);
if (res < 0) goto cleanup;
c->desc[index].alpha = c->needAlpha;
++index;
{
srcIdx = 0;
dstIdx = 1;
if (need_chr_conv) {
res = ff_init_desc_cfmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
if (res < 0) goto cleanup;
++index;
srcIdx = dstIdx;
}
dstIdx = FFMAX(num_ydesc, num_cdesc);
if (c->needs_hcscale)
res = ff_init_desc_chscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hChrFilter, c->hChrFilterPos, c->hChrFilterSize, c->chrXInc);
else
res = ff_init_desc_no_chr(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx]);
if (res < 0) goto cleanup;
}
++index;
{
srcIdx = c->numSlice - 2;
dstIdx = c->numSlice - 1;
res = ff_init_vscale(c, c->desc + index, c->slice + srcIdx, c->slice + dstIdx);
if (res < 0) goto cleanup;
}
++index;
if (need_gamma) {
res = ff_init_gamma_convert(c->desc + index, c->slice + dstIdx, c->gamma);
if (res < 0) goto cleanup;
}
return 0;
cleanup:
ff_free_filters(c);
return res;
}
int ff_free_filters(SwsContext *c)
{
int i;
if (c->desc) {
for (i = 0; i < c->numDesc; ++i)
av_freep(&c->desc[i].instance);
av_freep(&c->desc);
}
if (c->slice) {
for (i = 0; i < c->numSlice; ++i)
free_slice(&c->slice[i]);
av_freep(&c->slice);
}
return 0;
}

985
externals/ffmpeg/ffmpeg/libswscale/swscale.c vendored Executable file
View File

@@ -0,0 +1,985 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/imgutils.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/mem_internal.h"
#include "libavutil/pixdesc.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale_internal.h"
#include "swscale.h"
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_128)[9][8] = {
{ 36, 68, 60, 92, 34, 66, 58, 90, },
{ 100, 4, 124, 28, 98, 2, 122, 26, },
{ 52, 84, 44, 76, 50, 82, 42, 74, },
{ 116, 20, 108, 12, 114, 18, 106, 10, },
{ 32, 64, 56, 88, 38, 70, 62, 94, },
{ 96, 0, 120, 24, 102, 6, 126, 30, },
{ 48, 80, 40, 72, 54, 86, 46, 78, },
{ 112, 16, 104, 8, 118, 22, 110, 14, },
{ 36, 68, 60, 92, 34, 66, 58, 90, },
};
DECLARE_ALIGNED(8, static const uint8_t, sws_pb_64)[8] = {
64, 64, 64, 64, 64, 64, 64, 64
};
static av_always_inline void fillPlane(uint8_t *plane, int stride, int width,
int height, int y, uint8_t val)
{
int i;
uint8_t *ptr = plane + stride * y;
for (i = 0; i < height; i++) {
memset(ptr, val, width);
ptr += stride;
}
}
static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int i;
int32_t *dst = (int32_t *) _dst;
const uint16_t *src = (const uint16_t *) _src;
int bits = desc->comp[0].depth - 1;
int sh = bits - 4;
if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
sh = 9;
} else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16 bpc */
sh = 16 - 1 - 4;
}
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
for (j = 0; j < filterSize; j++) {
val += src[srcPos + j] * filter[filterSize * i + j];
}
// filter is 14 bit, input up to 16 bit => product up to 30 bit; >> sh leaves at most 19 bit
dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
}
}
static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int i;
const uint16_t *src = (const uint16_t *) _src;
int sh = desc->comp[0].depth - 1;
if (sh<15) {
sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
} else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16 bpc */
sh = 16 - 1;
}
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
for (j = 0; j < filterSize; j++) {
val += src[srcPos + j] * filter[filterSize * i + j];
}
// filter is 14 bit, input up to 16 bit => product up to 30 bit; >> sh leaves at most 15 bit
dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
}
}
// bilinear / bicubic scaling
static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
int i;
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
for (j = 0; j < filterSize; j++) {
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
}
dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // cubic filters can overshoot the range, hence the clamp
}
}
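/* Illustrative sketch (not part of the library): each output sample is a
 * dot product of filterSize source bytes with Q14 coefficients, shifted
 * right by 7 to yield a 15-bit sample. With two taps of 8192 (0.5 in Q14)
 * the result is the truncated average scaled by 128. */
#if 0
static void example_hscale8to15(void)
{
    const uint8_t src[2] = { 100, 200 };
    const int16_t filter[2] = { 8192, 8192 };
    const int32_t pos[1] = { 0 };
    int16_t dst[1];
    hScale8To15_c(NULL, dst, 1, src, filter, pos, 2);
    /* dst[0] == ((100 + 200) * 8192) >> 7 == 150 << 7 */
}
#endif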
static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
int i;
int32_t *dst = (int32_t *) _dst;
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
for (j = 0; j < filterSize; j++) {
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
}
dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // cubic filters can overshoot the range, hence the clamp
}
}
// FIXME all pal and rgb srcFormats could do this conversion as well
// FIXME all scalers more complex than bilinear could do half of this transform
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
int i;
for (i = 0; i < width; i++) {
dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264
dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264
}
}
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
int i;
for (i = 0; i < width; i++) {
dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
}
}
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
int i;
for (i = 0; i < width; i++)
dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;
}
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
int i;
for (i = 0; i < width; i++)
dst[i] = (dst[i] * 14071 + 33561947) >> 14;
}
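/* Illustrative sketch (not part of the library): the constants above encode
 * scale 219/255 in Q14 (14071) and an offset of about 16 << 7 in Q14
 * (33561947), operating on 15-bit luma, i.e. 8-bit values shifted left 7. */
#if 0
#include <assert.h>
static void check_lum_range_from_jpeg(void)
{
    int16_t v[2] = { 255 << 7, 0 };
    lumRangeFromJpeg_c(v, 2);
    assert(v[0] >> 7 == 235 && v[1] >> 7 == 16); /* full range -> limited */
}
#endif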
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
int i;
int32_t *dstU = (int32_t *) _dstU;
int32_t *dstV = (int32_t *) _dstV;
for (i = 0; i < width; i++) {
dstU[i] = (FFMIN(dstU[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
dstV[i] = (FFMIN(dstV[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
}
}
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
int i;
int32_t *dstU = (int32_t *) _dstU;
int32_t *dstV = (int32_t *) _dstV;
for (i = 0; i < width; i++) {
dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
}
}
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
int i;
int32_t *dst = (int32_t *) _dst;
for (i = 0; i < width; i++) {
dst[i] = ((int)(FFMIN(dst[i], 30189 << 4) * 4769U - (39057361 << 2))) >> 12;
}
}
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
int i;
int32_t *dst = (int32_t *) _dst;
for (i = 0; i < width; i++)
dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
}
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) \
if (DEBUG_SWSCALE_BUFFERS) \
av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
static int swscale(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dst[], int dstStride[])
{
/* load a few things into local vars to make the code more readable
* and faster */
const int dstW = c->dstW;
const int dstH = c->dstH;
const enum AVPixelFormat dstFormat = c->dstFormat;
const int flags = c->flags;
int32_t *vLumFilterPos = c->vLumFilterPos;
int32_t *vChrFilterPos = c->vChrFilterPos;
const int vLumFilterSize = c->vLumFilterSize;
const int vChrFilterSize = c->vChrFilterSize;
yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
yuv2planarX_fn yuv2planeX = c->yuv2planeX;
yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
yuv2packedX_fn yuv2packedX = c->yuv2packedX;
yuv2anyX_fn yuv2anyX = c->yuv2anyX;
const int chrSrcSliceY = srcSliceY >> c->chrSrcVSubSample;
const int chrSrcSliceH = AV_CEIL_RSHIFT(srcSliceH, c->chrSrcVSubSample);
int should_dither = isNBPS(c->srcFormat) ||
is16BPS(c->srcFormat);
int lastDstY;
/* vars which will change and which we need to store back in the context */
int dstY = c->dstY;
int lastInLumBuf = c->lastInLumBuf;
int lastInChrBuf = c->lastInChrBuf;
int lumStart = 0;
int lumEnd = c->descIndex[0];
int chrStart = lumEnd;
int chrEnd = c->descIndex[1];
int vStart = chrEnd;
int vEnd = c->numDesc;
SwsSlice *src_slice = &c->slice[lumStart];
SwsSlice *hout_slice = &c->slice[c->numSlice-2];
SwsSlice *vout_slice = &c->slice[c->numSlice-1];
SwsFilterDescriptor *desc = c->desc;
int needAlpha = c->needAlpha;
int hasLumHoles = 1;
int hasChrHoles = 1;
if (isPacked(c->srcFormat)) {
src[1] =
src[2] =
src[3] = src[0];
srcStride[1] =
srcStride[2] =
srcStride[3] = srcStride[0];
}
srcStride[1] *= 1 << c->vChrDrop;
srcStride[2] *= 1 << c->vChrDrop;
DEBUG_BUFFERS("swscale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
src[0], srcStride[0], src[1], srcStride[1],
src[2], srcStride[2], src[3], srcStride[3],
dst[0], dstStride[0], dst[1], dstStride[1],
dst[2], dstStride[2], dst[3], dstStride[3]);
DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
srcSliceY, srcSliceH, dstY, dstH);
DEBUG_BUFFERS("vLumFilterSize: %d vChrFilterSize: %d\n",
vLumFilterSize, vChrFilterSize);
if (dstStride[0]&15 || dstStride[1]&15 ||
dstStride[2]&15 || dstStride[3]&15) {
static int warnedAlready = 0; // FIXME maybe move this into the context
if (flags & SWS_PRINT_INFO && !warnedAlready) {
av_log(c, AV_LOG_WARNING,
"Warning: dstStride is not aligned!\n"
" ->cannot do aligned memory accesses anymore\n");
warnedAlready = 1;
}
}
if ( (uintptr_t)dst[0]&15 || (uintptr_t)dst[1]&15 || (uintptr_t)dst[2]&15
|| (uintptr_t)src[0]&15 || (uintptr_t)src[1]&15 || (uintptr_t)src[2]&15
|| dstStride[0]&15 || dstStride[1]&15 || dstStride[2]&15 || dstStride[3]&15
|| srcStride[0]&15 || srcStride[1]&15 || srcStride[2]&15 || srcStride[3]&15
) {
static int warnedAlready=0;
int cpu_flags = av_get_cpu_flags();
if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speed loss\n");
warnedAlready=1;
}
}
/* Note: the user might start scaling the picture in the middle, so this
* will not get executed. This is not really intended, but it works
* currently, so people might rely on it. */
if (srcSliceY == 0) {
dstY = 0;
lastInLumBuf = -1;
lastInChrBuf = -1;
}
if (!should_dither) {
c->chrDither8 = c->lumDither8 = sws_pb_64;
}
lastDstY = dstY;
ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, c->use_mmx_vfilter);
ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
srcSliceY, srcSliceH, chrSrcSliceY, chrSrcSliceH, 1);
ff_init_slice_from_src(vout_slice, (uint8_t**)dst, dstStride, c->dstW,
dstY, dstH, dstY >> c->chrDstVSubSample,
AV_CEIL_RSHIFT(dstH, c->chrDstVSubSample), 0);
if (srcSliceY == 0) {
hout_slice->plane[0].sliceY = lastInLumBuf + 1;
hout_slice->plane[1].sliceY = lastInChrBuf + 1;
hout_slice->plane[2].sliceY = lastInChrBuf + 1;
hout_slice->plane[3].sliceY = lastInLumBuf + 1;
hout_slice->plane[0].sliceH =
hout_slice->plane[1].sliceH =
hout_slice->plane[2].sliceH =
hout_slice->plane[3].sliceH = 0;
hout_slice->width = dstW;
}
for (; dstY < dstH; dstY++) {
const int chrDstY = dstY >> c->chrDstVSubSample;
int use_mmx_vfilter= c->use_mmx_vfilter;
// First line needed as input
const int firstLumSrcY = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), dstH - 1)]);
// First line needed as input
const int firstChrSrcY = FFMAX(1 - vChrFilterSize, vChrFilterPos[chrDstY]);
// Last line needed as input
int lastLumSrcY = FFMIN(c->srcH, firstLumSrcY + vLumFilterSize) - 1;
int lastLumSrcY2 = FFMIN(c->srcH, firstLumSrcY2 + vLumFilterSize) - 1;
int lastChrSrcY = FFMIN(c->chrSrcH, firstChrSrcY + vChrFilterSize) - 1;
int enough_lines;
int i;
int posY, cPosY, firstPosY, lastPosY, firstCPosY, lastCPosY;
// handle holes (FAST_BILINEAR & weird filters)
if (firstLumSrcY > lastInLumBuf) {
hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
if (hasLumHoles) {
hout_slice->plane[0].sliceY = firstLumSrcY;
hout_slice->plane[3].sliceY = firstLumSrcY;
hout_slice->plane[0].sliceH =
hout_slice->plane[3].sliceH = 0;
}
lastInLumBuf = firstLumSrcY - 1;
}
if (firstChrSrcY > lastInChrBuf) {
hasChrHoles = lastInChrBuf != firstChrSrcY - 1;
if (hasChrHoles) {
hout_slice->plane[1].sliceY = firstChrSrcY;
hout_slice->plane[2].sliceY = firstChrSrcY;
hout_slice->plane[1].sliceH =
hout_slice->plane[2].sliceH = 0;
}
lastInChrBuf = firstChrSrcY - 1;
}
DEBUG_BUFFERS("dstY: %d\n", dstY);
DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
firstLumSrcY, lastLumSrcY, lastInLumBuf);
DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
firstChrSrcY, lastChrSrcY, lastInChrBuf);
// Do we have enough lines in this slice to output the dstY line
enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH &&
lastChrSrcY < AV_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample);
if (!enough_lines) {
lastLumSrcY = srcSliceY + srcSliceH - 1;
lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
lastLumSrcY, lastChrSrcY);
}
av_assert0((lastLumSrcY - firstLumSrcY + 1) <= hout_slice->plane[0].available_lines);
av_assert0((lastChrSrcY - firstChrSrcY + 1) <= hout_slice->plane[1].available_lines);
posY = hout_slice->plane[0].sliceY + hout_slice->plane[0].sliceH;
if (posY <= lastLumSrcY && !hasLumHoles) {
firstPosY = FFMAX(firstLumSrcY, posY);
lastPosY = FFMIN(firstLumSrcY + hout_slice->plane[0].available_lines - 1, srcSliceY + srcSliceH - 1);
} else {
firstPosY = posY;
lastPosY = lastLumSrcY;
}
cPosY = hout_slice->plane[1].sliceY + hout_slice->plane[1].sliceH;
if (cPosY <= lastChrSrcY && !hasChrHoles) {
firstCPosY = FFMAX(firstChrSrcY, cPosY);
lastCPosY = FFMIN(firstChrSrcY + hout_slice->plane[1].available_lines - 1, AV_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample) - 1);
} else {
firstCPosY = cPosY;
lastCPosY = lastChrSrcY;
}
ff_rotate_slice(hout_slice, lastPosY, lastCPosY);
if (posY < lastLumSrcY + 1) {
for (i = lumStart; i < lumEnd; ++i)
desc[i].process(c, &desc[i], firstPosY, lastPosY - firstPosY + 1);
}
lastInLumBuf = lastLumSrcY;
if (cPosY < lastChrSrcY + 1) {
for (i = chrStart; i < chrEnd; ++i)
desc[i].process(c, &desc[i], firstCPosY, lastCPosY - firstCPosY + 1);
}
lastInChrBuf = lastChrSrcY;
if (!enough_lines)
break; // we can't output a dstY line so let's try with the next slice
#if HAVE_MMX_INLINE
ff_updateMMXDitherTables(c, dstY);
#endif
if (should_dither) {
c->chrDither8 = ff_dither_8x8_128[chrDstY & 7];
c->lumDither8 = ff_dither_8x8_128[dstY & 7];
}
if (dstY >= dstH - 2) {
/* It looks like we cannot use the MMX path here without overwriting
* this array's tail */
ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
&yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
use_mmx_vfilter= 0;
ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, use_mmx_vfilter);
}
{
for (i = vStart; i < vEnd; ++i)
desc[i].process(c, &desc[i], dstY, 1);
}
}
if (isPlanar(dstFormat) && isALPHA(dstFormat) && !needAlpha) {
int length = dstW;
int height = dstY - lastDstY;
if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
fillPlane16(dst[3], dstStride[3], length, height, lastDstY,
1, desc->comp[3].depth,
isBE(dstFormat));
} else if (is32BPS(dstFormat)) {
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
fillPlane32(dst[3], dstStride[3], length, height, lastDstY,
1, desc->comp[3].depth,
isBE(dstFormat), desc->flags & AV_PIX_FMT_FLAG_FLOAT);
} else
fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
}
#if HAVE_MMXEXT_INLINE
if (av_get_cpu_flags() & AV_CPU_FLAG_MMXEXT)
__asm__ volatile ("sfence" ::: "memory");
#endif
emms_c();
/* store changed local vars back in the context */
c->dstY = dstY;
c->lastInLumBuf = lastInLumBuf;
c->lastInChrBuf = lastInChrBuf;
return dstY - lastDstY;
}
av_cold void ff_sws_init_range_convert(SwsContext *c)
{
c->lumConvertRange = NULL;
c->chrConvertRange = NULL;
if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
if (c->dstBpc <= 14) {
if (c->srcRange) {
c->lumConvertRange = lumRangeFromJpeg_c;
c->chrConvertRange = chrRangeFromJpeg_c;
} else {
c->lumConvertRange = lumRangeToJpeg_c;
c->chrConvertRange = chrRangeToJpeg_c;
}
} else {
if (c->srcRange) {
c->lumConvertRange = lumRangeFromJpeg16_c;
c->chrConvertRange = chrRangeFromJpeg16_c;
} else {
c->lumConvertRange = lumRangeToJpeg16_c;
c->chrConvertRange = chrRangeToJpeg16_c;
}
}
}
}
static av_cold void sws_init_swscale(SwsContext *c)
{
enum AVPixelFormat srcFormat = c->srcFormat;
ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
&c->yuv2nv12cX, &c->yuv2packed1,
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
ff_sws_init_input_funcs(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale8To15_c;
if (c->flags & SWS_FAST_BILINEAR) {
c->hyscale_fast = ff_hyscale_fast_c;
c->hcscale_fast = ff_hcscale_fast_c;
}
} else {
c->hyScale = c->hcScale = hScale8To19_c;
}
} else {
c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_c
: hScale16To15_c;
}
ff_sws_init_range_convert(c);
if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))
c->needs_hcscale = 1;
}
SwsFunc ff_getSwsFunc(SwsContext *c)
{
sws_init_swscale(c);
if (ARCH_PPC)
ff_sws_init_swscale_ppc(c);
if (ARCH_X86)
ff_sws_init_swscale_x86(c);
if (ARCH_AARCH64)
ff_sws_init_swscale_aarch64(c);
if (ARCH_ARM)
ff_sws_init_swscale_arm(c);
return swscale;
}
static void reset_ptr(const uint8_t *src[], enum AVPixelFormat format)
{
if (!isALPHA(format))
src[3] = NULL;
if (!isPlanar(format)) {
src[3] = src[2] = NULL;
if (!usePal(format))
src[1] = NULL;
}
}
static int check_image_pointers(const uint8_t * const data[4], enum AVPixelFormat pix_fmt,
const int linesizes[4])
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
int i;
av_assert2(desc);
for (i = 0; i < 4; i++) {
int plane = desc->comp[i].plane;
if (!data[plane] || !linesizes[plane])
return 0;
}
return 1;
}
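/* The XYZ<->RGB converters below work in three fixed-point stages: a 4096-entry
 * gamma LUT linearizes the 12 most significant bits of each 16-bit sample,
 * a 3x3 matrix in Q12 maps between color spaces (hence the >> 12 per row),
 * and after clipping to 12 bits an inverse-gamma LUT plus << 4 restores a
 * 16-bit nonlinear sample. */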
static void xyz12Torgb48(struct SwsContext *c, uint16_t *dst,
const uint16_t *src, int stride, int h)
{
int xp,yp;
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
for (yp=0; yp<h; yp++) {
for (xp=0; xp+2<stride; xp+=3) {
int x, y, z, r, g, b;
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
x = AV_RB16(src + xp + 0);
y = AV_RB16(src + xp + 1);
z = AV_RB16(src + xp + 2);
} else {
x = AV_RL16(src + xp + 0);
y = AV_RL16(src + xp + 1);
z = AV_RL16(src + xp + 2);
}
x = c->xyzgamma[x>>4];
y = c->xyzgamma[y>>4];
z = c->xyzgamma[z>>4];
// convert from XYZlinear to sRGBlinear
r = c->xyz2rgb_matrix[0][0] * x +
c->xyz2rgb_matrix[0][1] * y +
c->xyz2rgb_matrix[0][2] * z >> 12;
g = c->xyz2rgb_matrix[1][0] * x +
c->xyz2rgb_matrix[1][1] * y +
c->xyz2rgb_matrix[1][2] * z >> 12;
b = c->xyz2rgb_matrix[2][0] * x +
c->xyz2rgb_matrix[2][1] * y +
c->xyz2rgb_matrix[2][2] * z >> 12;
// limit values to 12-bit depth
r = av_clip_uintp2(r, 12);
g = av_clip_uintp2(g, 12);
b = av_clip_uintp2(b, 12);
// convert from sRGBlinear to RGB and scale from 12bit to 16bit
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
AV_WB16(dst + xp + 0, c->rgbgamma[r] << 4);
AV_WB16(dst + xp + 1, c->rgbgamma[g] << 4);
AV_WB16(dst + xp + 2, c->rgbgamma[b] << 4);
} else {
AV_WL16(dst + xp + 0, c->rgbgamma[r] << 4);
AV_WL16(dst + xp + 1, c->rgbgamma[g] << 4);
AV_WL16(dst + xp + 2, c->rgbgamma[b] << 4);
}
}
src += stride;
dst += stride;
}
}
static void rgb48Toxyz12(struct SwsContext *c, uint16_t *dst,
const uint16_t *src, int stride, int h)
{
int xp,yp;
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat);
for (yp=0; yp<h; yp++) {
for (xp=0; xp+2<stride; xp+=3) {
int x, y, z, r, g, b;
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
r = AV_RB16(src + xp + 0);
g = AV_RB16(src + xp + 1);
b = AV_RB16(src + xp + 2);
} else {
r = AV_RL16(src + xp + 0);
g = AV_RL16(src + xp + 1);
b = AV_RL16(src + xp + 2);
}
r = c->rgbgammainv[r>>4];
g = c->rgbgammainv[g>>4];
b = c->rgbgammainv[b>>4];
// convert from sRGBlinear to XYZlinear
x = c->rgb2xyz_matrix[0][0] * r +
c->rgb2xyz_matrix[0][1] * g +
c->rgb2xyz_matrix[0][2] * b >> 12;
y = c->rgb2xyz_matrix[1][0] * r +
c->rgb2xyz_matrix[1][1] * g +
c->rgb2xyz_matrix[1][2] * b >> 12;
z = c->rgb2xyz_matrix[2][0] * r +
c->rgb2xyz_matrix[2][1] * g +
c->rgb2xyz_matrix[2][2] * b >> 12;
// limit values to 12-bit depth
x = av_clip_uintp2(x, 12);
y = av_clip_uintp2(y, 12);
z = av_clip_uintp2(z, 12);
// convert from XYZlinear to X'Y'Z' and scale from 12bit to 16bit
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
AV_WB16(dst + xp + 0, c->xyzgammainv[x] << 4);
AV_WB16(dst + xp + 1, c->xyzgammainv[y] << 4);
AV_WB16(dst + xp + 2, c->xyzgammainv[z] << 4);
} else {
AV_WL16(dst + xp + 0, c->xyzgammainv[x] << 4);
AV_WL16(dst + xp + 1, c->xyzgammainv[y] << 4);
AV_WL16(dst + xp + 2, c->xyzgammainv[z] << 4);
}
}
src += stride;
dst += stride;
}
}
/**
* swscale wrapper, so we don't need to export the SwsContext.
* Assumes planar YUV to be in YUV order instead of YVU.
*/
int attribute_align_arg sws_scale(struct SwsContext *c,
const uint8_t * const srcSlice[],
const int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *const dst[],
const int dstStride[])
{
int i, ret;
const uint8_t *src2[4];
uint8_t *dst2[4];
uint8_t *rgb0_tmp = NULL;
int macro_height = isBayer(c->srcFormat) ? 2 : (1 << c->chrSrcVSubSample);
// copy strides, so they can safely be modified
int srcStride2[4];
int dstStride2[4];
int srcSliceY_internal = srcSliceY;
if (!srcStride || !dstStride || !dst || !srcSlice) {
av_log(c, AV_LOG_ERROR, "One of the input parameters to sws_scale() is NULL, please check the calling code\n");
return 0;
}
for (i=0; i<4; i++) {
srcStride2[i] = srcStride[i];
dstStride2[i] = dstStride[i];
}
if ((srcSliceY & (macro_height-1)) ||
((srcSliceH& (macro_height-1)) && srcSliceY + srcSliceH != c->srcH) ||
srcSliceY + srcSliceH > c->srcH) {
av_log(c, AV_LOG_ERROR, "Slice parameters %d, %d are invalid\n", srcSliceY, srcSliceH);
return AVERROR(EINVAL);
}
if (c->gamma_flag && c->cascaded_context[0]) {
ret = sws_scale(c->cascaded_context[0],
srcSlice, srcStride, srcSliceY, srcSliceH,
c->cascaded_tmp, c->cascaded_tmpStride);
if (ret < 0)
return ret;
if (c->cascaded_context[2])
ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp, c->cascaded_tmpStride, srcSliceY, srcSliceH, c->cascaded1_tmp, c->cascaded1_tmpStride);
else
ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp, c->cascaded_tmpStride, srcSliceY, srcSliceH, dst, dstStride);
if (ret < 0)
return ret;
if (c->cascaded_context[2]) {
ret = sws_scale(c->cascaded_context[2],
(const uint8_t * const *)c->cascaded1_tmp, c->cascaded1_tmpStride, c->cascaded_context[1]->dstY - ret, c->cascaded_context[1]->dstY,
dst, dstStride);
}
return ret;
}
if (c->cascaded_context[0] && srcSliceY == 0 && srcSliceH == c->cascaded_context[0]->srcH) {
ret = sws_scale(c->cascaded_context[0],
srcSlice, srcStride, srcSliceY, srcSliceH,
c->cascaded_tmp, c->cascaded_tmpStride);
if (ret < 0)
return ret;
ret = sws_scale(c->cascaded_context[1],
(const uint8_t * const * )c->cascaded_tmp, c->cascaded_tmpStride, 0, c->cascaded_context[0]->dstH,
dst, dstStride);
return ret;
}
memcpy(src2, srcSlice, sizeof(src2));
memcpy(dst2, dst, sizeof(dst2));
// do not mess up sliceDir if we have a "trailing" 0-size slice
if (srcSliceH == 0)
return 0;
if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
return 0;
}
if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
return 0;
}
if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
return 0;
}
if (c->sliceDir == 0) {
if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
}
if (usePal(c->srcFormat)) {
for (i = 0; i < 256; i++) {
int r, g, b, y, u, v, a = 0xff;
if (c->srcFormat == AV_PIX_FMT_PAL8) {
uint32_t p = ((const uint32_t *)(srcSlice[1]))[i];
a = (p >> 24) & 0xFF;
r = (p >> 16) & 0xFF;
g = (p >> 8) & 0xFF;
b = p & 0xFF;
} else if (c->srcFormat == AV_PIX_FMT_RGB8) {
r = ( i >> 5 ) * 36;
g = ((i >> 2) & 7) * 36;
b = ( i & 3) * 85;
} else if (c->srcFormat == AV_PIX_FMT_BGR8) {
b = ( i >> 6 ) * 85;
g = ((i >> 3) & 7) * 36;
r = ( i & 7) * 36;
} else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) {
r = ( i >> 3 ) * 255;
g = ((i >> 1) & 3) * 85;
b = ( i & 1) * 255;
} else if (c->srcFormat == AV_PIX_FMT_GRAY8 || c->srcFormat == AV_PIX_FMT_GRAY8A) {
r = g = b = i;
} else {
av_assert1(c->srcFormat == AV_PIX_FMT_BGR4_BYTE);
b = ( i >> 3 ) * 255;
g = ((i >> 1) & 3) * 85;
r = ( i & 1) * 255;
}
#define RGB2YUV_SHIFT 15
#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
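/* The add folds the range offset and 0.5 rounding into one constant:
 * 33 << (RGB2YUV_SHIFT - 1) equals (16 + 0.5) << RGB2YUV_SHIFT and
 * 257 << (RGB2YUV_SHIFT - 1) equals (128 + 0.5) << RGB2YUV_SHIFT. */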
y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
c->pal_yuv[i]= y + (u<<8) + (v<<16) + ((unsigned)a<<24);
switch (c->dstFormat) {
case AV_PIX_FMT_BGR32:
#if !HAVE_BIGENDIAN
case AV_PIX_FMT_RGB24:
#endif
c->pal_rgb[i]= r + (g<<8) + (b<<16) + ((unsigned)a<<24);
break;
case AV_PIX_FMT_BGR32_1:
#if HAVE_BIGENDIAN
case AV_PIX_FMT_BGR24:
#endif
c->pal_rgb[i]= a + (r<<8) + (g<<16) + ((unsigned)b<<24);
break;
case AV_PIX_FMT_RGB32_1:
#if HAVE_BIGENDIAN
case AV_PIX_FMT_RGB24:
#endif
c->pal_rgb[i]= a + (b<<8) + (g<<16) + ((unsigned)r<<24);
break;
case AV_PIX_FMT_RGB32:
#if !HAVE_BIGENDIAN
case AV_PIX_FMT_BGR24:
#endif
default:
c->pal_rgb[i]= b + (g<<8) + (r<<16) + ((unsigned)a<<24);
}
}
}
if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
uint8_t *base;
int x,y;
rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
if (!rgb0_tmp)
return AVERROR(ENOMEM);
base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
for (y=0; y<srcSliceH; y++){
memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW);
for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) {
base[ srcStride[0]*y + x] = 0xFF;
}
}
src2[0] = base;
}
if (c->srcXYZ && !(c->dstXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) {
uint8_t *base;
rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
if (!rgb0_tmp)
return AVERROR(ENOMEM);
base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
xyz12Torgb48(c, (uint16_t*)base, (const uint16_t*)src2[0], srcStride[0]/2, srcSliceH);
src2[0] = base;
}
if (!srcSliceY && (c->flags & SWS_BITEXACT) && c->dither == SWS_DITHER_ED && c->dither_error[0])
for (i = 0; i < 4; i++)
memset(c->dither_error[i], 0, sizeof(c->dither_error[0][0]) * (c->dstW+2));
if (c->sliceDir != 1) {
// slices go from bottom to top => we flip the image internally
for (i=0; i<4; i++) {
srcStride2[i] *= -1;
dstStride2[i] *= -1;
}
src2[0] += (srcSliceH - 1) * srcStride[0];
if (!usePal(c->srcFormat))
src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1];
src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2];
src2[3] += (srcSliceH - 1) * srcStride[3];
dst2[0] += ( c->dstH - 1) * dstStride[0];
dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1];
dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2];
dst2[3] += ( c->dstH - 1) * dstStride[3];
srcSliceY_internal = c->srcH-srcSliceY-srcSliceH;
}
reset_ptr(src2, c->srcFormat);
reset_ptr((void*)dst2, c->dstFormat);
/* reset slice direction at end of frame */
if (srcSliceY_internal + srcSliceH == c->srcH)
c->sliceDir = 0;
ret = c->swscale(c, src2, srcStride2, srcSliceY_internal, srcSliceH, dst2, dstStride2);
if (c->dstXYZ && !(c->srcXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) {
int dstY = c->dstY ? c->dstY : srcSliceY + srcSliceH;
uint16_t *dst16 = (uint16_t*)(dst2[0] + (dstY - ret) * dstStride2[0]);
av_assert0(dstY >= ret);
av_assert0(ret >= 0);
av_assert0(c->dstH >= dstY);
/* convert in place on the same buffer */
rgb48Toxyz12(c, dst16, dst16, dstStride2[0]/2, ret);
}
av_free(rgb0_tmp);
return ret;
}

336
externals/ffmpeg/ffmpeg/libswscale/swscale.h vendored Executable file
View File

@@ -0,0 +1,336 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_SWSCALE_H
#define SWSCALE_SWSCALE_H
/**
* @file
* @ingroup libsws
* external API header
*/
#include <stdint.h>
#include "libavutil/avutil.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "version.h"
/**
* @defgroup libsws libswscale
* Color conversion and scaling library.
*
* @{
*
* Return the LIBSWSCALE_VERSION_INT constant.
*/
unsigned swscale_version(void);
/**
* Return the libswscale build-time configuration.
*/
const char *swscale_configuration(void);
/**
* Return the libswscale license.
*/
const char *swscale_license(void);
/* values for the flags, the stuff on the command line is different */
#define SWS_FAST_BILINEAR 1
#define SWS_BILINEAR 2
#define SWS_BICUBIC 4
#define SWS_X 8
#define SWS_POINT 0x10
#define SWS_AREA 0x20
#define SWS_BICUBLIN 0x40
#define SWS_GAUSS 0x80
#define SWS_SINC 0x100
#define SWS_LANCZOS 0x200
#define SWS_SPLINE 0x400
#define SWS_SRC_V_CHR_DROP_MASK 0x30000
#define SWS_SRC_V_CHR_DROP_SHIFT 16
#define SWS_PARAM_DEFAULT 123456
#define SWS_PRINT_INFO 0x1000
//the following 3 flags are not completely implemented
//internal chrominance subsampling info
#define SWS_FULL_CHR_H_INT 0x2000
//input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR 0x8000
#define SWS_ACCURATE_RND 0x40000
#define SWS_BITEXACT 0x80000
#define SWS_ERROR_DIFFUSION 0x800000
#define SWS_MAX_REDUCE_CUTOFF 0.002
#define SWS_CS_ITU709 1
#define SWS_CS_FCC 4
#define SWS_CS_ITU601 5
#define SWS_CS_ITU624 5
#define SWS_CS_SMPTE170M 5
#define SWS_CS_SMPTE240M 7
#define SWS_CS_DEFAULT 5
#define SWS_CS_BT2020 9
/**
* Return a pointer to yuv<->rgb coefficients for the given colorspace
* suitable for sws_setColorspaceDetails().
*
* @param colorspace One of the SWS_CS_* macros. If invalid,
* SWS_CS_DEFAULT is used.
*/
const int *sws_getCoefficients(int colorspace);
// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct SwsVector {
double *coeff; ///< pointer to the list of coefficients
int length; ///< number of coefficients in the vector
} SwsVector;
// vectors can be shared
typedef struct SwsFilter {
SwsVector *lumH;
SwsVector *lumV;
SwsVector *chrH;
SwsVector *chrV;
} SwsFilter;
struct SwsContext;
/**
* Return a positive value if pix_fmt is a supported input format, 0
* otherwise.
*/
int sws_isSupportedInput(enum AVPixelFormat pix_fmt);
/**
* Return a positive value if pix_fmt is a supported output format, 0
* otherwise.
*/
int sws_isSupportedOutput(enum AVPixelFormat pix_fmt);
/**
* @param[in] pix_fmt the pixel format
* @return a positive value if an endianness conversion for pix_fmt is
* supported, 0 otherwise.
*/
int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt);
/**
* Allocate an empty SwsContext. This must be filled and passed to
* sws_init_context(). For filling see AVOptions, options.c and
* sws_setColorspaceDetails().
*/
struct SwsContext *sws_alloc_context(void);
/**
* Initialize the swscaler context sws_context.
*
* @return zero or positive value on success, a negative value on
* error
*/
av_warn_unused_result
int sws_init_context(struct SwsContext *sws_context, SwsFilter *srcFilter, SwsFilter *dstFilter);
/**
* Free the swscaler context swsContext.
* If swsContext is NULL, then does nothing.
*/
void sws_freeContext(struct SwsContext *swsContext);
/**
* Allocate and return an SwsContext. You need it to perform
* scaling/conversion operations using sws_scale().
*
* @param srcW the width of the source image
* @param srcH the height of the source image
* @param srcFormat the source image format
* @param dstW the width of the destination image
* @param dstH the height of the destination image
* @param dstFormat the destination image format
* @param flags specify which algorithm and options to use for rescaling
* @param param extra parameters to tune the used scaler
* For SWS_BICUBIC param[0] and [1] tune the shape of the basis
* function, param[0] tunes f(1) and param[1] f'(1)
* For SWS_GAUSS param[0] tunes the exponent and thus cutoff
* frequency
* For SWS_LANCZOS param[0] tunes the width of the window function
* @return a pointer to an allocated context, or NULL in case of error
* @note this function is to be removed after a saner alternative is
* written
*/
struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
/**
* Scale the image slice in srcSlice and put the resulting scaled
* slice in the image in dst. A slice is a sequence of consecutive
* rows in an image.
*
* Slices have to be provided in sequential order, either in
* top-bottom or bottom-top order. If slices are provided in
* non-sequential order the behavior of the function is undefined.
*
* @param c the scaling context previously created with
* sws_getContext()
* @param srcSlice the array containing the pointers to the planes of
* the source slice
* @param srcStride the array containing the strides for each plane of
* the source image
* @param srcSliceY the position in the source image of the slice to
* process, that is the number (counted starting from
* zero) in the image of the first row of the slice
* @param srcSliceH the height of the source slice, that is the number
* of rows in the slice
* @param dst the array containing the pointers to the planes of
* the destination image
* @param dstStride the array containing the strides for each plane of
* the destination image
* @return the height of the output slice
*/
int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]);
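/* Minimal usage sketch (assumptions: an RGB24 source frame in src_data /
 * src_linesize and a YUV420P destination already allocated, e.g. with
 * av_image_alloc(); error handling omitted):
 *
 *     struct SwsContext *ctx = sws_getContext(w, h, AV_PIX_FMT_RGB24,
 *                                             w, h, AV_PIX_FMT_YUV420P,
 *                                             SWS_BILINEAR, NULL, NULL, NULL);
 *     sws_scale(ctx, (const uint8_t *const *)src_data, src_linesize,
 *               0, h, dst_data, dst_linesize);
 *     sws_freeContext(ctx);
 */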
/**
* @param dstRange flag indicating the white-black range of the output (1=jpeg / 0=mpeg)
* @param srcRange flag indicating the white-black range of the input (1=jpeg / 0=mpeg)
* @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
* @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
* @param brightness 16.16 fixed point brightness correction
* @param contrast 16.16 fixed point contrast correction
* @param saturation 16.16 fixed point saturation correction
* @return -1 if not supported
*/
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
int srcRange, const int table[4], int dstRange,
int brightness, int contrast, int saturation);
/**
* @return -1 if not supported
*/
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
int *srcRange, int **table, int *dstRange,
int *brightness, int *contrast, int *saturation);
/**
* Allocate and return an uninitialized vector with length coefficients.
*/
SwsVector *sws_allocVec(int length);
/**
* Return a normalized Gaussian curve used for filtering;
* quality = 3 is high quality, lower values give lower quality.
*/
SwsVector *sws_getGaussianVec(double variance, double quality);
/**
* Scale all the coefficients of a by the scalar value.
*/
void sws_scaleVec(SwsVector *a, double scalar);
/**
* Scale all the coefficients of a so that their sum equals height.
*/
void sws_normalizeVec(SwsVector *a, double height);
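/* Sketch of typical vector use (parameter values chosen for illustration):
 * build a Gaussian blur vector, normalize it so its coefficients sum to 1.0,
 * then free it once the filter is no longer needed:
 *
 *     SwsVector *vec = sws_getGaussianVec(3.0, 3.0);
 *     if (vec) {
 *         sws_normalizeVec(vec, 1.0);
 *         // e.g. use vec as the lumH/lumV members of an SwsFilter
 *         sws_freeVec(vec);
 *     }
 */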
#if FF_API_SWS_VECTOR
attribute_deprecated SwsVector *sws_getConstVec(double c, int length);
attribute_deprecated SwsVector *sws_getIdentityVec(void);
attribute_deprecated void sws_convVec(SwsVector *a, SwsVector *b);
attribute_deprecated void sws_addVec(SwsVector *a, SwsVector *b);
attribute_deprecated void sws_subVec(SwsVector *a, SwsVector *b);
attribute_deprecated void sws_shiftVec(SwsVector *a, int shift);
attribute_deprecated SwsVector *sws_cloneVec(SwsVector *a);
attribute_deprecated void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
#endif
void sws_freeVec(SwsVector *a);
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSharpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose);
void sws_freeFilter(SwsFilter *filter);
/**
* Check if context can be reused, otherwise reallocate a new one.
*
* If context is NULL, just calls sws_getContext() to get a new
* context. Otherwise, checks if the parameters are the ones already
* saved in context. If that is the case, returns the current
* context. Otherwise, frees context and gets a new context with
* the new parameters.
*
* Be warned that srcFilter and dstFilter are not checked, they
* are assumed to remain the same.
*/
struct SwsContext *sws_getCachedContext(struct SwsContext *context,
int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
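/* Typical per-frame pattern (sketch; in_w, in_fmt, out_w etc. are
 * placeholders). If the stream geometry changes mid-stream, the old
 * context is transparently replaced:
 *
 *     ctx = sws_getCachedContext(ctx, in_w, in_h, in_fmt,
 *                                out_w, out_h, out_fmt,
 *                                SWS_BILINEAR, NULL, NULL, NULL);
 *     if (!ctx)
 *         return -1;
 */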
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 32 bits.
*
* The output frame will have the same packed format as the palette.
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
 * @param palette array with 256 entries, which must match the color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 24 bits.
*
* With the palette format "ABCD", the destination frame ends up with the format "ABC".
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
 * @param palette array with 256 entries, which must match the color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
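/* Sketch: expand a PAL8 image line by line, where pal is assumed to hold the
 * 256 32-bit palette entries of the source and the linesizes come from the
 * surrounding allocation:
 *
 *     for (y = 0; y < height; y++)
 *         sws_convertPalette8ToPacked32(src + y * src_linesize,
 *                                       dst + y * dst_linesize,
 *                                       width, pal);
 */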
/**
* Get the AVClass for swsContext. It can be used in combination with
* AV_OPT_SEARCH_FAKE_OBJ for examining options.
*
* @see av_opt_find().
*/
const AVClass *sws_get_class(void);
/**
* @}
*/
#endif /* SWSCALE_SWSCALE_H */

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,55 @@
/*
* Windows resource file for libswscale
*
* Copyright (C) 2012 James Almer
* Copyright (C) 2013 Tiancheng "Timothy" Gu
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <windows.h>
#include "libswscale/version.h"
#include "libavutil/ffversion.h"
#include "config.h"
1 VERSIONINFO
FILEVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0
PRODUCTVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0
FILEFLAGSMASK VS_FFI_FILEFLAGSMASK
FILEOS VOS_NT_WINDOWS32
FILETYPE VFT_DLL
{
BLOCK "StringFileInfo"
{
BLOCK "040904B0"
{
VALUE "CompanyName", "FFmpeg Project"
VALUE "FileDescription", "FFmpeg image rescaling library"
VALUE "FileVersion", AV_STRINGIFY(LIBSWSCALE_VERSION)
VALUE "InternalName", "libswscale"
VALUE "LegalCopyright", "Copyright (C) 2000-" AV_STRINGIFY(CONFIG_THIS_YEAR) " FFmpeg Project"
VALUE "OriginalFilename", "swscale" BUILDSUF "-" AV_STRINGIFY(LIBSWSCALE_VERSION_MAJOR) SLIBSUF
VALUE "ProductName", "FFmpeg"
VALUE "ProductVersion", FFMPEG_VERSION
}
}
BLOCK "VarFileInfo"
{
VALUE "Translation", 0x0409, 0x04B0
}
}


@@ -0,0 +1,4 @@
/colorspace
/floatimg_cmp
/pixdesc_query
/swscale


@@ -0,0 +1,171 @@
/*
* Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <string.h> /* for memset() */
#include <stdlib.h>
#include <inttypes.h>
#include "libavutil/mem.h"
#include "libswscale/swscale.h"
#include "libswscale/rgb2rgb.h"
#define SIZE 1000
#define srcByte 0x55
#define dstByte 0xBB
#define FUNC(s, d, n) { s, d, #n, n }
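/* Test strategy: the source buffer is filled with srcByte and the destination
 * with dstByte; each converter is run for many width/offset combinations, and
 * any modified source byte, or destination byte outside the region the
 * converter was supposed to write, is reported as memory corruption. */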
int main(int argc, char **argv)
{
int i, funcNum;
uint8_t *srcBuffer = av_malloc(SIZE);
uint8_t *dstBuffer = av_malloc(SIZE);
int failedNum = 0;
int passedNum = 0;
if (!srcBuffer || !dstBuffer)
return -1;
av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
ff_sws_rgb2rgb_init();
for (funcNum = 0; ; funcNum++) {
struct func_info_s {
int src_bpp;
int dst_bpp;
const char *name;
void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
} func_info[] = {
FUNC(2, 2, rgb12to15),
FUNC(2, 2, rgb15to16),
FUNC(2, 3, rgb15to24),
FUNC(2, 4, rgb15to32),
FUNC(2, 3, rgb16to24),
FUNC(2, 4, rgb16to32),
FUNC(3, 2, rgb24to15),
FUNC(3, 2, rgb24to16),
FUNC(3, 4, rgb24to32),
FUNC(4, 2, rgb32to15),
FUNC(4, 2, rgb32to16),
FUNC(4, 3, rgb32to24),
FUNC(2, 2, rgb16to15),
FUNC(2, 2, rgb12tobgr12),
FUNC(2, 2, rgb15tobgr15),
FUNC(2, 2, rgb15tobgr16),
FUNC(2, 3, rgb15tobgr24),
FUNC(2, 4, rgb15tobgr32),
FUNC(2, 2, rgb16tobgr15),
FUNC(2, 2, rgb16tobgr16),
FUNC(2, 3, rgb16tobgr24),
FUNC(2, 4, rgb16tobgr32),
FUNC(3, 2, rgb24tobgr15),
FUNC(3, 2, rgb24tobgr16),
FUNC(3, 3, rgb24tobgr24),
FUNC(3, 4, rgb24tobgr32),
FUNC(4, 2, rgb32tobgr15),
FUNC(4, 2, rgb32tobgr16),
FUNC(4, 3, rgb32tobgr24),
FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
FUNC(6, 6, rgb48tobgr48_nobswap),
FUNC(6, 6, rgb48tobgr48_bswap),
FUNC(8, 6, rgb64to48_nobswap),
FUNC(8, 6, rgb64to48_bswap),
FUNC(8, 6, rgb64tobgr48_nobswap),
FUNC(8, 6, rgb64tobgr48_bswap),
FUNC(0, 0, NULL)
};
int width;
int failed = 0;
int srcBpp = 0;
int dstBpp = 0;
if (!func_info[funcNum].func)
break;
av_log(NULL, AV_LOG_INFO, ".");
memset(srcBuffer, srcByte, SIZE);
for (width = 63; width > 0; width--) {
int dstOffset;
for (dstOffset = 128; dstOffset < 196; dstOffset += 4) {
int srcOffset;
memset(dstBuffer, dstByte, SIZE);
for (srcOffset = 128; srcOffset < 196; srcOffset += 4) {
uint8_t *src = srcBuffer + srcOffset;
uint8_t *dst = dstBuffer + dstOffset;
const char *name = NULL;
// stop after the first failure so we don't flood the log
if (failed)
break;
srcBpp = func_info[funcNum].src_bpp;
dstBpp = func_info[funcNum].dst_bpp;
name = func_info[funcNum].name;
func_info[funcNum].func(src, dst, width * srcBpp);
if (!srcBpp)
break;
for (i = 0; i < SIZE; i++) {
if (srcBuffer[i] != srcByte) {
av_log(NULL, AV_LOG_INFO,
"src damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = 0; i < dstOffset; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = dstOffset + width * dstBpp; i < SIZE; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
}
}
}
if (failed)
failedNum++;
else if (srcBpp)
passedNum++;
}
av_log(NULL, AV_LOG_INFO,
"\n%d converters passed, %d converters randomly overwrote memory\n",
passedNum, failedNum);
return failedNum;
}


@@ -0,0 +1,296 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <inttypes.h>
#include <float.h>
#include "libavutil/avutil.h"
#include "libavutil/imgutils.h"
#include "libavutil/intfloat.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/lfg.h"
#include "libavutil/mem.h"
#include "libavutil/parseutils.h"
#include "libavutil/pixdesc.h"
#include "libswscale/swscale.h"
#define DEFAULT_W 96
#define DEFAULT_H 96
static const enum AVPixelFormat pix_fmts[] = {
AV_PIX_FMT_YUV444P16LE,
AV_PIX_FMT_YUV444P,
AV_PIX_FMT_YUV444P9LE, AV_PIX_FMT_YUV444P10LE,
AV_PIX_FMT_YUV444P12LE, AV_PIX_FMT_YUV444P14LE,
AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
AV_PIX_FMT_RGBA, AV_PIX_FMT_BGRA,
AV_PIX_FMT_ARGB, AV_PIX_FMT_ABGR,
AV_PIX_FMT_0RGB, AV_PIX_FMT_0BGR,
AV_PIX_FMT_RGB0, AV_PIX_FMT_BGR0,
AV_PIX_FMT_RGB48LE, AV_PIX_FMT_BGR48LE,
AV_PIX_FMT_RGBA64LE, AV_PIX_FMT_BGRA64LE,
AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP,
AV_PIX_FMT_GBRP9LE,
AV_PIX_FMT_GBRP10LE, AV_PIX_FMT_GBRAP10LE,
AV_PIX_FMT_GBRP12LE, AV_PIX_FMT_GBRAP12LE,
AV_PIX_FMT_GBRP14LE,
AV_PIX_FMT_GBRP16LE, AV_PIX_FMT_GBRAP16LE
};
const char *usage = "floatimg_cmp -pixel_format <pix_fmt> -size <image_size> -ref <testfile>\n";
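/* The test converts a floating-point RGB image (read from -ref, or random
 * values in [0.0, 1.0] if no reference file is given) to every format in
 * pix_fmts and back, then reports the average, minimum and maximum
 * per-sample absolute error of the round trip. */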
int main(int argc, char **argv)
{
enum AVPixelFormat inFormat = AV_PIX_FMT_NONE;
enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
const AVPixFmtDescriptor *desc;
uint8_t *ptr;
uint32_t *in, *out;
uint8_t *rgbIn[4] = {NULL, NULL, NULL, NULL};
uint8_t *rgbOut[4] = {NULL, NULL, NULL, NULL};
int rgbStride[4];
uint8_t *dst[4] = {NULL, NULL, NULL, NULL};
int dstStride[4];
int i, x, y, p, size, count;
int res = -1;
int w = -1;
int h = -1;
union av_intfloat32 v0, v1;
double sum;
float minimum, maximum, diff;
struct SwsContext *sws = NULL;
AVLFG rand;
FILE *fp = NULL;
for (i = 1; i < argc; i += 2) {
if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
fp = fopen(argv[i + 1], "rb");
if (!fp) {
fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
goto end;
}
} else if (!strcmp(argv[i], "-size")) {
res = av_parse_video_size(&w, &h, argv[i + 1]);
if (res < 0) {
fprintf(stderr, "invalid video size %s\n", argv[i + 1]);
goto end;
}
} else if (!strcmp(argv[i], "-pixel_format")) {
inFormat = av_get_pix_fmt(argv[i + 1]);
if (inFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
goto end;
}
} else {
bad_option:
fprintf(stderr, "%s", usage);
fprintf(stderr, "bad option or argument missing (%s)\n", argv[i]);
goto end;
}
}
if (!fp) {
inFormat = AV_PIX_FMT_GBRPF32LE;
w = DEFAULT_W;
h = DEFAULT_H;
}
if (w <= 0 || h <= 0) {
fprintf(stderr, "%s", usage);
fprintf(stderr, "invalid -video_size\n");
goto end;
}
if (inFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "%s", usage);
fprintf(stderr, "invalid input pixel format\n");
goto end;
}
desc = av_pix_fmt_desc_get(inFormat);
if (!(desc->flags & AV_PIX_FMT_FLAG_FLOAT)) {
fprintf(stderr, "input pixel format not floating point.\n");
goto end;
}
res = av_image_fill_linesizes(rgbStride, inFormat, w);
if (res < 0) {
fprintf(stderr, "av_image_fill_linesizes failed\n");
goto end;
}
for (p = 0; p < 4; p++) {
rgbStride[p] = FFALIGN(rgbStride[p], 16);
if (rgbStride[p]) {
rgbIn[p] = av_mallocz(rgbStride[p] * h + 16);
rgbOut[p] = av_mallocz(rgbStride[p] * h + 16);
}
if (rgbStride[p] && (!rgbIn[p] || !rgbOut[p])) {
goto end;
}
}
for (i = 0; i < FF_ARRAY_ELEMS(pix_fmts); i++) {
dstFormat = pix_fmts[i];
if (fp) {
fseek(fp, 0, SEEK_SET);
for (p = 0; p < 4; p++) {
if (!rgbStride[p])
continue;
ptr = rgbIn[p];
for (y = 0; y < h; y++) {
size = fread(ptr, 1, w*4, fp);
if (size != w*4) {
fprintf(stderr, "read error: %d\n", size);
goto end;
}
ptr += rgbStride[p];
}
}
} else {
// fill src with random values between 0.0 - 1.0
av_lfg_init(&rand, 1);
for (p = 0; p < 4; p++) {
if (!rgbStride[p])
continue;
for (y = 0; y < h; y++) {
in = (uint32_t*)(rgbIn[p] + y * rgbStride[p]);
for (x = 0; x < w; x++) {
v0.f = (float)av_lfg_get(&rand)/(float)(UINT32_MAX);
*in++ = AV_RL32(&v0.i);
}
}
}
}
// setup intermediate image
for (p = 0; p < 4; p++) {
av_freep(&dst[p]);
}
res = av_image_fill_linesizes(dstStride, dstFormat, w);
if (res < 0) {
fprintf(stderr, "av_image_fill_linesizes failed\n");
goto end;
}
for (p = 0; p < 4; p++) {
dstStride[p] = FFALIGN(dstStride[p], 16);
if (dstStride[p]) {
dst[p] = av_mallocz(dstStride[p] * h + 16);
}
if (dstStride[p] && !dst[p]) {
goto end;
}
}
// srcFormat -> dstFormat
sws = sws_getContext(w, h, inFormat, w, h,
dstFormat, SWS_BILINEAR, NULL, NULL, NULL);
if (!sws) {
fprintf(stderr, "Failed to get %s -> %s\n", av_get_pix_fmt_name(inFormat), av_get_pix_fmt_name(dstFormat) );
goto end;
}
res = sws_scale(sws, (const uint8_t *const *)rgbIn, rgbStride, 0, h, dst, dstStride);
if (res < 0 || res != h) {
fprintf(stderr, "sws_scale failed\n");
res = -1;
goto end;
}
sws_freeContext(sws);
// dstFormat -> srcFormat
sws = sws_getContext(w, h, dstFormat, w, h,
inFormat, SWS_BILINEAR, NULL, NULL, NULL);
if(!sws) {
fprintf(stderr, "Failed to get %s -> %s\n", av_get_pix_fmt_name(dstFormat), av_get_pix_fmt_name(inFormat) );
goto end;
}
res = sws_scale(sws, (const uint8_t *const *)dst, dstStride, 0, h, rgbOut, rgbStride);
if (res < 0 || res != h) {
fprintf(stderr, "sws_scale failed\n");
res = -1;
goto end;
}
sws_freeContext(sws);
sws = NULL;
minimum = FLT_MAX;
maximum = -FLT_MAX;
count = 0;
sum = 0.0;
for (p = 0; p < 4; p++) {
if (!rgbStride[p])
continue;
for (y = 0; y < h; y++) {
in = (uint32_t*)(rgbIn[p] + y * rgbStride[p]);
out = (uint32_t*)(rgbOut[p] + y * rgbStride[p]);
for (x = 0; x < w; x++) {
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
v0.i = AV_RB32(in);
v1.i = AV_RB32(out);
} else {
v0.i = AV_RL32(in);
v1.i = AV_RL32(out);
}
diff = fabsf(v0.f - v1.f);
sum += diff;
minimum = FFMIN(minimum, diff);
maximum = FFMAX(maximum, diff);
count++;
in++;
out++;
}
}
}
fprintf(stdout, "%s -> %s -> %s\n", av_get_pix_fmt_name(inFormat), av_get_pix_fmt_name(dstFormat), av_get_pix_fmt_name(inFormat) );
fprintf(stdout, "avg diff: %f\nmin diff: %f\nmax diff: %f\n", sum / count, minimum, maximum);
res = 0;
}
end:
sws_freeContext(sws);
for (p = 0; p < 4; p++) {
av_freep(&rgbIn[p]);
av_freep(&rgbOut[p]);
av_freep(&dst[p]);
}
if (fp)
fclose(fp);
return res;
}


@@ -0,0 +1,91 @@
/*
* Copyright (c) 2017 Clément Bœsch <u pkh me>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libavutil/mem.h"
#include "libswscale/swscale_internal.h"
static const struct {
const char *class;
int (*cond)(enum AVPixelFormat pix_fmt);
} query_tab[] = {
{"is16BPS", is16BPS},
{"isNBPS", isNBPS},
{"isBE", isBE},
{"isYUV", isYUV},
{"isPlanarYUV", isPlanarYUV},
{"isSemiPlanarYUV", isSemiPlanarYUV},
{"isRGB", isRGB},
{"Gray", isGray},
{"RGBinInt", isRGBinInt},
{"BGRinInt", isBGRinInt},
{"Bayer", isBayer},
{"AnyRGB", isAnyRGB},
{"ALPHA", isALPHA},
{"Packed", isPacked},
{"Planar", isPlanar},
{"PackedRGB", isPackedRGB},
{"PlanarRGB", isPlanarRGB},
{"usePal", usePal},
};
static int cmp_str(const void *a, const void *b)
{
const char *s1 = *(const char **)a;
const char *s2 = *(const char **)b;
return strcmp(s1, s2);
}
int main(void)
{
int i, j;
for (i = 0; i < FF_ARRAY_ELEMS(query_tab); i++) {
const char **pix_fmts = NULL;
int nb_pix_fmts = 0;
const AVPixFmtDescriptor *pix_desc = NULL;
while ((pix_desc = av_pix_fmt_desc_next(pix_desc))) {
enum AVPixelFormat pix_fmt = av_pix_fmt_desc_get_id(pix_desc);
if (query_tab[i].cond(pix_fmt)) {
const char *pix_name = pix_desc->name;
if (pix_fmt == AV_PIX_FMT_RGB32) pix_name = "rgb32";
else if (pix_fmt == AV_PIX_FMT_RGB32_1) pix_name = "rgb32_1";
else if (pix_fmt == AV_PIX_FMT_BGR32) pix_name = "bgr32";
else if (pix_fmt == AV_PIX_FMT_BGR32_1) pix_name = "bgr32_1";
av_dynarray_add(&pix_fmts, &nb_pix_fmts, (void *)pix_name);
}
}
if (pix_fmts) {
qsort(pix_fmts, nb_pix_fmts, sizeof(*pix_fmts), cmp_str);
printf("%s:\n", query_tab[i].class);
for (j = 0; j < nb_pix_fmts; j++)
printf(" %s\n", pix_fmts[j]);
printf("\n");
av_free(pix_fmts);
}
}
return 0;
}


@@ -0,0 +1,444 @@
/*
* Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <stdarg.h>
#undef HAVE_AV_CONFIG_H
#include "libavutil/cpu.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/avutil.h"
#include "libavutil/crc.h"
#include "libavutil/pixdesc.h"
#include "libavutil/lfg.h"
#include "libswscale/swscale.h"
/* HACK Duplicated from swscale_internal.h.
* Should be removed when a cleaner pixel format system exists. */
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_YA8 || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE || \
(x) == AV_PIX_FMT_YA16BE || \
(x) == AV_PIX_FMT_YA16LE)
#define hasChroma(x) \
(!(isGray(x) || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE))
#define isALPHA(x) \
((x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_YUVA420P)
static uint64_t getSSD(const uint8_t *src1, const uint8_t *src2,
int stride1, int stride2, int w, int h)
{
int x, y;
uint64_t ssd = 0;
for (y = 0; y < h; y++) {
for (x = 0; x < w; x++) {
int d = src1[x + y * stride1] - src2[x + y * stride2];
ssd += d * d;
}
}
return ssd;
}
struct Results {
uint64_t ssdY;
uint64_t ssdU;
uint64_t ssdV;
uint64_t ssdA;
uint32_t crc;
};
// test by ref -> src -> dst -> out & compare out against ref
// ref & out are YV12
static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat, enum AVPixelFormat dstFormat,
int srcW, int srcH, int dstW, int dstH, int flags,
struct Results *r)
{
const AVPixFmtDescriptor *desc_yuva420p = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
static enum AVPixelFormat cur_srcFormat;
static int cur_srcW, cur_srcH;
static const uint8_t *src[4];
static int srcStride[4];
uint8_t *dst[4] = { 0 };
uint8_t *out[4] = { 0 };
int dstStride[4] = {0};
int i;
uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
struct SwsContext *dstContext = NULL, *outContext = NULL;
uint32_t crc = 0;
int res = 0;
if (cur_srcFormat != srcFormat || cur_srcW != srcW || cur_srcH != srcH) {
struct SwsContext *srcContext = NULL;
int p;
for (p = 0; p < 4; p++)
av_freep(&src[p]);
res = av_image_fill_linesizes(srcStride, srcFormat, srcW);
if (res < 0) {
fprintf(stderr, "av_image_fill_linesizes failed\n");
goto end;
}
for (p = 0; p < 4; p++) {
srcStride[p] = FFALIGN(srcStride[p], 16);
if (srcStride[p])
src[p] = av_mallocz(srcStride[p] * srcH + 16);
if (srcStride[p] && !src[p]) {
perror("Malloc");
res = -1;
goto end;
}
}
srcContext = sws_getContext(w, h, AV_PIX_FMT_YUVA420P, srcW, srcH,
srcFormat, SWS_BILINEAR, NULL, NULL, NULL);
if (!srcContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_yuva420p->name,
desc_src->name);
res = -1;
goto end;
}
sws_scale(srcContext, ref, refStride, 0, h,
(uint8_t * const *) src, srcStride);
sws_freeContext(srcContext);
cur_srcFormat = srcFormat;
cur_srcW = srcW;
cur_srcH = srcH;
}
res = av_image_fill_linesizes(dstStride, dstFormat, dstW);
if (res < 0) {
fprintf(stderr, "av_image_fill_linesizes failed\n");
goto end;
}
for (i = 0; i < 4; i++) {
/* Image buffers passed into libswscale can be allocated any way you
* prefer, as long as they're aligned enough for the architecture, and
* they're freed appropriately (such as using av_free for buffers
* allocated with av_malloc). */
/* An extra 16 bytes is being allocated because some scalers may write
* out of bounds. */
dstStride[i] = FFALIGN(dstStride[i], 16);
if (dstStride[i])
dst[i] = av_mallocz(dstStride[i] * dstH + 16);
if (dstStride[i] && !dst[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
dstContext = sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat,
flags, NULL, NULL, NULL);
if (!dstContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_src->name, desc_dst->name);
res = -1;
goto end;
}
printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
desc_src->name, srcW, srcH,
desc_dst->name, dstW, dstH,
flags);
fflush(stdout);
sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
for (i = 0; i < 4 && dstStride[i]; i++)
crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
dstStride[i] * dstH);
if (r && crc == r->crc) {
ssdY = r->ssdY;
ssdU = r->ssdU;
ssdV = r->ssdV;
ssdA = r->ssdA;
} else {
for (i = 0; i < 4; i++) {
refStride[i] = FFALIGN(refStride[i], 16);
if (refStride[i])
out[i] = av_mallocz(refStride[i] * h);
if (refStride[i] && !out[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
outContext = sws_getContext(dstW, dstH, dstFormat, w, h,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR,
NULL, NULL, NULL);
if (!outContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_dst->name,
desc_yuva420p->name);
res = -1;
goto end;
}
sws_scale(outContext, (const uint8_t * const *) dst, dstStride, 0, dstH,
out, refStride);
ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
if (hasChroma(srcFormat) && hasChroma(dstFormat)) {
//FIXME check that output is really gray
ssdU = getSSD(ref[1], out[1], refStride[1], refStride[1],
(w + 1) >> 1, (h + 1) >> 1);
ssdV = getSSD(ref[2], out[2], refStride[2], refStride[2],
(w + 1) >> 1, (h + 1) >> 1);
}
if (isALPHA(srcFormat) && isALPHA(dstFormat))
ssdA = getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
ssdY /= w * h;
ssdU /= w * h / 4;
ssdV /= w * h / 4;
ssdA /= w * h;
sws_freeContext(outContext);
for (i = 0; i < 4; i++)
if (refStride[i])
av_free(out[i]);
}
printf(" CRC=%08x SSD=%5"PRId64 ",%5"PRId64 ",%5"PRId64 ",%5"PRId64 "\n",
crc, ssdY, ssdU, ssdV, ssdA);
end:
sws_freeContext(dstContext);
for (i = 0; i < 4; i++)
if (dstStride[i])
av_free(dst[i]);
return !!res;
}
static void selfTest(const uint8_t * const ref[4], int refStride[4],
int w, int h,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
const int flags[] = { SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC,
SWS_X, SWS_POINT, SWS_AREA, 0 };
const int srcW = w;
const int srcH = h;
const int dstW[] = { srcW - srcW / 3, srcW, srcW + srcW / 3, 0 };
const int dstH[] = { srcH - srcH / 3, srcH, srcH + srcH / 3, 0 };
enum AVPixelFormat srcFormat, dstFormat;
const AVPixFmtDescriptor *desc_src, *desc_dst;
for (srcFormat = srcFormat_in != AV_PIX_FMT_NONE ? srcFormat_in : 0;
srcFormat < AV_PIX_FMT_NB; srcFormat++) {
if (!sws_isSupportedInput(srcFormat) ||
!sws_isSupportedOutput(srcFormat))
continue;
desc_src = av_pix_fmt_desc_get(srcFormat);
for (dstFormat = dstFormat_in != AV_PIX_FMT_NONE ? dstFormat_in : 0;
dstFormat < AV_PIX_FMT_NB; dstFormat++) {
int i, j, k;
int res = 0;
if (!sws_isSupportedInput(dstFormat) ||
!sws_isSupportedOutput(dstFormat))
continue;
desc_dst = av_pix_fmt_desc_get(dstFormat);
printf("%s -> %s\n", desc_src->name, desc_dst->name);
fflush(stdout);
for (k = 0; flags[k] && !res; k++)
for (i = 0; dstW[i] && !res; i++)
for (j = 0; dstH[j] && !res; j++)
res = doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW[i], dstH[j], flags[k],
NULL);
if (dstFormat_in != AV_PIX_FMT_NONE)
break;
}
if (srcFormat_in != AV_PIX_FMT_NONE)
break;
}
}
static int fileTest(const uint8_t * const ref[4], int refStride[4],
int w, int h, FILE *fp,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
char buf[256];
while (fgets(buf, sizeof(buf), fp)) {
struct Results r;
enum AVPixelFormat srcFormat;
char srcStr[21];
int srcW = 0, srcH = 0;
enum AVPixelFormat dstFormat;
char dstStr[21];
int dstW = 0, dstH = 0;
int flags;
int ret;
ret = sscanf(buf,
" %20s %dx%d -> %20s %dx%d flags=%d CRC=%x"
" SSD=%"SCNu64 ", %"SCNu64 ", %"SCNu64 ", %"SCNu64 "\n",
srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
&flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
if (ret != 12) {
srcStr[0] = dstStr[0] = 0;
ret = sscanf(buf, "%20s -> %20s\n", srcStr, dstStr);
}
srcFormat = av_get_pix_fmt(srcStr);
dstFormat = av_get_pix_fmt(dstStr);
if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE ||
srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) {
fprintf(stderr, "malformed input file\n");
return -1;
}
if ((srcFormat_in != AV_PIX_FMT_NONE && srcFormat_in != srcFormat) ||
(dstFormat_in != AV_PIX_FMT_NONE && dstFormat_in != dstFormat))
continue;
if (ret != 12) {
printf("%s", buf);
continue;
}
doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW, dstH, flags,
&r);
}
return 0;
}
#define W 96
#define H 96
int main(int argc, char **argv)
{
enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
uint8_t *rgb_data = av_malloc(W * H * 4);
const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
int rgb_stride[4] = { 4 * W, 0, 0, 0 };
uint8_t *data = av_malloc(4 * W * H);
const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
int stride[4] = { W, W, W, W };
int x, y;
struct SwsContext *sws;
AVLFG rand;
int res = -1;
int i;
FILE *fp = NULL;
if (!rgb_data || !data)
return -1;
for (i = 1; i < argc; i += 2) {
if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
fp = fopen(argv[i + 1], "r");
if (!fp) {
fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
goto error;
}
} else if (!strcmp(argv[i], "-cpuflags")) {
unsigned flags = av_get_cpu_flags();
int ret = av_parse_cpu_caps(&flags, argv[i + 1]);
if (ret < 0) {
fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]);
return ret;
}
av_force_cpu_flags(flags);
} else if (!strcmp(argv[i], "-src")) {
srcFormat = av_get_pix_fmt(argv[i + 1]);
if (srcFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else if (!strcmp(argv[i], "-dst")) {
dstFormat = av_get_pix_fmt(argv[i + 1]);
if (dstFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else {
bad_option:
fprintf(stderr, "bad option or argument missing (%s)\n", argv[i]);
goto error;
}
}
sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
av_lfg_init(&rand, 1);
for (y = 0; y < H; y++)
for (x = 0; x < W * 4; x++)
rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride);
if (res < 0 || res != H) {
res = -1;
goto error;
}
sws_freeContext(sws);
av_free(rgb_data);
if(fp) {
res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
fclose(fp);
} else {
selfTest(src, stride, W, H, srcFormat, dstFormat);
res = 0;
}
error:
av_free(data);
return res;
}

2456
externals/ffmpeg/ffmpeg/libswscale/utils.c vendored Executable file

File diff suppressed because it is too large

53
externals/ffmpeg/ffmpeg/libswscale/version.h vendored Executable file

@@ -0,0 +1,53 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_VERSION_H
#define SWSCALE_VERSION_H
/**
* @file
* swscale version macros
*/
#include "libavutil/version.h"
#define LIBSWSCALE_VERSION_MAJOR 5
#define LIBSWSCALE_VERSION_MINOR 9
#define LIBSWSCALE_VERSION_MICRO 100
#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT
#define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*/
#ifndef FF_API_SWS_VECTOR
#define FF_API_SWS_VECTOR (LIBSWSCALE_VERSION_MAJOR < 6)
#endif
#endif /* SWSCALE_VERSION_H */

322
externals/ffmpeg/ffmpeg/libswscale/vscale.c vendored Executable file

@@ -0,0 +1,322 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
typedef struct VScalerContext
{
uint16_t *filter[2];
int32_t *filter_pos;
int filter_size;
int isMMX;
union {
yuv2planar1_fn yuv2planar1;
yuv2planarX_fn yuv2planarX;
yuv2interleavedX_fn yuv2interleavedX;
yuv2packed1_fn yuv2packed1;
yuv2packed2_fn yuv2packed2;
yuv2anyX_fn yuv2anyX;
} pfn;
yuv2packedX_fn yuv2packedX;
} VScalerContext;
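/* Each vertical scaler below gathers the filter_size input lines around
 * filter_pos[] for one output line and feeds them to one of the pfn
 * callbacks; which callback applies is decided in ff_init_vscale_pfn()
 * from the filter sizes and the output format. */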
static int lum_planar_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int first = FFMAX(1-inst->filter_size, inst->filter_pos[sliceY]);
int sp = first - desc->src->plane[0].sliceY;
int dp = sliceY - desc->dst->plane[0].sliceY;
uint8_t **src = desc->src->plane[0].line + sp;
uint8_t **dst = desc->dst->plane[0].line + dp;
uint16_t *filter = inst->filter[0] + (inst->isMMX ? 0 : sliceY * inst->filter_size);
if (inst->filter_size == 1)
inst->pfn.yuv2planar1((const int16_t*)src[0], dst[0], dstW, c->lumDither8, 0);
else
inst->pfn.yuv2planarX(filter, inst->filter_size, (const int16_t**)src, dst[0], dstW, c->lumDither8, 0);
if (desc->alpha) {
int sp = first - desc->src->plane[3].sliceY;
int dp = sliceY - desc->dst->plane[3].sliceY;
uint8_t **src = desc->src->plane[3].line + sp;
uint8_t **dst = desc->dst->plane[3].line + dp;
uint16_t *filter = inst->filter[1] + (inst->isMMX ? 0 : sliceY * inst->filter_size);
if (inst->filter_size == 1)
inst->pfn.yuv2planar1((const int16_t*)src[0], dst[0], dstW, c->lumDither8, 0);
else
inst->pfn.yuv2planarX(filter, inst->filter_size, (const int16_t**)src, dst[0], dstW, c->lumDither8, 0);
}
return 1;
}
static int chr_planar_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
const int chrSkipMask = (1 << desc->dst->v_chr_sub_sample) - 1;
if (sliceY & chrSkipMask)
return 0;
else {
VScalerContext *inst = desc->instance;
int dstW = AV_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
int first = FFMAX(1-inst->filter_size, inst->filter_pos[chrSliceY]);
int sp1 = first - desc->src->plane[1].sliceY;
int sp2 = first - desc->src->plane[2].sliceY;
int dp1 = chrSliceY - desc->dst->plane[1].sliceY;
int dp2 = chrSliceY - desc->dst->plane[2].sliceY;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **dst1 = desc->dst->plane[1].line + dp1;
uint8_t **dst2 = desc->dst->plane[2].line + dp2;
uint16_t *filter = inst->filter[0] + (inst->isMMX ? 0 : chrSliceY * inst->filter_size);
if (c->yuv2nv12cX) {
inst->pfn.yuv2interleavedX(c->dstFormat, c->chrDither8, filter, inst->filter_size, (const int16_t**)src1, (const int16_t**)src2, dst1[0], dstW);
} else if (inst->filter_size == 1) {
inst->pfn.yuv2planar1((const int16_t*)src1[0], dst1[0], dstW, c->chrDither8, 0);
inst->pfn.yuv2planar1((const int16_t*)src2[0], dst2[0], dstW, c->chrDither8, 3);
} else {
inst->pfn.yuv2planarX(filter, inst->filter_size, (const int16_t**)src1, dst1[0], dstW, c->chrDither8, 0);
inst->pfn.yuv2planarX(filter, inst->filter_size, (const int16_t**)src2, dst2[0], dstW, c->chrDither8, inst->isMMX ? (c->uv_offx2 >> 1) : 3);
}
}
return 1;
}
static int packed_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
int lum_fsize = inst[0].filter_size;
int chr_fsize = inst[1].filter_size;
uint16_t *lum_filter = inst[0].filter[0];
uint16_t *chr_filter = inst[1].filter[0];
int firstLum = FFMAX(1-lum_fsize, inst[0].filter_pos[ sliceY]);
int firstChr = FFMAX(1-chr_fsize, inst[1].filter_pos[chrSliceY]);
int sp0 = firstLum - desc->src->plane[0].sliceY;
int sp1 = firstChr - desc->src->plane[1].sliceY;
int sp2 = firstChr - desc->src->plane[2].sliceY;
int sp3 = firstLum - desc->src->plane[3].sliceY;
int dp = sliceY - desc->dst->plane[0].sliceY;
uint8_t **src0 = desc->src->plane[0].line + sp0;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **src3 = desc->alpha ? desc->src->plane[3].line + sp3 : NULL;
uint8_t **dst = desc->dst->plane[0].line + dp;
if (c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 1) { // unscaled RGB
inst->pfn.yuv2packed1(c, (const int16_t*)*src0, (const int16_t**)src1, (const int16_t**)src2,
(const int16_t*)(desc->alpha ? *src3 : NULL), *dst, dstW, 0, sliceY);
} else if (c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 2 &&
chr_filter[2 * chrSliceY + 1] + chr_filter[2 * chrSliceY] == 4096 &&
chr_filter[2 * chrSliceY + 1] <= 4096U) { // unscaled RGB
int chrAlpha = chr_filter[2 * chrSliceY + 1];
inst->pfn.yuv2packed1(c, (const int16_t*)*src0, (const int16_t**)src1, (const int16_t**)src2,
(const int16_t*)(desc->alpha ? *src3 : NULL), *dst, dstW, chrAlpha, sliceY);
} else if (c->yuv2packed2 && lum_fsize == 2 && chr_fsize == 2 &&
lum_filter[2 * sliceY + 1] + lum_filter[2 * sliceY] == 4096 &&
lum_filter[2 * sliceY + 1] <= 4096U &&
chr_filter[2 * chrSliceY + 1] + chr_filter[2 * chrSliceY] == 4096 &&
chr_filter[2 * chrSliceY + 1] <= 4096U
) { // bilinear upscale RGB
int lumAlpha = lum_filter[2 * sliceY + 1];
int chrAlpha = chr_filter[2 * chrSliceY + 1];
c->lumMmxFilter[2] =
c->lumMmxFilter[3] = lum_filter[2 * sliceY] * 0x10001;
c->chrMmxFilter[2] =
c->chrMmxFilter[3] = chr_filter[2 * chrSliceY] * 0x10001;
inst->pfn.yuv2packed2(c, (const int16_t**)src0, (const int16_t**)src1, (const int16_t**)src2, (const int16_t**)src3,
*dst, dstW, lumAlpha, chrAlpha, sliceY);
} else { // general RGB
if ((c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 2) ||
(c->yuv2packed2 && lum_fsize == 2 && chr_fsize == 2)) {
if (!c->warned_unuseable_bilinear)
av_log(c, AV_LOG_INFO, "Optimized 2 tap filter code cannot be used\n");
c->warned_unuseable_bilinear = 1;
}
inst->yuv2packedX(c, lum_filter + sliceY * lum_fsize,
(const int16_t**)src0, lum_fsize, chr_filter + chrSliceY * chr_fsize,
(const int16_t**)src1, (const int16_t**)src2, chr_fsize, (const int16_t**)src3, *dst, dstW, sliceY);
}
return 1;
}
static int any_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
int lum_fsize = inst[0].filter_size;
int chr_fsize = inst[1].filter_size;
uint16_t *lum_filter = inst[0].filter[0];
uint16_t *chr_filter = inst[1].filter[0];
int firstLum = FFMAX(1-lum_fsize, inst[0].filter_pos[ sliceY]);
int firstChr = FFMAX(1-chr_fsize, inst[1].filter_pos[chrSliceY]);
int sp0 = firstLum - desc->src->plane[0].sliceY;
int sp1 = firstChr - desc->src->plane[1].sliceY;
int sp2 = firstChr - desc->src->plane[2].sliceY;
int sp3 = firstLum - desc->src->plane[3].sliceY;
int dp0 = sliceY - desc->dst->plane[0].sliceY;
int dp1 = chrSliceY - desc->dst->plane[1].sliceY;
int dp2 = chrSliceY - desc->dst->plane[2].sliceY;
int dp3 = sliceY - desc->dst->plane[3].sliceY;
uint8_t **src0 = desc->src->plane[0].line + sp0;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **src3 = desc->alpha ? desc->src->plane[3].line + sp3 : NULL;
uint8_t *dst[4] = { desc->dst->plane[0].line[dp0],
desc->dst->plane[1].line[dp1],
desc->dst->plane[2].line[dp2],
desc->alpha ? desc->dst->plane[3].line[dp3] : NULL };
av_assert1(!c->yuv2packed1 && !c->yuv2packed2);
inst->pfn.yuv2anyX(c, lum_filter + sliceY * lum_fsize,
(const int16_t**)src0, lum_fsize, chr_filter + sliceY * chr_fsize,
(const int16_t**)src1, (const int16_t**)src2, chr_fsize, (const int16_t**)src3, dst, dstW, sliceY);
return 1;
}
int ff_init_vscale(SwsContext *c, SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst)
{
VScalerContext *lumCtx = NULL;
VScalerContext *chrCtx = NULL;
if (isPlanarYUV(c->dstFormat) || (isGray(c->dstFormat) && !isALPHA(c->dstFormat))) {
lumCtx = av_mallocz(sizeof(VScalerContext));
if (!lumCtx)
return AVERROR(ENOMEM);
desc[0].process = lum_planar_vscale;
desc[0].instance = lumCtx;
desc[0].src = src;
desc[0].dst = dst;
desc[0].alpha = c->needAlpha;
if (!isGray(c->dstFormat)) {
chrCtx = av_mallocz(sizeof(VScalerContext));
if (!chrCtx)
return AVERROR(ENOMEM);
desc[1].process = chr_planar_vscale;
desc[1].instance = chrCtx;
desc[1].src = src;
desc[1].dst = dst;
}
} else {
lumCtx = av_mallocz_array(sizeof(VScalerContext), 2);
if (!lumCtx)
return AVERROR(ENOMEM);
chrCtx = &lumCtx[1];
desc[0].process = c->yuv2packedX ? packed_vscale : any_vscale;
desc[0].instance = lumCtx;
desc[0].src = src;
desc[0].dst = dst;
desc[0].alpha = c->needAlpha;
}
ff_init_vscale_pfn(c, c->yuv2plane1, c->yuv2planeX, c->yuv2nv12cX,
c->yuv2packed1, c->yuv2packed2, c->yuv2packedX, c->yuv2anyX, c->use_mmx_vfilter);
return 0;
}
void ff_init_vscale_pfn(SwsContext *c,
yuv2planar1_fn yuv2plane1,
yuv2planarX_fn yuv2planeX,
yuv2interleavedX_fn yuv2nv12cX,
yuv2packed1_fn yuv2packed1,
yuv2packed2_fn yuv2packed2,
yuv2packedX_fn yuv2packedX,
yuv2anyX_fn yuv2anyX, int use_mmx)
{
VScalerContext *lumCtx = NULL;
VScalerContext *chrCtx = NULL;
int idx = c->numDesc - (c->is_internal_gamma ? 2 : 1); //FIXME avoid hardcoding indexes
if (isPlanarYUV(c->dstFormat) || (isGray(c->dstFormat) && !isALPHA(c->dstFormat))) {
if (!isGray(c->dstFormat)) {
chrCtx = c->desc[idx].instance;
chrCtx->filter[0] = use_mmx ? (int16_t*)c->chrMmxFilter : c->vChrFilter;
chrCtx->filter_size = c->vChrFilterSize;
chrCtx->filter_pos = c->vChrFilterPos;
chrCtx->isMMX = use_mmx;
--idx;
if (yuv2nv12cX) chrCtx->pfn.yuv2interleavedX = yuv2nv12cX;
else if (c->vChrFilterSize == 1) chrCtx->pfn.yuv2planar1 = yuv2plane1;
else chrCtx->pfn.yuv2planarX = yuv2planeX;
}
lumCtx = c->desc[idx].instance;
lumCtx->filter[0] = use_mmx ? (int16_t*)c->lumMmxFilter : c->vLumFilter;
lumCtx->filter[1] = use_mmx ? (int16_t*)c->alpMmxFilter : c->vLumFilter;
lumCtx->filter_size = c->vLumFilterSize;
lumCtx->filter_pos = c->vLumFilterPos;
lumCtx->isMMX = use_mmx;
if (c->vLumFilterSize == 1) lumCtx->pfn.yuv2planar1 = yuv2plane1;
else lumCtx->pfn.yuv2planarX = yuv2planeX;
} else {
lumCtx = c->desc[idx].instance;
chrCtx = &lumCtx[1];
lumCtx->filter[0] = c->vLumFilter;
lumCtx->filter_size = c->vLumFilterSize;
lumCtx->filter_pos = c->vLumFilterPos;
chrCtx->filter[0] = c->vChrFilter;
chrCtx->filter_size = c->vChrFilterSize;
chrCtx->filter_pos = c->vChrFilterPos;
lumCtx->isMMX = use_mmx;
chrCtx->isMMX = use_mmx;
if (yuv2packedX) {
if (c->yuv2packed1 && c->vLumFilterSize == 1 && c->vChrFilterSize <= 2)
lumCtx->pfn.yuv2packed1 = yuv2packed1;
else if (c->yuv2packed2 && c->vLumFilterSize == 2 && c->vChrFilterSize == 2)
lumCtx->pfn.yuv2packed2 = yuv2packed2;
lumCtx->yuv2packedX = yuv2packedX;
} else
lumCtx->pfn.yuv2anyX = yuv2anyX;
}
}


@@ -0,0 +1,16 @@
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
OBJS += x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
MMX-OBJS += x86/hscale_fast_bilinear_simd.o \
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
X86ASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \
x86/rgb_2_rgb.o \
x86/yuv_2_rgb.o \
x86/yuv2yuvX.o \


@@ -0,0 +1,360 @@
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"
#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"
#if HAVE_INLINE_ASM
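/* Note: when called with filterCode == NULL, ff_init_hscaler_mmxext() only
 * computes and returns the required code size (including the trailing RET),
 * so a first pass can size the buffer and a second pass generates the code. */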
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
int16_t *filter, int32_t *filterPos,
int numSplits)
{
uint8_t *fragmentA;
x86_reg imm8OfPShufW1A;
x86_reg imm8OfPShufW2A;
x86_reg fragmentLengthA;
uint8_t *fragmentB;
x86_reg imm8OfPShufW1B;
x86_reg imm8OfPShufW2B;
x86_reg fragmentLengthB;
int fragmentPos;
int xpos, i;
// create an optimized horizontal scaling routine
/* This scaler is made of runtime-generated MMXEXT code using specially tuned
* pshufw instructions. For every four output pixels, if four input pixels
* are enough for the fast bilinear scaling, then a chunk of fragmentB is
* used. If five input pixels are needed, then a chunk of fragmentA is used.
*/
// code fragment
__asm__ volatile (
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
"movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
"movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"
"add $8, %%"FF_REG_a" \n\t"
// End
"9: \n\t"
"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
"dec %1 \n\t"
"dec %2 \n\t"
"sub %0, %1 \n\t"
"sub %0, %2 \n\t"
"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
"sub %0, %3 \n\t"
: "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
"=r" (fragmentLengthA)
);
__asm__ volatile (
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
"movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm0, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"
"add $8, %%"FF_REG_a" \n\t"
// End
"9: \n\t"
"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
"dec %1 \n\t"
"dec %2 \n\t"
"sub %0, %1 \n\t"
"sub %0, %2 \n\t"
"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
"sub %0, %3 \n\t"
: "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
"=r" (fragmentLengthB)
);
xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
fragmentPos = 0;
for (i = 0; i < dstW / numSplits; i++) {
int xx = xpos >> 16;
if ((i & 3) == 0) {
int a = 0;
int b = ((xpos + xInc) >> 16) - xx;
int c = ((xpos + xInc * 2) >> 16) - xx;
int d = ((xpos + xInc * 3) >> 16) - xx;
int inc = (d + 1 < 4);
uint8_t *fragment = inc ? fragmentB : fragmentA;
x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
int maxShift = 3 - (d + inc);
int shift = 0;
if (filterCode) {
filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
filterPos[i / 2] = xx;
memcpy(filterCode + fragmentPos, fragment, fragmentLength);
filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
((b + inc) << 2) |
((c + inc) << 4) |
((d + inc) << 6);
filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
(c << 4) |
(d << 6);
if (i + 4 - inc >= dstW)
shift = maxShift; // avoid overread
else if ((filterPos[i / 2] & 3) <= maxShift)
shift = filterPos[i / 2] & 3; // align
if (shift && i >= shift) {
filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
filterPos[i / 2] -= shift;
}
}
fragmentPos += fragmentLength;
if (filterCode)
filterCode[fragmentPos] = RET;
}
xpos += xInc;
}
if (filterCode)
filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
return fragmentPos + 1;
}
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
int srcW, int xInc)
{
int32_t *filterPos = c->hLumFilterPos;
int16_t *filter = c->hLumFilter;
void *mmxextFilterCode = c->lumMmxextFilterCode;
int i;
#if ARCH_X86_64
uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
uint64_t ebxsave;
#endif
#endif
__asm__ volatile(
#if ARCH_X86_64
"mov -8(%%rsp), %%"FF_REG_a" \n\t"
"mov %%"FF_REG_a", %5 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
"mov %%"FF_REG_b", %5 \n\t" // ebxsave
#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"FF_REG_c" \n\t"
"mov %1, %%"FF_REG_D" \n\t"
"mov %2, %%"FF_REG_d" \n\t"
"mov %3, %%"FF_REG_b" \n\t"
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
PREFETCH" (%%"FF_REG_c") \n\t"
PREFETCH" 32(%%"FF_REG_c") \n\t"
PREFETCH" 64(%%"FF_REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
"movl (%%"FF_REG_b"), %%esi \n\t"\
"call *%4 \n\t"\
"movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"\
"add %%"FF_REG_S", %%"FF_REG_c" \n\t"\
"add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
#else
#define CALL_MMXEXT_FILTER_CODE \
"movl (%%"FF_REG_b"), %%esi \n\t"\
"call *%4 \n\t"\
"addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
"add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
#endif /* ARCH_X86_64 */
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
#if ARCH_X86_64
"mov %5, %%"FF_REG_a" \n\t"
"mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
"mov %5, %%"FF_REG_b" \n\t"
#endif
#endif
:: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
"m" (mmxextFilterCode)
#if ARCH_X86_64
,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
,"m" (ebxsave)
#endif
#endif
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
,"%"FF_REG_b
#endif
);
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
dst[i] = src[srcW-1]*128;
}
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
int32_t *filterPos = c->hChrFilterPos;
int16_t *filter = c->hChrFilter;
void *mmxextFilterCode = c->chrMmxextFilterCode;
int i;
#if ARCH_X86_64
DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
__asm__ volatile(
#if ARCH_X86_64
"mov -8(%%rsp), %%"FF_REG_a" \n\t"
"mov %%"FF_REG_a", %7 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
"mov %%"FF_REG_b", %7 \n\t" // ebxsave
#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"FF_REG_c" \n\t"
"mov %1, %%"FF_REG_D" \n\t"
"mov %2, %%"FF_REG_d" \n\t"
"mov %3, %%"FF_REG_b" \n\t"
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
PREFETCH" (%%"FF_REG_c") \n\t"
PREFETCH" 32(%%"FF_REG_c") \n\t"
PREFETCH" 64(%%"FF_REG_c") \n\t"
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
"mov %5, %%"FF_REG_c" \n\t" // src2
"mov %6, %%"FF_REG_D" \n\t" // dst2
PREFETCH" (%%"FF_REG_c") \n\t"
PREFETCH" 32(%%"FF_REG_c") \n\t"
PREFETCH" 64(%%"FF_REG_c") \n\t"
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
#if ARCH_X86_64
"mov %7, %%"FF_REG_a" \n\t"
"mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
"mov %7, %%"FF_REG_b" \n\t"
#endif
#endif
:: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
"m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
,"m" (ebxsave)
#endif
#endif
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
,"%"FF_REG_b
#endif
);
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
dst1[i] = src1[srcW-1]*128;
dst2[i] = src2[srcW-1]*128;
}
}
#endif //HAVE_INLINE_ASM


@@ -0,0 +1,740 @@
;******************************************************************************
;* x86-optimized input routines; does shuffling of packed
;* YUV formats into individual planes, and converts RGB
;* into YUV planes also.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
%define RY 0x20DE
%define GY 0x4087
%define BY 0x0C88
%define RU 0xECFF
%define GU 0xDAC8
%define BU 0x3838
%define RV 0x3838
%define GV 0xD0E3
%define BV 0xF6E4
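; The constants above are the limited-range BT.601 RGB->YUV coefficients in
; 1.15 fixed point, e.g. RY = 0x20DE = 8414 ~= 0.2567 * 32768
; (0.299 * 219 / 255); negative coefficients are stored as two's complement.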
rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15
rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15
%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
; rgba_Ycoeff_rb: times 4 dw RY, BY
; rgba_Ycoeff_br: times 4 dw BY, RY
; rgba_Ycoeff_ga: times 4 dw GY, 0
; rgba_Ycoeff_ag: times 4 dw 0, GY
; rgba_Ucoeff_rb: times 4 dw RU, BU
; rgba_Ucoeff_br: times 4 dw BU, RU
; rgba_Ucoeff_ga: times 4 dw GU, 0
; rgba_Ucoeff_ag: times 4 dw 0, GU
; rgba_Vcoeff_rb: times 4 dw RV, BV
; rgba_Vcoeff_br: times 4 dw BV, RV
; rgba_Vcoeff_ga: times 4 dw GV, 0
; rgba_Vcoeff_ag: times 4 dw 0, GV
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
SECTION .text
;-----------------------------------------------------------------------------
; RGB to Y/UV.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
%define coeff1 m5
%define coeff2 m6
%elif ARCH_X86_64
mova m8, [%2_Ycoeff_12x4]
mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
%else ; x86-32 && mmsize == 16
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
%endif ; x86-32/64 && mmsize == 8/16
%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
%else ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m10, [shuf_rgb_3x56]
%define shuf_rgb2 m10
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
pxor m7, m7
%endif ; !cpuflag(ssse3)
mova m4, [rgb_Yrnd]
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16 ; i.e. sse2
punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
%endmacro
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
mova m10, [%2_Vcoeff_12x4]
mova m11, [%2_Vcoeff_3x56]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [%2_Ucoeff_12x4]
%define coeffU2 [%2_Ucoeff_3x56]
%define coeffV1 [%2_Vcoeff_12x4]
%define coeffV2 [%2_Vcoeff_3x56]
%endif ; x86-32/64
%if ARCH_X86_64 && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m12, [shuf_rgb_3x56]
%define shuf_rgb2 m12
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
mova m6, [rgb_UVrnd]
%if notcpuflag(ssse3)
pxor m7, m7
%endif
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16
punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
%if cpuflag(ssse3)
pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
%if mmsize == 16
movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16 && !cpuflag(ssse3)
punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
psrad m0, 9
psrad m2, 9
psrad m1, 9
psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
mova [dstUq+wq], m0
mova [dstVq+wq], m2
add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB24_FUNCS 2
RGB24_TO_Y_FN %1, rgb
RGB24_TO_Y_FN %1, bgr, rgb
RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB24_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB24_FUNCS 10, 12
INIT_XMM ssse3
RGB24_FUNCS 11, 13
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
%endif
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
%else ; %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
sub wq, mmsize - 1
lea srcq, [srcq+wq*2]
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
paddd m0, m4 ; += rgb_Yrnd
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
sub wq, mmsize - 1
jz .end
add srcq, 2*mmsize - 2
add dstq, mmsize - 1
.loop2:
movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
paddd m0, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
psrad m0, 9
packssdw m0, m0 ; (word) { Y[0-7] }
movd [dstq+wq], m0
add wq, 2
jl .loop2
.end:
REP_RET
%endif ; %0 == 6
%endmacro
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
mova m10, [rgba_Vcoeff_%2%4]
mova m11, [rgba_Vcoeff_%3%5]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [rgba_Ucoeff_%2%4]
%define coeffU2 [rgba_Ucoeff_%3%5]
%define coeffV1 [rgba_Vcoeff_%2%4]
%define coeffV2 [rgba_Vcoeff_%3%5]
%endif ; x86-64/32
%if ARCH_X86_64 && %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
sub wq, mmsize - 1
add dstUq, wq
add dstVq, wq
lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
psrad m2, 9
psrad m4, 9
psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
mova [dstUq+wq], m0
mova [dstVq+wq], m2
add wq, mmsize
jl .loop
sub wq, mmsize - 1
jz .end
add srcq , 2*mmsize - 2
add dstUq, mmsize - 1
add dstVq, mmsize - 1
.loop2:
movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
psrad m0, 9
psrad m2, 9
packssdw m0, m0 ; (word) { U[0-7] }
packssdw m2, m2 ; (word) { V[0-7] }
movd [dstUq+wq], m0
movd [dstVq+wq], m2
add wq, 2
jl .loop2
.end:
REP_RET
%endif ; ARCH_X86_64 && %0 == 6
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB32_FUNCS 2
RGB32_TO_Y_FN %1, r, g, b, a
RGB32_TO_Y_FN %1, b, g, r, a, rgba
RGB32_TO_Y_FN %1, a, r, g, b, rgba
RGB32_TO_Y_FN %1, a, b, g, r, rgba
RGB32_TO_UV_FN %2, r, g, b, a
RGB32_TO_UV_FN %2, b, g, r, a, rgba
RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB32_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB32_FUNCS 8, 12
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
%endif
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
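;
; Memory layout reference (per the loop comments below):
;   yuyv: { Y0, U0, Y1, V0, Y2, U1, Y3, V1, ... }
;   uyvy: { U0, Y0, V0, Y1, U1, Y2, V1, Y3, ... }
;   nv12: { U0, V0, U1, V1, ... }    nv21: { V0, U0, V1, U1, ... }
; The ToY functions pick out every second byte; the ToUV functions
; deinterleave the chroma bytes into separate U and V planes.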
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
mova [dstq+wq], m0
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that the unaligned and aligned code in the
; loop will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop into aligned and unaligned cases
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
add dstq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_Y a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_Y u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_Y a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
packuswb m1, m0 ; (byte) { U0, ... U7, V0, ... V7 }
movh [dstUq+wq], m1
movhps [dstVq+wq], m1
%else ; mmsize == 8
packuswb m1, m1 ; (byte) { U0, ... U3 }
packuswb m0, m0 ; (byte) { V0, ... V3 }
movh [dstUq+wq], m1
movh [dstVq+wq], m0
%endif ; mmsize == 8/16
add wq, mmsize / 2
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that the unaligned and aligned code in the
; loop will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop into aligned and unaligned cases
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16 && %0 == 2
test srcq, 15
%endif
lea srcq, [srcq+wq*4]
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
packuswb m0, m1 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
mova [dstUq+wq], m2
mova [dstVq+wq], m0
%else ; nv21
mova [dstVq+wq], m2
mova [dstUq+wq], m0
%endif ; nv12/21
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
pcmpeqb m5, m5 ; (byte) { 0xff } x 16
psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_NVXX_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif
INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%endif


@@ -0,0 +1,551 @@
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;* (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start: times 4 dd 0x10000
yuv2yuvX_9_start: times 4 dd 0x20000
yuv2yuvX_10_upper: times 8 dw 0x3ff
yuv2yuvX_9_upper: times 8 dw 0x1ff
pd_4: times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pd_255: times 8 dd 255
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
yuv2nv12_shuffle_mask: times 2 db 0, 4, 8, 12, \
-1, -1, -1, -1, \
-1, -1, -1, -1, \
-1, -1, -1, -1
yuv2nv21_shuffle_mask: times 2 db 4, 0, 12, 8, \
-1, -1, -1, -1, \
-1, -1, -1, -1, \
-1, -1, -1, -1
yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7
SECTION .text
;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
; const int16_t **src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
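;
; A minimal C-style sketch of what yuv2planeX computes per output sample
; (illustration only; variable names are hypothetical):
;
;   int sum = dither_or_rounding_start;
;   for (j = 0; j < filterSize; j++)
;       sum += src[j][i] * filter[j];    // 12-bit coefficients
;   dst[i] = clip(sum >> shift);         // shift depends on output depth
;
; yuv2plane1 is the single-source-line special case: one rounding/dither
; add and a shift instead of the multiply-accumulate loop.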
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
; the rep here is for the 8-bit output MMX case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif
%rep %%repcnt
%if %1 == 8
%if ARCH_X86_32
mova m2, [rsp+mmsize*(0+%%i)]
mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
mova m2, m8
mova m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
mova m1, [yuv2yuvX_%1_start]
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
.filterloop_%2_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
mova m3, [r6+r5*4]
mova m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
mov r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
mova m4, [r6+r5*4]
mova m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16
; coefficients
movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
pshuflw m7, m0, 0 ; coeff[0]
pshuflw m0, m0, 0x55 ; coeff[1]
pmovsxwd m7, m7 ; word -> dword
pmovsxwd m0, m0 ; word -> dword
pmulld m3, m7
pmulld m5, m7
pmulld m4, m0
pmulld m6, m0
paddd m2, m3
paddd m1, m5
paddd m2, m4
paddd m1, m6
%else ; %1 == 10/9/8
punpcklwd m5, m3, m4
punpckhwd m3, m4
SPLATD m0
pmaddwd m5, m0
pmaddwd m3, m0
paddd m2, m5
paddd m1, m3
%endif ; %1 == 8/9/10/16
sub cntr_reg, 2
jg .filterloop_%2_ %+ %%i
%if %1 == 16
psrad m2, 31 - %1
psrad m1, 31 - %1
%else ; %1 == 10/9/8
psrad m2, 27 - %1
psrad m1, 27 - %1
%endif ; %1 == 8/9/10/16
%if %1 == 8
packssdw m2, m1
packuswb m2, m2
movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
packssdw m2, m1
paddw m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
packusdw m2, m1
%else ; mmxext/sse2
packssdw m2, m1
pmaxsw m2, m6
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
add r5, mmsize/2
sub wd, mmsize/2
%assign %%i %%i+2
%endrep
jg .pixelloop_%2
%endmacro
%macro yuv2planeX_fn 3
%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif
cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
pxor m6, m6
%endif ; %1 == 8/9/10
%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
SUB rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32
; create registers holding dither
movq m_dith, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m_dith, m_dith
%endif ; mmsize == 16
PALIGNR m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
punpcklbw m_dith, m6
%if ARCH_X86_64
punpcklwd m8, m_dith, m6
pslld m8, 12
%else ; x86-32
punpcklwd m5, m_dith, m6
pslld m5, 12
%endif ; x86-32/64
punpckhwd m_dith, m6
pslld m_dith, 12
%if ARCH_X86_32
mova [rsp+ 0], m5
mova [rsp+16], m_dith
%endif
%else ; mmsize == 8
punpcklbw m5, m_dith, m6
punpckhbw m_dith, m6
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpcklwd m3, m_dith, m6
punpckhwd m_dith, m6
pslld m4, 12
pslld m5, 12
pslld m3, 12
pslld m_dith, 12
mova [rsp+ 0], m4
mova [rsp+ 8], m5
mova [rsp+16], m3
mova [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8
xor r5, r5
%if mmsize == 8 || %1 == 8
yuv2planeX_mainloop %1, a
%else ; mmsize == 16
test dstq, 15
jnz .unaligned
yuv2planeX_mainloop %1, a
REP_RET
.unaligned:
yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16
%if %1 == 8
%if ARCH_X86_32
ADD rsp, pad
RET
%else ; x86-64
REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
REP_RET
%endif ; %1 == 8/9/10/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5
yuv2planeX_fn 10, 0, 5
%endif
INIT_XMM sse2
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
INIT_XMM sse4
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
%endif
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m3, [srcq+wq*2+mmsize*1]
psraw m0, 7
psraw m1, 7
packuswb m0, m1
mov%2 [dstq+wq], m0
%elif %1 == 16
paddd m0, m4, [srcq+wq*4+mmsize*0]
paddd m1, m4, [srcq+wq*4+mmsize*1]
paddd m2, m4, [srcq+wq*4+mmsize*2]
paddd m3, m4, [srcq+wq*4+mmsize*3]
psrad m0, 3
psrad m1, 3
psrad m2, 3
psrad m3, 3
%if cpuflag(sse4) ; avx/sse4
packusdw m0, m1
packusdw m2, m3
%else ; mmx/sse2
packssdw m0, m1
packssdw m2, m3
paddw m0, m5
paddw m2, m5
%endif ; mmx/sse2/sse4/avx
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m2, [srcq+wq*2+mmsize*1]
psraw m0, 15 - %1
psraw m1, 15 - %1
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m3
pminsw m1, m3
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m1
%endif
add wq, mmsize
jl .loop_%2
%endmacro
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
movsxdifnidn wq, wd
add wq, mmsize - 1
and wq, ~(mmsize - 1)
%if %1 == 8
add dstq, wq
%else ; %1 != 8
lea dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
lea srcq, [srcq+wq*4]
%else ; %1 != 16
lea srcq, [srcq+wq*2]
%endif ; %1 == 16
neg wq
%if %1 == 8
pxor m4, m4 ; zero
; create registers holding dither
movq m3, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m3, m3
%endif ; mmsize == 16
PALIGNR m3, m3, 3, m2
.no_rot:
%if mmsize == 8
mova m2, m3
punpckhbw m3, m4 ; byte->word
punpcklbw m2, m4 ; byte->word
%else
punpcklbw m3, m4
mova m2, m3
%endif
%elif %1 == 9
pxor m4, m4
mova m3, [pw_512]
mova m2, [pw_32]
%elif %1 == 10
pxor m4, m4
mova m3, [pw_1024]
mova m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
mova m4, [pd_4]
%else ; mmx/sse2
mova m4, [pd_4min0x40000]
mova m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..
; actual pixel scaling
%if mmsize == 8
yuv2plane1_mainloop %1, a
%else ; mmsize == 16
test dstq, 15
jnz .unaligned
yuv2plane1_mainloop %1, a
REP_RET
.unaligned:
yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
REP_RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3
INIT_MMX mmxext
yuv2plane1_fn 9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif
INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif
%undef movsx
;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
; const int16_t *filter, int filterSize,
; const int16_t **u, const int16_t **v,
; uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
; const int16_t *filter, int filterSize,
; const int16_t **u, const int16_t **v,
; uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------
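;
; Sketch of the operation (illustration only): for each group of output
; chroma bytes, vertically filter the U and V source lines with the
; 16-bit coefficients, add the pre-shifted dither, shift right by 19,
; clip to [0,255], and store the results interleaved as UV (nv12) or
; VU (nv21), 8 U/V pairs per outer-loop iteration.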
%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth
mov tmp1q, qword [ditherq]
movq xm0, tmp1q
ror tmp1q, 24
movq xm1, tmp1q
pmovzxbd m0, xm0
pslld m0, m0, 12 ; ditherLo
pmovzxbd m1, xm1
pslld m1, m1, 12 ; ditherHi
pxor m9, m9 ; uint8_min dwords
mova m10, [pd_255] ; uint8_max dwords
mova m11, [%1_shuffle_mask] ; shuffle_mask
mova m12, [yuv2nv12_permute_mask] ; permute mask
DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth
xor r8q, r8q
nv12_outer_%1:
mova m2, m0 ; resultLo
mova m3, m1 ; resultHi
xor r9q, r9q
nv12_inner_%1:
movsx r10d, word [filterq + (2 * r9q)]
movd xm4, r10d
vpbroadcastd m4, xm4 ; filter
mov tmp1q, [uq + (gprsize * r9q)]
mova xm7, oword [tmp1q + 2 * r8q]
mov tmp2q, [vq + (gprsize * r9q)]
mova xm8, oword [tmp2q + 2 * r8q]
punpcklwd xm5, xm7, xm8
pmovsxwd m5, xm5 ; multiplicandsLo
punpckhwd xm6, xm7, xm8
pmovsxwd m6, xm6 ; multiplicandsHi
pmulld m7, m5, m4 ; mulResultLo
pmulld m8, m6, m4 ; mulResultHi
paddd m2, m2, m7 ; resultLo += mulResultLo
paddd m3, m3, m8 ; resultHi += mulResultHi
inc r9d
cmp r9d, filterSized
jl nv12_inner_%1
; end of inner loop
psrad m2, m2, 19
psrad m3, m3, 19
; Vectorized av_clip_uint8
pmaxsd m2, m2, m9
pmaxsd m3, m3, m9
pminsd m2, m2, m10
pminsd m3, m3, m10
; At this point we have clamped uint8s arranged in this order:
; m2: u1 0 0 0 v1 0 0 0 [...]
; m3: u5 0 0 0 v5 0 0 0 [...]
;
; First, we shuffle the bytes to make the bytes semi-contiguous.
; AVX-2 doesn't have cross-lane shuffling, so we'll end up with:
; m2: u1 v1 u2 v2 0 0 0 0 0 0 0 0 u3 v3 u4 v4
; m3: u5 v5 u6 v6 0 0 0 0 0 0 0 0 u7 v7 u8 v8
pshufb m2, m2, m11
pshufb m3, m3, m11
; To fix the cross-lane shuffling issue, we'll then use cross-lane
; permutation to combine the two segments
vpermd m2, m12, m2
vpermd m3, m12, m3
; Now we have the final results in the lower 8 bytes of each register
movq [dstq], xm2
movq [dstq + 8], xm3
add r8d, 8
add dstq, 16
cmp r8d, dstWidthd
jl nv12_outer_%1
RET
%endmacro
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64


@@ -0,0 +1,194 @@
/*
* software RGB to RGB converter
* plus a software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libavutil/mem_internal.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_INLINE_ASM
DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
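/* For example, assuming RGB2YUV_SHIFT is 15 (as defined in
 * swscale_internal.h), RY evaluates to (int)(0.257*32768 + 0.5) = 8421;
 * the macros are 15-bit fixed-point approximations of the BT.601
 * limited-range RGB->YUV matrix coefficients. */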
// Note: We have C, MMX, MMXEXT and 3DNOW versions; there is no 3DNOW + MMXEXT one.
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0
//MMX versions
#undef RENAME
#define RENAME(a) a ## _mmx
#include "rgb2rgb_template.c"
// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "rgb2rgb_template.c"
//SSE2 versions
#undef RENAME
#undef COMPILE_TEMPLATE_SSE2
#define COMPILE_TEMPLATE_SSE2 1
#define RENAME(a) a ## _sse2
#include "rgb2rgb_template.c"
//AVX versions
#undef RENAME
#undef COMPILE_TEMPLATE_AVX
#define COMPILE_TEMPLATE_AVX 1
#define RENAME(a) a ## _avx
#include "rgb2rgb_template.c"
//3DNOW versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_AVX
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3dnow
#include "rgb2rgb_template.c"
/*
RGB15->RGB16 original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMXEXT, 3DNOW optimization by Nick Kurshev
32-bit C version, and the and&add trick by Michael Niedermayer
*/
#endif /* HAVE_INLINE_ASM */
void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
#if ARCH_X86_64
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
#endif
av_cold void rgb2rgb_init_x86(void)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags))
rgb2rgb_init_mmx();
if (INLINE_AMD3DNOW(cpu_flags))
rgb2rgb_init_3dnow();
if (INLINE_MMXEXT(cpu_flags))
rgb2rgb_init_mmxext();
if (INLINE_SSE2(cpu_flags))
rgb2rgb_init_sse2();
if (INLINE_AVX(cpu_flags))
rgb2rgb_init_avx();
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMXEXT(cpu_flags)) {
shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
#if ARCH_X86_64
uyvytoyuv422 = ff_uyvytoyuv422_sse2;
#endif
}
if (EXTERNAL_SSSE3(cpu_flags)) {
shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
#if ARCH_X86_64
uyvytoyuv422 = ff_uyvytoyuv422_avx;
#endif
}
}

File diff suppressed because it is too large


@@ -0,0 +1,300 @@
;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
SECTION .text
%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
psrldq %1, %2, %3
%else
mova %1, %2
RSHIFT %1, %3
%endif
%endmacro
;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
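;
; Reference behaviour (sketch, mirroring the scalar loop below): for each
; 4-byte group,
;   dst[0] = src[2], dst[1] = src[1], dst[2] = src[0], dst[3] = src[3]
; i.e. bytes 0 and 2 (typically the R and B channels of a 32-bit pixel)
; are swapped while bytes 1 and 3 stay in place.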
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
mova m6, [pb_mask_shuffle2103_mmx]
mova m7, m6
psllq m7, 8
movsxdifnidn wq, wd
mov xq, wq
add srcq, wq
add dstq, wq
neg wq
;calc scalar loop count
and xq, mmsize*2 -4
je .loop_simd
.loop_scalar:
mov tmpb, [srcq + wq + 2]
mov [dstq+wq + 0], tmpb
mov tmpb, [srcq + wq + 1]
mov [dstq+wq + 1], tmpb
mov tmpb, [srcq + wq + 0]
mov [dstq+wq + 2], tmpb
mov tmpb, [srcq + wq + 3]
mov [dstq+wq + 3], tmpb
add wq, 4
sub xq, 4
jg .loop_scalar
;check if src_size < mmsize * 2
cmp wq, 0
jge .end
.loop_simd:
movu m0, [srcq+wq]
movu m1, [srcq+wq+8]
pshufw m3, m0, 177
pshufw m5, m1, 177
pand m0, m7
pand m3, m6
pand m1, m7
pand m5, m6
por m0, m3
por m1, m5
movu [dstq+wq], m0
movu [dstq+wq + 8], m1
add wq, mmsize*2
jl .loop_simd
.end:
RET
;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-4 index shuffle
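; i.e. for each 4-byte group the function computes (as in the scalar
; loop below): dst[0] = src[%1], dst[1] = src[%2], dst[2] = src[%3],
; dst[3] = src[%4]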
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
movsxdifnidn wq, wd
mov xq, wq
add srcq, wq
add dstq, wq
neg wq
;calc scalar loop count
and xq, mmsize-4
je .loop_simd
.loop_scalar:
mov tmpb, [srcq + wq + %1]
mov [dstq+wq + 0], tmpb
mov tmpb, [srcq + wq + %2]
mov [dstq+wq + 1], tmpb
mov tmpb, [srcq + wq + %3]
mov [dstq+wq + 2], tmpb
mov tmpb, [srcq + wq + %4]
mov [dstq+wq + 3], tmpb
add wq, 4
sub xq, 4
jg .loop_scalar
;check if src_size < mmsize
cmp wq, 0
jge .end
.loop_simd:
movu m1, [srcq+wq]
pshufb m1, m0
movu [dstq+wq], m1
add wq, mmsize
jl .loop_simd
.end:
RET
%endmacro
INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
; const uint8_t *src, int width, int height,
; int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
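;
; Reference behaviour (sketch, mirroring the scalar loop below): each
; 4-byte input group { U, Y0, V, Y1 } is split into planes, i.e. per
; chroma sample x on a line:
;   udst[x] = src[4*x + 0]; ydst[2*x]     = src[4*x + 1];
;   vdst[x] = src[4*x + 2]; ydst[2*x + 1] = src[4*x + 3];
; producing planar YUV422 from packed UYVY, one line per .loop_line pass.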
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
pxor m0, m0
pcmpeqw m1, m1
psrlw m1, 8
movsxdifnidn wq, wd
movsxdifnidn lum_strideq, lum_strided
movsxdifnidn chrom_strideq, chrom_strided
movsxdifnidn src_strideq, src_strided
mov back_wq, wq
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2
lea srcq, [srcq + wq * 2]
add ydstq, wq
add udstq, whalfq
add vdstq, whalfq
.loop_line:
mov xq, wq
mov wtwoq, wq
add wtwoq, wtwoq ; wtwo = width * 2
neg wq
neg wtwoq
neg whalfq
;calc scalar loop count
and xq, mmsize * 2 - 1
je .loop_simd
.loop_scalar:
mov tmpb, [srcq + wtwoq + 0]
mov [udstq + whalfq], tmpb
mov tmpb, [srcq + wtwoq + 1]
mov [ydstq + wq], tmpb
mov tmpb, [srcq + wtwoq + 2]
mov [vdstq + whalfq], tmpb
mov tmpb, [srcq + wtwoq + 3]
mov [ydstq + wq + 1], tmpb
add wq, 2
add wtwoq, 4
add whalfq, 1
sub xq, 2
jg .loop_scalar
; check if simd loop is needed
cmp wq, 0
jge .end_line
.loop_simd:
movu m2, [srcq + wtwoq ]
movu m3, [srcq + wtwoq + mmsize ]
movu m4, [srcq + wtwoq + mmsize * 2]
movu m5, [srcq + wtwoq + mmsize * 3]
; extract y part 1
RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY...
movu [ydstq + wq], m6
; extract y part 2
RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY...
movu [ydstq + wq + mmsize], m6
; extract uv
pand m2, m1 ; UxVx...
pand m3, m1 ; UxVx...
pand m4, m1 ; UxVx...
pand m5, m1 ; UxVx...
packuswb m2, m3 ; UVUV...
packuswb m4, m5 ; UVUV...
; U
pand m6, m2, m1 ; UxUx...
pand m7, m4, m1 ; UxUx...
packuswb m6, m7 ; UUUU
movu [udstq + whalfq], m6
; V
psrlw m2, 8 ; VxVx...
psrlw m4, 8 ; VxVx...
packuswb m2, m4 ; VVVV
movu [vdstq + whalfq], m2
add whalfq, mmsize
add wtwoq, mmsize * 4
add wq, mmsize * 2
jl .loop_simd
.end_line:
add srcq, src_strideq
add ydstq, lum_strideq
add udstq, chrom_strideq
add vdstq, chrom_strideq
;restore initial state of line variable
mov wq, back_wq
mov xq, wq
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2
sub hd, 1
jg .loop_line
RET
%endmacro
%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422
INIT_XMM avx
UYVY_TO_YUV422
%endif


@@ -0,0 +1,423 @@
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort: times 8 dw 0x8000
unicoeff: times 4 dd 0x20000000
SECTION .text
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit width or 16-bit width
; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
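;
; A minimal C-style sketch of the per-line computation (illustration
; only; names are hypothetical):
;
;   for (i = 0; i < dstW; i++) {
;       int j, val = 0, pos = filterPos[i];
;       for (j = 0; j < filterSize; j++)
;           val += src[pos + j] * filter[i * filterSize + j]; // 14-bit coeffs
;       dst[i] = clip(val >> shift);   // 15- or 19-bit output
;   }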
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
movsxd wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
mova m2, [max_19bit_int]
%elif cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
mova m7, [unicoeff]
%elif %1 == 8
pxor m3, m3
%endif ; %1 == 8/16
%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16
%ifnidn %3, X
; setup loop
%if %3 == 8
shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
lea filterq, [filterq+wq*8]
%if %2 == 15
lea dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
lea dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
lea fltposq, [fltposq+wq*(4>>wshr)]
neg wq
.loop:
%if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1
mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%endif
mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
punpckldq m0, m4
punpckldq m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
%endif ; %1 == 8
; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
movq m4, m0
punpckldq m0, m1
punpckhdq m4, m1
paddd m0, m4
%elif notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
paddd m0, m4
%else ; ssse3/sse4
phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
punpcklbw m4, m3 ; byte -> word
punpcklbw m5, m3 ; byte -> word
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
psubw m4, m6
psubw m5, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
paddd m0, m1
paddd m4, m5
movq m1, m0
punpckldq m0, m4
punpckhdq m1, m4
paddd m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
; emulate horizontal add as transpose + vertical add
mova mex, m0
punpckldq m0, m1
punpckhdq mex, m1
paddd m0, mex
mova m1, m4
punpckldq m4, m5
punpckhdq m1, m5
paddd m4, m1
mova m1, m0
punpcklqdq m0, m4
punpckhqdq m1, m4
paddd m0, m1
%else ; ssse3/sse4
; FIXME if we rearrange the filter in pairs of 4, we can
; load pixels likewise and use 2 x paddd + phaddd instead
; of 3 x phaddd here, faster on older cpus
phaddd m0, m1
phaddd m4, m5
phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq r8
%define pos1q r7
%define srcendq r9
movsxd fltsizeq, fltsized ; filterSize
lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq srcmemq
%define pos1q dstq
%define srcendq r6m
lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
mov srcendq, pos0q
%endif ; x86-32/64
lea fltposq, [fltposq+wq*4]
%if %2 == 15
lea dstq, [dstq+wq*2]
%else ; %2 == 19
lea dstq, [dstq+wq*4]
%endif ; %2 == 15/19
movifnidn dstmp, dstq
neg wq
.loop:
mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
pxor m4, m4
pxor m5, m5
mov srcq, srcmemmp
.innerloop:
; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
punpcklbw m0, m3
punpcklbw m1, m3
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
paddd m4, m0
paddd m5, m1
add filterq, mmsize
add srcq, srcmul*mmsize/2
cmp srcq, srcendq ; while (src += 4) < &src[filterSize]
jl .innerloop
%ifidn %4, X4
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
%if %1 > 8
movhps m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
movd m1, [srcq+(pos1q+dlt)*srcmul]
punpckldq m0, m1
%endif ; %1 == 8
%if %1 == 8
punpcklbw m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
%endif ; %1 == 16
pmaddwd m0, [filterq]
%endif ; %4 == X4
lea filterq, [filterq+(fltsizeq+dlt)*2]
%if mmsize == 8 ; mmx
movq m0, m4
punpckldq m4, m5
punpckhdq m0, m5
paddd m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
punpckhqdq m1, m5
paddd m4, m1
%else ; ssse3/sse4
phaddd m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
paddd m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
pshufd m4, m4, 11011000b
movhlps m0, m4
paddd m0, m4
%else ; ssse3/sse4
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
paddd m0, m7
%endif ; %1 == 16
; clip, store
psrad m0, 14 + %1 - %2
%ifidn %3, X
movifnidn dstq, dstmp
%endif ; %3 == X
%if %2 == 15
packssdw m0, m0
%ifnidn %3, X
movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
PMINSD m0, m2, m4
%ifnidn %3, X
mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
; per iteration; see the "shl wq, 1" above for why we do this
%else ; %3 == X
add wq, 2
%endif ; %3 ==/!= X
jl .loop
REP_RET
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, 7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8


@@ -0,0 +1,586 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/pixdesc.h"
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
0x0200020002000200LL,};
const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
0x0602060206020602LL,
0x0004000400040004LL,};
#if HAVE_INLINE_ASM
#define DITHER1XBPP
DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "swscale_template.c"
#endif
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "swscale_template.c"
#endif
void ff_updateMMXDitherTables(SwsContext *c, int dstY)
{
const int dstH= c->dstH;
const int flags= c->flags;
SwsPlane *lumPlane = &c->slice[c->numSlice-2].plane[0];
SwsPlane *chrUPlane = &c->slice[c->numSlice-2].plane[1];
SwsPlane *alpPlane = &c->slice[c->numSlice-2].plane[3];
int hasAlpha = c->needAlpha;
int32_t *vLumFilterPos= c->vLumFilterPos;
int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter;
int32_t *chrMmxFilter= c->chrMmxFilter;
int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
const int vLumFilterSize= c->vLumFilterSize;
const int vChrFilterSize= c->vChrFilterSize;
const int chrDstY= dstY>>c->chrDstVSubSample;
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
c->blueDither= ff_dither8[dstY&1];
if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
c->greenDither= ff_dither8[dstY&1];
else
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && hasAlpha) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
int i;
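/* if the vertical filter window reaches past the top or bottom of the
 * source, clamp it by replicating the first/last available line into the
 * plane's tmp array and filtering from that instead */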
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
for (i = 0; i < neg; i++)
tmpY[i] = lumSrcPtr[neg];
for ( ; i < end; i++)
tmpY[i] = lumSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpY[i] = tmpY[i-1];
lumSrcPtr = tmpY;
if (alpSrcPtr) {
const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
for (i = 0; i < neg; i++)
tmpA[i] = alpSrcPtr[neg];
for ( ; i < end; i++)
tmpA[i] = alpSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpA[i] = tmpA[i - 1];
alpSrcPtr = tmpA;
}
}
if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
for (i = 0; i < neg; i++) {
tmpU[i] = chrUSrcPtr[neg];
}
for ( ; i < end; i++) {
tmpU[i] = chrUSrcPtr[i];
}
for ( ; i < vChrFilterSize; i++) {
tmpU[i] = tmpU[i - 1];
}
chrUSrcPtr = tmpU;
}
if (flags & SWS_ACCURATE_RND) {
int s= APCK_SIZE / 8;
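/* accurate-rounding layout consumed by the vertical filter: each record
 * packs two taps -- the line pointer for tap i at +0, the pointer for tap
 * i+1 at +APCK_PTR2, and both 16-bit coefficients merged into one dword at
 * +APCK_COEF (tap i low, tap i+1 high) */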
for (i=0; i<vLumFilterSize; i+=2) {
*(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
*(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
lumMmxFilter[s*i+APCK_COEF/4 ]=
lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1] * (1 << 16) : 0);
if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
*(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
alpMmxFilter[s*i+APCK_COEF/4 ]=
alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
}
}
for (i=0; i<vChrFilterSize; i+=2) {
*(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
*(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
chrMmxFilter[s*i+APCK_COEF/4 ]=
chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1] * (1 << 16) : 0);
}
} else {
for (i=0; i<vLumFilterSize; i++) {
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
}
}
for (i=0; i<vChrFilterSize; i++) {
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
#endif /* HAVE_INLINE_ASM */
#define YUV2YUVX_FUNC_MMX(opt, step) \
void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \
uint8_t *dest, int dstW, \
const uint8_t *dither, int offset); \
static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
{ \
if(dstW > 0) \
ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \
return; \
}
#define YUV2YUVX_FUNC(opt, step) \
void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \
uint8_t *dest, int dstW, \
const uint8_t *dither, int offset); \
static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
{ \
int remainder = (dstW % step); \
int pixelsProcessed = dstW - remainder; \
if(((uintptr_t)dest) & 15){ \
yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
return; \
} \
if(pixelsProcessed > 0) \
ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \
if(remainder > 0){ \
ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \
} \
return; \
}
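/* the wide kernels store mmsize-aligned, so a dest that is not 16-byte
 * aligned falls back to the mmx version entirely, and the tail after the
 * wide pass is finished by ff_yuv2yuvX_mmx: e.g. dstW = 100 with the
 * 32-pixel sse3 step processes 96 pixels wide and hands the last 4 to the
 * mmx kernel */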
#if HAVE_MMX_EXTERNAL
YUV2YUVX_FUNC_MMX(mmx, 16)
#endif
#if HAVE_MMXEXT_EXTERNAL
YUV2YUVX_FUNC_MMX(mmxext, 16)
#endif
#if HAVE_SSE3_EXTERNAL
YUV2YUVX_FUNC(sse3, 32)
#endif
#if HAVE_AVX2_EXTERNAL
YUV2YUVX_FUNC(avx2, 64)
#endif
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \
const int16_t *filter, \
const int32_t *filterPos, int filterSize)
#define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
SCALE_FUNC(filter_n, 12, 15, opt); \
SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
SCALE_FUNC(filter_n, 12, 19, opt); \
SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
#define SCALE_FUNCS_MMX(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X, opt)
#define SCALE_FUNCS_SSE(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X4, opt); \
SCALE_FUNCS(X8, opt)
#if ARCH_X86_32
SCALE_FUNCS_MMX(mmx);
#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
#define VSCALEX_FUNC(size, opt) \
void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)
#define VSCALEX_FUNCS(opt) \
VSCALEX_FUNC(8, opt); \
VSCALEX_FUNC(9, opt); \
VSCALEX_FUNC(10, opt)
#if ARCH_X86_32
VSCALEX_FUNCS(mmxext);
#endif
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
VSCALEX_FUNCS(avx);
#define VSCALE_FUNC(size, opt) \
void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
const uint8_t *dither, int offset)
#define VSCALE_FUNCS(opt1, opt2) \
VSCALE_FUNC(8, opt1); \
VSCALE_FUNC(9, opt2); \
VSCALE_FUNC(10, opt2); \
VSCALE_FUNC(16, opt1)
#if ARCH_X86_32
VSCALE_FUNCS(mmx, mmxext);
#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, \
const uint8_t *src1, \
const uint8_t *src2, \
int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
#define INPUT_FUNCS(opt) \
INPUT_FUNC(uyvy, opt); \
INPUT_FUNC(yuyv, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv21, opt); \
INPUT_FUNC(rgba, opt); \
INPUT_FUNC(bgra, opt); \
INPUT_FUNC(argb, opt); \
INPUT_FUNC(abgr, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
#if ARCH_X86_64
#define YUV2NV_DECL(fmt, opt) \
void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format, const uint8_t *dither, \
const int16_t *filter, int filterSize, \
const int16_t **u, const int16_t **v, \
uint8_t *dst, int dstWidth)
YUV2NV_DECL(nv12, avx2);
YUV2NV_DECL(nv21, avx2);
#endif
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags))
sws_init_swscale_mmx(c);
#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
sws_init_swscale_mmxext(c);
#endif
if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
#if HAVE_MMX_EXTERNAL
if (EXTERNAL_MMX(cpu_flags))
c->yuv2planeX = yuv2yuvX_mmx;
#endif
#if HAVE_MMXEXT_EXTERNAL
if (EXTERNAL_MMXEXT(cpu_flags))
c->yuv2planeX = yuv2yuvX_mmxext;
#endif
#if HAVE_SSE3_EXTERNAL
if (EXTERNAL_SSE3(cpu_flags))
c->yuv2planeX = yuv2yuvX_sse3;
#endif
#if HAVE_AVX2_EXTERNAL
if (EXTERNAL_AVX2_FAST(cpu_flags))
c->yuv2planeX = yuv2yuvX_avx2;
#endif
}
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 12) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
ff_hscale12to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth<16)) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
ff_hscale14to19_ ## filtersize ## _ ## opt1; \
} else { /* c->srcBpc == 16 */ \
av_assert0(c->srcBpc == 16);\
hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
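/* opt2 provides the ->15-bit kernels and opt1 the ->19-bit ones; that split
 * is why EXTERNAL_SSE4 below pairs sse4 with ssse3: scale.asm only builds
 * sse4 variants of the 19-bit outputs */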
#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
}
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat) && c->dstFormat != AV_PIX_FMT_P010LE) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
case 8: if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && c->dstFormat != AV_PIX_FMT_P010LE && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
default: av_assert0(c->dstBpc>8); \
}
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_mmx;
if (c->needAlpha)
c->alpToYV12 = ff_uyvyToY_mmx;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_mmx;
c->chrToYV12 = ff_yuyvToUV_mmx;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_mmx;
c->chrToYV12 = ff_uyvyToUV_mmx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_mmx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_mmx;
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
case_rgb(bgra, BGRA, mmx);
case_rgb(rgba, RGBA, mmx);
case_rgb(abgr, ABGR, mmx);
case_rgb(argb, ARGB, mmx);
default:
break;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
}
#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
break; \
}
if (EXTERNAL_SSE2(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_sse2;
if (c->needAlpha)
c->alpToYV12 = ff_uyvyToY_sse2;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_sse2;
c->chrToYV12 = ff_yuyvToUV_sse2;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_sse2;
c->chrToYV12 = ff_uyvyToUV_sse2;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_sse2;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_sse2;
break;
case_rgb(rgb24, RGB24, sse2);
case_rgb(bgr24, BGR24, sse2);
case_rgb(bgra, BGRA, sse2);
case_rgb(rgba, RGBA, sse2);
case_rgb(abgr, ABGR, sse2);
case_rgb(argb, ARGB, sse2);
default:
break;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
switch (c->srcFormat) {
case_rgb(rgb24, RGB24, ssse3);
case_rgb(bgr24, BGR24, ssse3);
default:
break;
}
}
if (EXTERNAL_SSE4(cpu_flags)) {
/* the Xto15 scalers don't need special sse4 functions */
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
HAVE_ALIGNED_STACK || ARCH_X86_64);
if (c->dstBpc == 16 && !isBE(c->dstFormat))
c->yuv2plane1 = ff_yuv2plane1_16_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_YUYV422:
c->chrToYV12 = ff_yuyvToUV_avx;
break;
case AV_PIX_FMT_UYVY422:
c->chrToYV12 = ff_uyvyToUV_avx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_avx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_avx;
break;
case_rgb(rgb24, RGB24, avx);
case_rgb(bgr24, BGR24, avx);
case_rgb(bgra, BGRA, avx);
case_rgb(rgba, RGBA, avx);
case_rgb(abgr, ABGR, avx);
case_rgb(argb, ARGB, avx);
default:
break;
}
}
#if ARCH_X86_64
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_NV12:
case AV_PIX_FMT_NV24:
c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
break;
case AV_PIX_FMT_NV21:
case AV_PIX_FMT_NV42:
c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
break;
default:
break;
}
}
#endif
}

File diff suppressed because it is too large


@@ -0,0 +1,31 @@
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/w64xmmtest.h"
#include "libswscale/swscale.h"
wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]))
{
testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
srcSliceH, dst, dstStride);
}


@@ -0,0 +1,143 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* MMX/MMXEXT template stuff (needed for fast movntq support),
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#if HAVE_X86ASM
#define DITHER1XBPP // only for MMX
//MMX versions
#if HAVE_MMX
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "yuv2rgb_template.c"
#endif /* HAVE_MMX */
// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "yuv2rgb_template.c"
//SSSE3 versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _ssse3
#include "yuv2rgb_template.c"
#endif /* HAVE_X86ASM */
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_ssse3;
#endif
break;
} else
return yuv420_rgb32_ssse3;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_ssse3;
#endif
break;
} else
return yuv420_bgr32_ssse3;
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_ssse3;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_ssse3;
case AV_PIX_FMT_RGB565:
return yuv420_rgb16_ssse3;
case AV_PIX_FMT_RGB555:
return yuv420_rgb15_ssse3;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_mmxext;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_mmxext;
}
}
if (EXTERNAL_MMX(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_mmx;
#endif
break;
} else
return yuv420_rgb32_mmx;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_mmx;
#endif
break;
} else
return yuv420_bgr32_mmx;
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_mmx;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_mmx;
case AV_PIX_FMT_RGB565:
return yuv420_rgb16_mmx;
case AV_PIX_FMT_RGB555:
return yuv420_rgb15_mmx;
}
}
#endif /* HAVE_X86ASM */
return NULL;
}


@@ -0,0 +1,195 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"
#define YUV2RGB_LOOP(depth) \
h_size = (c->dstW + 7) & ~7; \
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
extern void RENAME(ff_yuv_420_rgb24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_bgr24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
#if !COMPILE_TEMPLATE_MMXEXT
extern void RENAME(ff_yuv_420_rgb15)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_rgb16)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuva_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index, const uint8_t *pa_2index);
extern void RENAME(ff_yuva_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index, const uint8_t *pa_2index);
static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither8[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
RENAME(ff_yuv_420_rgb15)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither4[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
RENAME(ff_yuv_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
RENAME(ff_yuv_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
RENAME(ff_yuva_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
RENAME(ff_yuva_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
}
return srcSliceH;
}
#endif
static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
RENAME(ff_yuv_420_rgb24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
RENAME(ff_yuv_420_bgr24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}


@@ -0,0 +1,136 @@
;******************************************************************************
;* x86-optimized yuv2yuvX
;* Copyright 2020 Google LLC
;* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; yuv2yuvX
;
; void ff_yuv2yuvX_<opt>(const int16_t *filter, int filterSize,
; int srcOffset, uint8_t *dest, int dstW,
; const uint8_t *dither, int offset);
;
;-----------------------------------------------------------------------------
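; note: this is driven by the packed (line pointer, coefficient) records that
; ff_updateMMXDitherTables() builds into lumMmxFilter/chrMmxFilter -- each
; 16-byte record is walked until a zero line pointer, pmulhw keeps the high
; 16 bits of src*coeff, and the word sums, seeded with a pre-scaled dither
; term, are shifted down and packed unsigned-saturated into dest (a rough
; sketch of the data flow, not a bit-exact formula)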
%macro YUV2YUVX_FUNC 0
cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
%if notcpuflag(sse3)
%define movr mova
%define unroll 1
%else
%define movr movdqu
%define unroll 2
%endif
movsxdifnidn dstWq, dstWd
movsxdifnidn offsetq, offsetd
movsxdifnidn srcq, srcd
%if cpuflag(avx2)
vpbroadcastq m3, [ditherq]
%else
movq xm3, [ditherq]
%endif ; avx2
cmp offsetd, 0
jz .offset
; offset != 0 path.
psrlq m5, m3, $18
psllq m3, m3, $28
por m3, m3, m5
.offset:
add offsetq, srcq
movd xm1, filterSized
SPLATW m1, xm1, 0
pxor m0, m0, m0
mov filterSizeq, filterq
mov srcq, [filterSizeq]
punpcklbw m3, m0
psllw m1, m1, 3
paddw m3, m3, m1
psraw m7, m3, 4
.outerloop:
mova m4, m7
mova m3, m7
%if cpuflag(sse3)
mova m6, m7
mova m1, m7
%endif
.loop:
%if cpuflag(avx2)
vpbroadcastq m0, [filterSizeq + 8]
%elif cpuflag(sse3)
movddup m0, [filterSizeq + 8]
%else
mova m0, [filterSizeq + 8]
%endif
pmulhw m2, m0, [srcq + offsetq * 2]
pmulhw m5, m0, [srcq + offsetq * 2 + mmsize]
paddw m3, m3, m2
paddw m4, m4, m5
%if cpuflag(sse3)
pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
paddw m6, m6, m2
paddw m1, m1, m5
%endif
add filterSizeq, $10
mov srcq, [filterSizeq]
test srcq, srcq
jnz .loop
psraw m3, m3, 3
psraw m4, m4, 3
%if cpuflag(sse3)
psraw m6, m6, 3
psraw m1, m1, 3
%endif
packuswb m3, m3, m4
%if cpuflag(sse3)
packuswb m6, m6, m1
%endif
mov srcq, [filterq]
%if cpuflag(avx2)
vpermq m3, m3, 216
vpermq m6, m6, 216
%endif
movr [destq + offsetq], m3
%if cpuflag(sse3)
movr [destq + offsetq + mmsize], m6
%endif
add offsetq, mmsize * unroll
mov filterSizeq, filterq
cmp offsetq, dstWq
jb .outerloop
REP_RET
%endmacro
INIT_MMX mmx
YUV2YUVX_FUNC
INIT_MMX mmxext
YUV2YUVX_FUNC
INIT_XMM sse3
YUV2YUVX_FUNC
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
YUV2YUVX_FUNC
%endif


@@ -0,0 +1,383 @@
;******************************************************************************
;* software YUV to RGB converter
;*
;* Copyright (C) 2001-2007 Michael Niedermayer
;* (c) 2010 Konstantin Shishkov
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; the masks below are named mask_dwXY, meaning dwords No. X and No. Y are preserved
mask_dw036 : db -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0
mask_dw147 : db 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1
mask_dw25 : db 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0
rgb24_shuf1: db 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11
rgb24_shuf2: db 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5
rgb24_shuf3: db 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15
pw_00ff: times 8 dw 255
pb_f8: times 16 db 248
pb_e0: times 16 db 224
pb_03: times 16 db 3
pb_07: times 16 db 7
mask_1101: dw -1, -1, 0, -1
mask_0010: dw 0, 0, -1, 0
mask_0110: dw 0, -1, -1, 0
mask_1001: dw -1, 0, 0, -1
mask_0100: dw 0, -1, 0, 0
SECTION .text
;-----------------------------------------------------------------------------
;
; YUV420/YUVA420 to RGB/BGR 15/16/24/32
; R = Y + ((vrCoff * (v - 128)) >> 8)
; G = Y - ((ugCoff * (u - 128) + vgCoff * (v - 128)) >> 8)
; B = Y + ((ubCoff * (u - 128)) >> 8)
;
;-----------------------------------------------------------------------------
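; worked example of the formulas above (BT.601-style Q8 coefficient, for
; illustration only): with vrCoff = 359, Y = 128, V = 200 the red channel is
; R = 128 + ((359 * (200 - 128)) >> 8) = 128 + 100 = 228, and neutral chroma
; (u = v = 128) zeroes all chroma terms so that R = G = B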
%macro MOV_H2L 1
%if mmsize == 8
psrlq %1, 32
%else ; mmsize == 16
psrldq %1, 8
%endif
%endmacro
%macro yuv2rgb_fn 3
%if %3 == 32
%ifidn %1, yuva
%define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index, pa_2index
%define GPR_num 7
%endif
%else
%define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index
%define GPR_num 6
%endif
%define m_green m2
%define m_alpha m3
%define m_y m6
%define m_u m0
%define m_v m1
%ifidn %2, rgb
%define m_red m1
%define m_blue m0
%else
%define m_red m0
%define m_blue m1
%endif
%if mmsize == 8
%define time_num 1
%define reg_num 8
%define y_offset [pointer_c_ditherq + 8 * 8]
%define u_offset [pointer_c_ditherq + 9 * 8]
%define v_offset [pointer_c_ditherq + 10 * 8]
%define ug_coff [pointer_c_ditherq + 7 * 8]
%define vg_coff [pointer_c_ditherq + 6 * 8]
%define y_coff [pointer_c_ditherq + 3 * 8]
%define ub_coff [pointer_c_ditherq + 5 * 8]
%define vr_coff [pointer_c_ditherq + 4 * 8]
%elif mmsize == 16
%define time_num 2
%if ARCH_X86_32
%define reg_num 8
%define my_offset [pointer_c_ditherq + 8 * 8]
%define mu_offset [pointer_c_ditherq + 9 * 8]
%define mv_offset [pointer_c_ditherq + 10 * 8]
%define mug_coff [pointer_c_ditherq + 7 * 8]
%define mvg_coff [pointer_c_ditherq + 6 * 8]
%define my_coff [pointer_c_ditherq + 3 * 8]
%define mub_coff [pointer_c_ditherq + 5 * 8]
%define mvr_coff [pointer_c_ditherq + 4 * 8]
%else ; ARCH_X86_64
%define reg_num 16
%define y_offset m8
%define u_offset m9
%define v_offset m10
%define ug_coff m11
%define vg_coff m12
%define y_coff m13
%define ub_coff m14
%define vr_coff m15
%endif ; ARCH_X86_32/64
%endif ; coeff define mmsize == 8/16
cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
%if ARCH_X86_64
movsxd indexq, indexd
%if mmsize == 16
VBROADCASTSD y_offset, [pointer_c_ditherq + 8 * 8]
VBROADCASTSD u_offset, [pointer_c_ditherq + 9 * 8]
VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
VBROADCASTSD ug_coff, [pointer_c_ditherq + 7 * 8]
VBROADCASTSD vg_coff, [pointer_c_ditherq + 6 * 8]
VBROADCASTSD y_coff, [pointer_c_ditherq + 3 * 8]
VBROADCASTSD ub_coff, [pointer_c_ditherq + 5 * 8]
VBROADCASTSD vr_coff, [pointer_c_ditherq + 4 * 8]
%endif
%endif
movu m_y, [py_2indexq + 2 * indexq]
movh m_u, [pu_indexq + indexq]
movh m_v, [pv_indexq + indexq]
.loop0:
pxor m4, m4
mova m7, m6
punpcklbw m0, m4
punpcklbw m1, m4
mova m2, [pw_00ff]
pand m6, m2
psrlw m7, 8
psllw m0, 3
psllw m1, 3
psllw m6, 3
psllw m7, 3
%if (ARCH_X86_32 && mmsize == 16)
VBROADCASTSD m2, mu_offset
VBROADCASTSD m3, mv_offset
VBROADCASTSD m4, my_offset
psubsw m0, m2 ; U = U - 128
psubsw m1, m3 ; V = V - 128
psubw m6, m4
psubw m7, m4
VBROADCASTSD m2, mug_coff
VBROADCASTSD m3, mvg_coff
VBROADCASTSD m4, my_coff
VBROADCASTSD m5, mub_coff
pmulhw m2, m0
pmulhw m3, m1
pmulhw m6, m4
pmulhw m7, m4
pmulhw m0, m5
VBROADCASTSD m4, mvr_coff
pmulhw m1, m4
%else ; ARCH_X86_64 || mmsize == 8
psubsw m0, u_offset ; U = U - 128
psubsw m1, v_offset ; V = V - 128
psubw m6, y_offset
psubw m7, y_offset
mova m2, m0
mova m3, m1
pmulhw m2, ug_coff
pmulhw m3, vg_coff
pmulhw m6, y_coff
pmulhw m7, y_coff
pmulhw m0, ub_coff
pmulhw m1, vr_coff
%endif
paddsw m2, m3
mova m3, m7
mova m5, m7
paddsw m3, m0 ; B1 B3 B5 B7 ...
paddsw m5, m1 ; R1 R3 R5 R7 ...
paddsw m7, m2 ; G1 G3 G5 G7 ...
paddsw m0, m6 ; B0 B2 B4 B6 ...
paddsw m1, m6 ; R0 R2 R4 R6 ...
paddsw m2, m6 ; G0 G2 G4 G6 ...
%if %3 == 24 ; PACK RGB24
%define depth 3
packuswb m0, m3 ; R0 R2 R4 R6 ... R1 R3 R5 R7 ...
packuswb m1, m5 ; B0 B2 B4 B6 ... B1 B3 B5 B7 ...
packuswb m2, m7 ; G0 G2 G4 G6 ... G1 G3 G5 G7 ...
mova m3, m_red
mova m6, m_blue
MOV_H2L m_red
punpcklbw m3, m2 ; R0 G0 R2 G2 R4 G4 R6 G6 R8 G8 ...
punpcklbw m6, m_red ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
mova m5, m3
punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
%if mmsize == 8
punpcklwd m3, m6 ; R0 G0 B0 R1 R2 G2 B2 R3
punpckhwd m5, m6 ; R4 G4 B4 R5 R6 G6 B6 R7
%if cpuflag(mmxext)
pshufw m1, m2, 0xc6
pshufw m6, m3, 0x84
pshufw m7, m5, 0x38
pand m6, [mask_1101] ; R0 G0 B0 R1 -- -- R2 G2
movq m0, m1
pand m7, [mask_0110] ; -- -- R6 G6 B6 R7 -- --
movq m2, m1
pand m1, [mask_0100] ; -- -- G3 B3 -- -- -- --
psrlq m3, 48 ; B2 R3 -- -- -- -- -- --
pand m0, [mask_0010] ; -- -- -- -- G1 B1 -- --
psllq m5, 32 ; -- -- -- -- R4 G4 B4 R5
pand m2, [mask_1001] ; G5 B5 -- -- -- -- G7 B7
por m1, m3
por m0, m6
por m1, m5
por m2, m7
movntq [imageq], m0
movntq [imageq + 8], m1
movntq [imageq + 16], m2
%else ; cpuflag(mmx)
movd [imageq], m3 ; R0 G0 R2 G2
movd [imageq + 4], m2 ; G1 B1
psrlq m3, 32
psrlq m2, 16
movd [imageq + 6], m3 ; R2 G2 B2 R3
movd [imageq + 10], m2 ; G3 B3
psrlq m2, 16
movd [imageq + 12], m5 ; R4 G4 B4 R5
movd [imageq + 16], m2 ; G5 B5
psrlq m5, 32
movd [imageq + 20], m2 ; -- -- G7 B7
movd [imageq + 18], m5 ; R6 G6 B6 R7
%endif ; mmsize = 8
%else ; mmsize == 16
pshufb m3, [rgb24_shuf1] ; r0 g0 r6 g6 r12 g12 r2 g2 r8 g8 r14 g14 r4 g4 r10 g10
pshufb m6, [rgb24_shuf2] ; b10 r11 b0 r1 b6 r7 b12 r13 b2 r3 b8 r9 b14 r15 b4 r5
pshufb m2, [rgb24_shuf3] ; g5 b5 g11 b11 g1 b1 g7 b7 g13 b13 g3 b3 g9 b9 g15 b15
mova m7, [mask_dw036]
mova m4, [mask_dw147]
mova m5, [mask_dw25]
pand m0, m7, m3 ; r0 g0 --- --- --- --- r2 g2 --- --- --- --- r4 g4 --- ---
pand m1, m4, m6 ; --- --- b0 r1 --- --- --- --- b2 r3 --- --- --- --- b4 r5
por m0, m1
pand m1, m5, m2 ; --- --- --- --- g1 b1 --- --- --- --- g3 b3 --- --- --- ---
por m0, m1 ; r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5
pand m1, m7, m2 ; g5 b5 --- --- --- --- g7 b7 --- --- --- --- g9 b9 --- ---
pand m7, m6 ; b10 r11 --- --- --- --- b12 r13 --- --- --- --- b14 r15 --- ---
pand m6, m5 ; --- --- --- --- b6 r7 --- --- --- --- b8 r9 --- --- --- ---
por m1, m6
pand m6, m4, m3 ; --- --- r6 g6 --- --- --- --- r8 g8 --- --- --- --- r10 g10
pand m2, m4 ; --- --- g11 b11 --- --- --- --- g13 b13 --- --- --- --- g15 b15
pand m3, m5 ; --- --- --- --- r12 g12 --- --- --- --- r14 g14 --- --- --- ---
por m2, m7
por m1, m6 ; g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 g9 b9 r10 g10
por m2, m3 ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
movu [imageq], m0
movu [imageq + 16], m1
movu [imageq + 32], m2
%endif ; mmsize = 16
%else ; PACK RGB15/16/32
packuswb m0, m1
packuswb m3, m5
packuswb m2, m2
mova m1, m0
packuswb m7, m7
punpcklbw m0, m3 ; B0 B1 B2 B3 ... B7
punpckhbw m1, m3 ; R0 R1 R2 R3 ... R7
punpcklbw m2, m7 ; G0 G1 G2 G3 ... G7
%if %3 == 32 ; PACK RGB32
%define depth 4
%ifidn %1, yuv
pcmpeqd m3, m3 ; Set alpha empty
%else
movu m3, [pa_2indexq + 2 * indexq] ; Load alpha
%endif
mova m5, m_blue
mova m6, m_red
punpckhbw m5, m_green
punpcklbw m_blue, m_green
punpckhbw m6, m_alpha
punpcklbw m_red, m_alpha
mova m_green, m_blue
mova m_alpha, m5
punpcklwd m_blue, m_red
punpckhwd m_green, m_red
punpcklwd m5, m6
punpckhwd m_alpha, m6
movu [imageq + 0], m_blue
movu [imageq + 8 * time_num], m_green
movu [imageq + 16 * time_num], m5
movu [imageq + 24 * time_num], m_alpha
%else ; PACK RGB15/16
%define depth 2
%if cpuflag(ssse3)
%define red_dither m3
%define green_dither m4
%define blue_dither m5
VBROADCASTSD red_dither, [pointer_c_ditherq + 0 * 8]
VBROADCASTSD green_dither, [pointer_c_ditherq + 1 * 8]
VBROADCASTSD blue_dither, [pointer_c_ditherq + 2 * 8]
%else ; cpuflag(mmx/mmxext)
%define blue_dither [pointer_c_ditherq + 2 * 8]
%define green_dither [pointer_c_ditherq + 1 * 8]
%define red_dither [pointer_c_ditherq + 0 * 8]
%endif
%if %3 == 15
%define gmask pb_03
%define isRGB15 1
%else
%define gmask pb_07
%define isRGB15 0
%endif
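; for the 565 case (isRGB15 == 0) the shifts and masks below pack, per
; little-endian output word:
;   low byte  = ((G << 3) & 0xE0) | (B >> 3)
;   high byte = (R & 0xF8) | ((G >> 5) & 0x07)
; i.e. word = (R >> 3) << 11 | (G >> 2) << 5 | (B >> 3); the 555 case moves
; the green/red shifts by one bit and keeps only 5 bits of green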
paddusb m0, blue_dither
paddusb m2, green_dither
paddusb m1, red_dither
pand m0, [pb_f8]
pand m1, [pb_f8]
mova m3, m2
psllw m2, 3 - isRGB15
psrlw m3, 5 + isRGB15
psrlw m0, 3
psrlw m1, isRGB15
pand m2, [pb_e0]
pand m3, [gmask]
por m0, m2
por m1, m3
mova m2, m0
punpcklbw m0, m1
punpckhbw m2, m1
movu [imageq], m0
movu [imageq + 8 * time_num], m2
%endif ; PACK RGB15/16
%endif ; PACK RGB15/16/32
movu m_y, [py_2indexq + 2 * indexq + 8 * time_num]
movh m_v, [pv_indexq + indexq + 4 * time_num]
movh m_u, [pu_indexq + indexq + 4 * time_num]
add imageq, 8 * depth * time_num
add indexq, 4 * time_num
js .loop0
REP_RET
%endmacro
INIT_MMX mmx
yuv2rgb_fn yuv, rgb, 24
yuv2rgb_fn yuv, bgr, 24
yuv2rgb_fn yuv, rgb, 32
yuv2rgb_fn yuv, bgr, 32
yuv2rgb_fn yuva, rgb, 32
yuv2rgb_fn yuva, bgr, 32
yuv2rgb_fn yuv, rgb, 15
yuv2rgb_fn yuv, rgb, 16
INIT_MMX mmxext
yuv2rgb_fn yuv, rgb, 24
yuv2rgb_fn yuv, bgr, 24
INIT_XMM ssse3
yuv2rgb_fn yuv, rgb, 24
yuv2rgb_fn yuv, bgr, 24
yuv2rgb_fn yuv, rgb, 32
yuv2rgb_fn yuv, bgr, 32
yuv2rgb_fn yuva, rgb, 32
yuv2rgb_fn yuva, bgr, 32
yuv2rgb_fn yuv, rgb, 15
yuv2rgb_fn yuv, rgb, 16

1027
externals/ffmpeg/ffmpeg/libswscale/yuv2rgb.c vendored Executable file

File diff suppressed because it is too large