early-access version 1432
externals/ffmpeg/libswscale/x86/Makefile (vendored executable file, 15 additions)
@@ -0,0 +1,15 @@
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)

OBJS += x86/rgb2rgb.o \
        x86/swscale.o \
        x86/yuv2rgb.o \

MMX-OBJS += x86/hscale_fast_bilinear_simd.o \

OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o

X86ASM-OBJS += x86/input.o \
               x86/output.o \
               x86/scale.o \
               x86/rgb_2_rgb.o \
               x86/yuv_2_rgb.o \

externals/ffmpeg/libswscale/x86/hscale_fast_bilinear_simd.c (vendored executable file, 359 additions)
@@ -0,0 +1,359 @@
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */
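
    /* Illustrative aside (not part of the vendored source): with
     * xInc = 0x18000 (downscale by 1.5x), the four outputs starting at
     * xpos = 0 read input offsets 0, 1, 3 and 4, so d = 4 and the
     * five-pixel fragmentA is needed; with xInc = 0xC000 (upscale),
     * the offsets are 0, 0, 1 and 2, so d = 2 and fragmentB suffices. */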

    // code fragment

    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
        "movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"

        "add $8, %%"FF_REG_a" \n\t"
        // End
        "9: \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
        );

    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm0, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"

        "add $8, %%"FF_REG_a" \n\t"
        // End
        "9: \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
        );

    xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a = 0;
            int b = ((xpos + xInc) >> 16) - xx;
            int c = ((xpos + xInc * 2) >> 16) - xx;
            int d = ((xpos + xInc * 3) >> 16) - xx;
            int inc = (d + 1 < 4);
            uint8_t *fragment = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift = 3 - (d + inc);
            int shift = 0;

            if (filterCode) {
                filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                          (c << 4) |
                                                          (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift; // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2] -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}
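
/* Illustrative aside, a sketch of the typical two-pass use of
 * ff_init_hscaler_mmxext() (modeled on libswscale/utils.c; the mmap()
 * allocation shown here is an assumption, not code from this file):
 *
 *     // Pass 1: filterCode == NULL only measures the code size.
 *     int size = ff_init_hscaler_mmxext(dstW, xInc, NULL, NULL, NULL, 8);
 *     uint8_t *code = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
 *                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *     if (code != MAP_FAILED) {
 *         // Pass 2: emit the scaler into the executable buffer.
 *         ff_init_hscaler_mmxext(dstW, xInc, code, c->hLumFilter,
 *                                c->hLumFilterPos, 8);
 *         c->lumMmxextFilterCode = code;
 *     }
 */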

void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif

    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", %5 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %5 \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"FF_REG_c" \n\t"
        "mov %1, %%"FF_REG_D" \n\t"
        "mov %2, %%"FF_REG_d" \n\t"
        "mov %3, %%"FF_REG_b" \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        PREFETCH" (%%"FF_REG_c") \n\t"
        PREFETCH" 32(%%"FF_REG_c") \n\t"
        PREFETCH" 64(%%"FF_REG_c") \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"FF_REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"\
        "add %%"FF_REG_S", %%"FF_REG_c" \n\t"\
        "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"FF_REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
        "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\

#endif /* ARCH_X86_64 */
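
/* Illustrative aside: CALL_MMXEXT_FILTER_CODE is invoked eight times
 * below, consistent with the generated fragment chain covering
 * dstW / numSplits output pixels for numSplits == 8; each call runs one
 * chunk and then re-bases the source and destination pointers. */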

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %5, %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %5, %%"FF_REG_b" \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
          ,"%"FF_REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}

void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", %7 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %7 \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"FF_REG_c" \n\t"
        "mov %1, %%"FF_REG_D" \n\t"
        "mov %2, %%"FF_REG_d" \n\t"
        "mov %3, %%"FF_REG_b" \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        PREFETCH" (%%"FF_REG_c") \n\t"
        PREFETCH" 32(%%"FF_REG_c") \n\t"
        PREFETCH" 64(%%"FF_REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        "mov %5, %%"FF_REG_c" \n\t" // src2
        "mov %6, %%"FF_REG_D" \n\t" // dst2
        PREFETCH" (%%"FF_REG_c") \n\t"
        PREFETCH" 32(%%"FF_REG_c") \n\t"
        PREFETCH" 64(%%"FF_REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %7, %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %7, %%"FF_REG_b" \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
          ,"%"FF_REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
#endif //HAVE_INLINE_ASM

externals/ffmpeg/libswscale/x86/input.asm (vendored executable file, 740 additions)
@@ -0,0 +1,740 @@
;******************************************************************************
;* x86-optimized input routines; does shuffling of packed
;* YUV formats into individual planes, and converts RGB
;* into YUV planes also.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

%define RY 0x20DE
%define GY 0x4087
%define BY 0x0C88
%define RU 0xECFF
%define GU 0xDAC8
%define BU 0x3838
%define RV 0x3838
%define GV 0xD0E3
%define BV 0xF6E4

rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15
rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15
%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq

%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq

; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV

; rgba_Ycoeff_rb: times 4 dw RY, BY
; rgba_Ycoeff_br: times 4 dw BY, RY
; rgba_Ycoeff_ga: times 4 dw GY, 0
; rgba_Ycoeff_ag: times 4 dw 0, GY
; rgba_Ucoeff_rb: times 4 dw RU, BU
; rgba_Ucoeff_br: times 4 dw BU, RU
; rgba_Ucoeff_ga: times 4 dw GU, 0
; rgba_Ucoeff_ag: times 4 dw 0, GU
; rgba_Vcoeff_rb: times 4 dw RV, BV
; rgba_Vcoeff_br: times 4 dw BV, RV
; rgba_Vcoeff_ga: times 4 dw GV, 0
; rgba_Vcoeff_ag: times 4 dw 0, GV

shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
               6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
               8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80

SECTION .text

;-----------------------------------------------------------------------------
; RGB to Y/UV.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
;                      const uint8_t *unused, int w);
;-----------------------------------------------------------------------------

; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
    mova m5, [%2_Ycoeff_12x4]
    mova m6, [%2_Ycoeff_3x56]
%define coeff1 m5
%define coeff2 m6
%elif ARCH_X86_64
    mova m8, [%2_Ycoeff_12x4]
    mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
%else ; x86-32 && mmsize == 16
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
%endif ; x86-32/64 && mmsize == 8/16
%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
    jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
.body:
%if cpuflag(ssse3)
    mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
    mova m10, [shuf_rgb_3x56]
%define shuf_rgb2 m10
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
    movsxd wq, wd
%endif
    add wq, wq
    add dstq, wq
    neg wq
%if notcpuflag(ssse3)
    pxor m7, m7
%endif ; !cpuflag(ssse3)
    mova m4, [rgb_Yrnd]
.loop:
%if cpuflag(ssse3)
    movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
    movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
    pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
    pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
    pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
    pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
    movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
    movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
    movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
    movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16 ; i.e. sse2
    punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
    movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
    movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
    movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
    movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
    punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16
    punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
    punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
    add srcq, 3 * mmsize / 2
    pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
    pmaddwd m1, coeff2 ; (dword) { R0*RY, G1+GY + R1*RY, R2*RY, G3+GY + R3*RY }
    pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
    pmaddwd m3, coeff2 ; (dword) { R4*RY, G5+GY + R5*RY, R6*RY, G7+GY + R7*RY }
    paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
    paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
    paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
    paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
    psrad m0, 9
    psrad m2, 9
    packssdw m0, m2 ; (word) { Y[0-7] }
    mova [dstq+wq], m0
    add wq, mmsize
    jl .loop
    REP_RET
%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
%endmacro

; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
    mova m8, [%2_Ucoeff_12x4]
    mova m9, [%2_Ucoeff_3x56]
    mova m10, [%2_Vcoeff_12x4]
    mova m11, [%2_Vcoeff_3x56]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [%2_Ucoeff_12x4]
%define coeffU2 [%2_Ucoeff_3x56]
%define coeffV1 [%2_Vcoeff_12x4]
%define coeffV2 [%2_Vcoeff_3x56]
%endif ; x86-32/64
%if ARCH_X86_64 && %0 == 3
    jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
    mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
    mova m12, [shuf_rgb_3x56]
%define shuf_rgb2 m12
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
    movsxd wq, dword r5m
%else ; x86-32
    mov wq, r5m
%endif
    add wq, wq
    add dstUq, wq
    add dstVq, wq
    neg wq
    mova m6, [rgb_UVrnd]
%if notcpuflag(ssse3)
    pxor m7, m7
%endif
.loop:
%if cpuflag(ssse3)
    movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
    movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
    pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
    pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
%else ; !cpuflag(ssse3)
    movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
    movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
    movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
    movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16
    punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
    movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
    movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
%endif ; mmsize == 16
    punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
    pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
    pmaddwd m3, m1, coeffV2 ; (dword) { R0*BV, G1*GV + R1*BV, R2*BV, G3*GV + R3*BV }
    pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
    pmaddwd m1, coeffU2 ; (dword) { R0*BU, G1*GU + R1*BU, R2*BU, G3*GU + R3*BU }
    paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
    paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
%if cpuflag(ssse3)
    pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
    pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
%if mmsize == 16
    movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
    movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
    punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16 && !cpuflag(ssse3)
    punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
    add srcq, 3 * mmsize / 2
    pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
    pmaddwd m3, m5, coeffU2 ; (dword) { R4*BU, G5*GU + R5*BU, R6*BU, G7*GU + R7*BU }
    pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
    pmaddwd m5, coeffV2 ; (dword) { R4*BV, G5*GV + R5*BV, R6*BV, G7*GV + R7*BV }
    paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
    paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
    paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
    paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
    paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
    paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
    psrad m0, 9
    psrad m2, 9
    psrad m1, 9
    psrad m4, 9
    packssdw m0, m1 ; (word) { U[0-7] }
    packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
    mova [dstUq+wq], m0
    mova [dstVq+wq], m2
%else ; mmsize == 16
    mova [dstUq+wq], m0
    mova [dstVq+wq], m2
%endif ; mmsize == 8/16
    add wq, mmsize
    jl .loop
    REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro

; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB24_FUNCS 2
RGB24_TO_Y_FN %1, rgb
RGB24_TO_Y_FN %1, bgr, rgb
RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
RGB24_FUNCS 0, 0
%endif

INIT_XMM sse2
RGB24_FUNCS 10, 12

INIT_XMM ssse3
RGB24_FUNCS 11, 13

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
%endif

; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
    mova m5, [rgba_Ycoeff_%2%4]
    mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
    jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
%else ; %0 == 6
.body:
%if ARCH_X86_64
    movsxd wq, wd
%endif
    add wq, wq
    sub wq, mmsize - 1
    lea srcq, [srcq+wq*2]
    add dstq, wq
    neg wq
    mova m4, [rgb_Yrnd]
    pcmpeqb m7, m7
    psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
    ; FIXME check alignment and use mova
    movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
    movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
    DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
    pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
    pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
    pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
    pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
    paddd m0, m4 ; += rgb_Yrnd
    paddd m2, m4 ; += rgb_Yrnd
    paddd m0, m1 ; (dword) { Y[0-3] }
    paddd m2, m3 ; (dword) { Y[4-7] }
    psrad m0, 9
    psrad m2, 9
    packssdw m0, m2 ; (word) { Y[0-7] }
    mova [dstq+wq], m0
    add wq, mmsize
    jl .loop
    sub wq, mmsize - 1
    jz .end
    add srcq, 2*mmsize - 2
    add dstq, mmsize - 1
.loop2:
    movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
    DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
    pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
    pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
    paddd m0, m4 ; += rgb_Yrnd
    paddd m0, m1 ; (dword) { Y[0-3] }
    psrad m0, 9
    packssdw m0, m0 ; (word) { Y[0-7] }
    movd [dstq+wq], m0
    add wq, 2
    jl .loop2
.end:
    REP_RET
%endif ; %0 == 3
%endmacro

; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
    mova m8, [rgba_Ucoeff_%2%4]
    mova m9, [rgba_Ucoeff_%3%5]
    mova m10, [rgba_Vcoeff_%2%4]
    mova m11, [rgba_Vcoeff_%3%5]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [rgba_Ucoeff_%2%4]
%define coeffU2 [rgba_Ucoeff_%3%5]
%define coeffV1 [rgba_Vcoeff_%2%4]
%define coeffV2 [rgba_Vcoeff_%3%5]
%endif ; x86-64/32
%if ARCH_X86_64 && %0 == 6
    jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
    movsxd wq, dword r5m
%else ; x86-32
    mov wq, r5m
%endif
    add wq, wq
    sub wq, mmsize - 1
    add dstUq, wq
    add dstVq, wq
    lea srcq, [srcq+wq*2]
    neg wq
    pcmpeqb m7, m7
    psrlw m7, 8 ; (word) { 0x00ff } x4
    mova m6, [rgb_UVrnd]
.loop:
    ; FIXME check alignment and use mova
    movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
    movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
    DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
    pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
    pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
    pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
    pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
    paddd m3, m6 ; += rgb_UVrnd
    paddd m1, m6 ; += rgb_UVrnd
    paddd m2, m3 ; (dword) { V[0-3] }
    paddd m0, m1 ; (dword) { U[0-3] }
    pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
    pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
    pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
    pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
    paddd m3, m6 ; += rgb_UVrnd
    paddd m5, m6 ; += rgb_UVrnd
    psrad m0, 9
    paddd m1, m3 ; (dword) { V[4-7] }
    paddd m4, m5 ; (dword) { U[4-7] }
    psrad m2, 9
    psrad m4, 9
    psrad m1, 9
    packssdw m0, m4 ; (word) { U[0-7] }
    packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
    mova [dstUq+wq], m0
    mova [dstVq+wq], m2
%else ; mmsize == 16
    mova [dstUq+wq], m0
    mova [dstVq+wq], m2
%endif ; mmsize == 8/16
    add wq, mmsize
    jl .loop
    sub wq, mmsize - 1
    jz .end
    add srcq, 2*mmsize - 2
    add dstUq, mmsize - 1
    add dstVq, mmsize - 1
.loop2:
    movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3]
    DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
    pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
    pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
    pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
    pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
    paddd m3, m6 ; += rgb_UVrnd
    paddd m1, m6 ; += rgb_UVrnd
    paddd m2, m3 ; (dword) { V[0-3] }
    paddd m0, m1 ; (dword) { U[0-3] }
    psrad m0, 9
    psrad m2, 9
    packssdw m0, m0 ; (word) { U[0-7] }
    packssdw m2, m2 ; (word) { V[0-7] }
    movd [dstUq+wq], m0
    movd [dstVq+wq], m2
    add wq, 2
    jl .loop2
.end:
    REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro

; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB32_FUNCS 2
RGB32_TO_Y_FN %1, r, g, b, a
RGB32_TO_Y_FN %1, b, g, r, a, rgba
RGB32_TO_Y_FN %1, a, r, g, b, rgba
RGB32_TO_Y_FN %1, a, b, g, r, rgba

RGB32_TO_UV_FN %2, r, g, b, a
RGB32_TO_UV_FN %2, b, g, r, a, rgba
RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
RGB32_FUNCS 0, 0
%endif

INIT_XMM sse2
RGB32_FUNCS 8, 12

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
%endif

;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
;                      const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
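
; Illustrative aside (not part of the vendored source): a little-endian
; word load of YUYV memory (Y0 U0 Y1 V0 ...) yields the words
; (U0<<8 | Y0), (V0<<8 | Y1), ..., so "pand 0x00ff" keeps luma and
; "psrlw 8" keeps chroma; for UYVY (U0 Y0 V0 Y1 ...) the two operations
; swap roles, which is exactly the %ifidn yuyv/uyvy split in the loops
; below.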

; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
    mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
    pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
    pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
    psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
    psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
    packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
    mova [dstq+wq], m0
    add wq, mmsize
    jl .loop_%1
    REP_RET
%endmacro

; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. YUYV+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
    movsxd wq, wd
%endif
    add dstq, wq
%if mmsize == 16
    test srcq, 15
%endif
    lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
    pcmpeqb m2, m2 ; (byte) { 0xff } x 16
    psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
    jnz .loop_u_start
    neg wq
    LOOP_YUYV_TO_Y a, %2
.loop_u_start:
    neg wq
    LOOP_YUYV_TO_Y u, %2
%else ; mmsize == 8
    neg wq
    LOOP_YUYV_TO_Y a, %2
%endif ; mmsize == 8/16
%endmacro

; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
    mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
    psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
    psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
    vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
    vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
    mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
    pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
    pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
    packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
    pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
    psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
    packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 }
    movh [dstUq+wq], m1
    movhps [dstVq+wq], m1
%else ; mmsize == 8
    packuswb m1, m1 ; (byte) { U0, ... U3 }
    packuswb m0, m0 ; (byte) { V0, ... V3 }
    movh [dstUq+wq], m1
    movh [dstVq+wq], m0
%endif ; mmsize == 8/16
    add wq, mmsize / 2
    jl .loop_%1
    REP_RET
%endmacro

; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. UYVY+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
    movsxd wq, dword r5m
%else ; x86-32
    mov wq, r5m
%endif
    add dstUq, wq
    add dstVq, wq
%if mmsize == 16 && %0 == 2
    test srcq, 15
%endif
    lea srcq, [srcq+wq*4]
    pcmpeqb m2, m2 ; (byte) { 0xff } x 16
    psrlw m2, 8 ; (word) { 0x00ff } x 8
    ; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
    jnz .loop_u_start
    neg wq
    LOOP_YUYV_TO_UV a, %2
.loop_u_start:
    neg wq
    LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
    neg wq
    LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro

; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
    mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
    mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
    pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
    pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
    psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
    psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
    packuswb m2, m3 ; (byte) { U0, ..., U15 }
    packuswb m0, m1 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
    mova [dstUq+wq], m2
    mova [dstVq+wq], m0
%else ; nv21
    mova [dstVq+wq], m2
    mova [dstUq+wq], m0
%endif ; nv12/21
    add wq, mmsize
    jl .loop_%1
    REP_RET
%endmacro

; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
    movsxd wq, dword r5m
%else ; x86-32
    mov wq, r5m
%endif
    add dstUq, wq
    add dstVq, wq
%if mmsize == 16
    test srcq, 15
%endif
    lea srcq, [srcq+wq*2]
    pcmpeqb m5, m5 ; (byte) { 0xff } x 16
    psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
    jnz .loop_u_start
    neg wq
    LOOP_NVXX_TO_UV a, %2
.loop_u_start:
    neg wq
    LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
    neg wq
    LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif

INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%endif

externals/ffmpeg/libswscale/x86/output.asm (vendored executable file, 425 additions)
@@ -0,0 +1,425 @@
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start: times 4 dd 0x10000
yuv2yuvX_9_start: times 4 dd 0x20000
yuv2yuvX_10_upper: times 8 dw 0x3ff
yuv2yuvX_9_upper: times 8 dw 0x1ff
pd_4: times 4 dd 4
pd_4min0x40000: times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
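
; Illustrative aside, the reference behaviour implied by the constants
; above for 9/10-bit output (a sketch, not FFmpeg's C template):
;     shift = 27 - output_size
;     val   = 1 << (shift - 1)              ; == yuv2yuvX_<size>_start
;     for j in 0 .. filterSize-1: val += filter[j] * src[j][i]
;     dst[i] = clip(val >> shift, 0, (1 << output_size) - 1)
; For 8-bit output the flat rounding term is replaced by the per-pixel
; dither bytes shifted left by 12 before the accumulation.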
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
; the rep here is for the 8-bit output MMX case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova m2, [rsp+mmsize*(0+%%i)]
    mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova m2, m8
    mova m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova m1, [yuv2yuvX_%1_start]
    mova m2, m1
%endif ; %1 == 8/9/10/16
    movsx cntr_reg, fltsizem
.filterloop_%2_ %+ %%i:
    ; input pixels
    mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova m3, [r6+r5*4]
    mova m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova m4, [r6+r5*4]
    mova m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw m7, m0, 0 ; coeff[0]
    pshuflw m0, m0, 0x55 ; coeff[1]
    pmovsxwd m7, m7 ; word -> dword
    pmovsxwd m0, m0 ; word -> dword

    pmulld m3, m7
    pmulld m5, m7
    pmulld m4, m0
    pmulld m6, m0

    paddd m2, m3
    paddd m1, m5
    paddd m2, m4
    paddd m1, m6
%else ; %1 == 10/9/8
    punpcklwd m5, m3, m4
    punpckhwd m3, m4
    SPLATD m0

    pmaddwd m5, m0
    pmaddwd m3, m0

    paddd m2, m5
    paddd m1, m3
%endif ; %1 == 8/9/10/16

    sub cntr_reg, 2
    jg .filterloop_%2_ %+ %%i

%if %1 == 16
    psrad m2, 31 - %1
    psrad m1, 31 - %1
%else ; %1 == 10/9/8
    psrad m2, 27 - %1
    psrad m1, 27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw m2, m1
    packuswb m2, m2
    movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw m2, m1
    paddw m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw m2, m1
%else ; mmxext/sse2
    packssdw m2, m1
    pmaxsw m2, m6
%endif ; mmxext/sse2/sse4/avx
    pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16

    add r5, mmsize/2
    sub wd, mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop_%2
%endmacro

%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor m6, m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq m_dith, [ditherq] ; dither
    test offsetd, offsetd
    jz .no_rot
%if mmsize == 16
    punpcklqdq m_dith, m_dith
%endif ; mmsize == 16
    PALIGNR m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
    punpcklbw m_dith, m6
%if ARCH_X86_64
    punpcklwd m8, m_dith, m6
    pslld m8, 12
%else ; x86-32
    punpcklwd m5, m_dith, m6
    pslld m5, 12
%endif ; x86-32/64
    punpckhwd m_dith, m6
    pslld m_dith, 12
%if ARCH_X86_32
    mova [rsp+ 0], m5
    mova [rsp+16], m_dith
%endif
%else ; mmsize == 8
    punpcklbw m5, m_dith, m6
    punpckhbw m_dith, m6
    punpcklwd m4, m5, m6
    punpckhwd m5, m6
    punpcklwd m3, m_dith, m6
    punpckhwd m_dith, m6
    pslld m4, 12
    pslld m5, 12
    pslld m3, 12
    pslld m_dith, 12
    mova [rsp+ 0], m4
    mova [rsp+ 8], m5
    mova [rsp+16], m3
    mova [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor r5, r5

%if mmsize == 8 || %1 == 8
    yuv2planeX_mainloop %1, a
%else ; mmsize == 16
    test dstq, 15
    jnz .unaligned
    yuv2planeX_mainloop %1, a
    REP_RET
.unaligned:
    yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16

%if %1 == 8
%if ARCH_X86_32
    ADD rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5
yuv2planeX_fn 10, 0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5

INIT_XMM sse4
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
%endif

; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw m0, m2, [srcq+wq*2+mmsize*0]
    paddsw m1, m3, [srcq+wq*2+mmsize*1]
    psraw m0, 7
    psraw m1, 7
    packuswb m0, m1
    mov%2 [dstq+wq], m0
%elif %1 == 16
    paddd m0, m4, [srcq+wq*4+mmsize*0]
    paddd m1, m4, [srcq+wq*4+mmsize*1]
    paddd m2, m4, [srcq+wq*4+mmsize*2]
    paddd m3, m4, [srcq+wq*4+mmsize*3]
    psrad m0, 3
    psrad m1, 3
    psrad m2, 3
    psrad m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw m0, m1
    packusdw m2, m3
%else ; mmx/sse2
    packssdw m0, m1
    packssdw m2, m3
    paddw m0, m5
    paddw m2, m5
%endif ; mmx/sse2/sse4/avx
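    ; Illustrative aside: pd_4min0x40000 = 4 - 0x40000 folds the rounding
    ; term and a -0x8000 bias into one add, since (x + 4 - 0x40000) >> 3
    ; equals ((x + 4) >> 3) - 0x8000; signed packssdw then saturates
    ; correctly, and the paddw with minshort (0x8000) restores the
    ; unsigned range, substituting for sse4's packusdw.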
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw m0, m2, [srcq+wq*2+mmsize*0]
    paddsw m1, m2, [srcq+wq*2+mmsize*1]
    psraw m0, 15 - %1
    psraw m1, 15 - %1
    pmaxsw m0, m4
    pmaxsw m1, m4
    pminsw m0, m3
    pminsw m1, m3
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m1
%endif
    add wq, mmsize
    jl .loop_%2
%endmacro

%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn wq, wd
    add wq, mmsize - 1
    and wq, ~(mmsize - 1)
%if %1 == 8
    add dstq, wq
%else ; %1 != 8
    lea dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea srcq, [srcq+wq*4]
%else ; %1 != 16
    lea srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg wq

%if %1 == 8
    pxor m4, m4 ; zero

    ; create registers holding dither
    movq m3, [ditherq] ; dither
    test offsetd, offsetd
    jz .no_rot
%if mmsize == 16
    punpcklqdq m3, m3
%endif ; mmsize == 16
    PALIGNR m3, m3, 3, m2
.no_rot:
%if mmsize == 8
    mova m2, m3
    punpckhbw m3, m4 ; byte->word
    punpcklbw m2, m4 ; byte->word
%else
    punpcklbw m3, m4
    mova m2, m3
%endif
%elif %1 == 9
    pxor m4, m4
    mova m3, [pw_512]
    mova m2, [pw_32]
%elif %1 == 10
    pxor m4, m4
    mova m3, [pw_1024]
    mova m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova m4, [pd_4]
%else ; mmx/sse2
    mova m4, [pd_4min0x40000]
    mova m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    test dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn 9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

externals/ffmpeg/libswscale/x86/rgb2rgb.c (vendored executable file, 200 additions)
@@ -0,0 +1,200 @@
/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"

#if HAVE_INLINE_ASM

DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;

DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);

#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
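
/* Illustrative aside (hypothetical helper, not part of this file): the
 * coefficients above are used in the pattern
 *
 *     static inline int rgb_to_y(int r, int g, int b)
 *     {
 *         return ((RY * r + GY * g + BY * b +
 *                  (1 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT) + 16;
 *     }
 *
 * i.e. BT.601 limited-range luma, Y = 0.257 R + 0.504 G + 0.098 B + 16,
 * computed with rounding in fixed-point arithmetic (RGB2YUV_SHIFT is
 * defined in swscale_internal.h). */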
|
||||
// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one.
|
||||
|
||||
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0

//MMX versions
#undef RENAME
#define RENAME(a) a ## _mmx
#include "rgb2rgb_template.c"

// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "rgb2rgb_template.c"

//SSE2 versions
#undef RENAME
#undef COMPILE_TEMPLATE_SSE2
#define COMPILE_TEMPLATE_SSE2 1
#define RENAME(a) a ## _sse2
#include "rgb2rgb_template.c"

//AVX versions
#undef RENAME
#undef COMPILE_TEMPLATE_AVX
#define COMPILE_TEMPLATE_AVX 1
#define RENAME(a) a ## _avx
#include "rgb2rgb_template.c"

//3DNOW versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_AVX
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3dnow
#include "rgb2rgb_template.c"

/*
  RGB15->RGB16 original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMXEXT, 3DNOW optimization by Nick Kurshev
  32-bit C version, and the and&add trick by Michael Niedermayer
*/

#endif /* HAVE_INLINE_ASM */

void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);

#if ARCH_X86_64
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                          const uint8_t *src, int width, int height,
                          int lumStride, int chromStride, int srcStride);
void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         const uint8_t *src, int width, int height,
                         int lumStride, int chromStride, int srcStride);
#endif

av_cold void rgb2rgb_init_x86(void)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags))
        rgb2rgb_init_mmx();
    if (INLINE_AMD3DNOW(cpu_flags))
        rgb2rgb_init_3dnow();
    if (INLINE_MMXEXT(cpu_flags))
        rgb2rgb_init_mmxext();
    if (INLINE_SSE2(cpu_flags))
        rgb2rgb_init_sse2();
    if (INLINE_AVX(cpu_flags))
        rgb2rgb_init_avx();
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
#if ARCH_X86_64
        uyvytoyuv422 = ff_uyvytoyuv422_sse2;
#endif
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
#if ARCH_X86_64
        uyvytoyuv422 = ff_uyvytoyuv422_avx;
#endif
    }
}
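
/* Illustrative note (not in the original source): how a caller would rely on
 * this one-time dispatch. rgb2rgb_init() is assumed here to invoke the
 * per-arch initializer once, after which the exported function pointers refer
 * to the fastest implementation the CPU supports:
 *
 *     rgb2rgb_init();                      // selects MMX/SSE2/SSSE3/AVX paths
 *     shuffle_bytes_2103(src, dst, size);  // now calls e.g. the SSSE3 version
 */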
2545
externals/ffmpeg/libswscale/x86/rgb2rgb_template.c
vendored
Executable file
File diff suppressed because it is too large
300
externals/ffmpeg/libswscale/x86/rgb_2_rgb.asm
vendored
Executable file
@@ -0,0 +1,300 @@
;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION .text

%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
    psrldq %1, %2, %3
%else
    mova   %1, %2
    RSHIFT %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    mova   m6, [pb_mask_shuffle2103_mmx]
    mova   m7, m6
    psllq  m7, 8

    movsxdifnidn wq, wd
    mov    xq, wq

    add    srcq, wq
    add    dstq, wq
    neg    wq

; calc scalar loop count
    and    xq, mmsize*2 - 4
    je .loop_simd

.loop_scalar:
    mov    tmpb, [srcq + wq + 2]
    mov    [dstq+wq + 0], tmpb
    mov    tmpb, [srcq + wq + 1]
    mov    [dstq+wq + 1], tmpb
    mov    tmpb, [srcq + wq + 0]
    mov    [dstq+wq + 2], tmpb
    mov    tmpb, [srcq + wq + 3]
    mov    [dstq+wq + 3], tmpb
    add    wq, 4
    sub    xq, 4
    jg .loop_scalar

; check if src_size < mmsize * 2
    cmp    wq, 0
    jge .end

.loop_simd:
    movu   m0, [srcq+wq]
    movu   m1, [srcq+wq+8]

    pshufw m3, m0, 177
    pshufw m5, m1, 177

    pand   m0, m7
    pand   m3, m6

    pand   m1, m7
    pand   m5, m6

    por    m0, m3
    por    m1, m5

    movu   [dstq+wq], m0
    movu   [dstq+wq + 8], m1

    add    wq, mmsize*2
    jl .loop_simd

.end:
    RET

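; Illustrative note (not in the original source): in plain C the 2103 shuffle
; is a per-pixel BGRA<->RGBA byte swap. A minimal reference, assuming src_size
; is a multiple of 4:
;
;     void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst, int src_size)
;     {
;         for (int i = 0; i < src_size; i += 4) {
;             dst[i + 0] = src[i + 2];
;             dst[i + 1] = src[i + 1];
;             dst[i + 2] = src[i + 0];
;             dst[i + 3] = src[i + 3];
;         }
;     }
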
;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-4 index shuffle
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
    VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
    movsxdifnidn wq, wd
    mov    xq, wq

    add    srcq, wq
    add    dstq, wq
    neg    wq

; calc scalar loop count
    and    xq, mmsize-4
    je .loop_simd

.loop_scalar:
    mov    tmpb, [srcq + wq + %1]
    mov    [dstq+wq + 0], tmpb
    mov    tmpb, [srcq + wq + %2]
    mov    [dstq+wq + 1], tmpb
    mov    tmpb, [srcq + wq + %3]
    mov    [dstq+wq + 2], tmpb
    mov    tmpb, [srcq + wq + %4]
    mov    [dstq+wq + 3], tmpb
    add    wq, 4
    sub    xq, 4
    jg .loop_scalar

; check if src_size < mmsize
    cmp    wq, 0
    jge .end

.loop_simd:
    movu   m1, [srcq+wq]
    pshufb m1, m0
    movu   [dstq+wq], m1
    add    wq, mmsize
    jl .loop_simd

.end:
    RET
%endmacro

INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0

;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;              const uint8_t *src, int width, int height,
;              int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
    pxor    m0, m0
    pcmpeqw m1, m1
    psrlw   m1, 8

    movsxdifnidn            wq, wd
    movsxdifnidn   lum_strideq, lum_strided
    movsxdifnidn chrom_strideq, chrom_strided
    movsxdifnidn   src_strideq, src_strided

    mov back_wq, wq
    mov  whalfq, wq
    shr  whalfq, 1     ; whalf = width / 2

    lea srcq, [srcq + wq * 2]
    add ydstq, wq
    add udstq, whalfq
    add vdstq, whalfq

.loop_line:
    mov    xq, wq
    mov wtwoq, wq
    add wtwoq, wtwoq     ; wtwo = width * 2

    neg     wq
    neg  wtwoq
    neg whalfq

    ; calc scalar loop count
    and xq, mmsize * 2 - 1
    je .loop_simd

.loop_scalar:
    mov tmpb, [srcq + wtwoq + 0]
    mov [udstq + whalfq], tmpb

    mov tmpb, [srcq + wtwoq + 1]
    mov [ydstq + wq], tmpb

    mov tmpb, [srcq + wtwoq + 2]
    mov [vdstq + whalfq], tmpb

    mov tmpb, [srcq + wtwoq + 3]
    mov [ydstq + wq + 1], tmpb

    add     wq, 2
    add  wtwoq, 4
    add whalfq, 1
    sub     xq, 2
    jg .loop_scalar

    ; check if the simd loop is needed
    cmp wq, 0
    jge .end_line

.loop_simd:
    movu m2, [srcq + wtwoq             ]
    movu m3, [srcq + wtwoq + mmsize    ]
    movu m4, [srcq + wtwoq + mmsize * 2]
    movu m5, [srcq + wtwoq + mmsize * 3]

    ; extract y part 1
    RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m6, m1    ; YxYx YxYx...

    RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m7, m1    ; YxYx YxYx...

    packuswb    m6, m7    ; YYYY YYYY...
    movu [ydstq + wq], m6

    ; extract y part 2
    RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m6, m1    ; YxYx YxYx...

    RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m7, m1    ; YxYx YxYx...

    packuswb    m6, m7    ; YYYY YYYY...
    movu [ydstq + wq + mmsize], m6

    ; extract uv
    pand m2, m1   ; UxVx...
    pand m3, m1   ; UxVx...
    pand m4, m1   ; UxVx...
    pand m5, m1   ; UxVx...

    packuswb m2, m3   ; UVUV...
    packuswb m4, m5   ; UVUV...

    ; U
    pand m6, m2, m1 ; UxUx...
    pand m7, m4, m1 ; UxUx...

    packuswb m6, m7 ; UUUU
    movu [udstq + whalfq], m6

    ; V
    psrlw    m2, 8  ; VxVx...
    psrlw    m4, 8  ; VxVx...
    packuswb m2, m4 ; VVVV
    movu [vdstq + whalfq], m2

    add whalfq, mmsize
    add  wtwoq, mmsize * 4
    add     wq, mmsize * 2
    jl .loop_simd

.end_line:
    add  srcq, src_strideq
    add ydstq, lum_strideq
    add udstq, chrom_strideq
    add vdstq, chrom_strideq

    ; restore the initial state of the line variables
    mov     wq, back_wq
    mov     xq, wq
    mov whalfq, wq
    shr whalfq, 1 ; whalf = width / 2
    sub     hd, 1
    jg .loop_line

    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422

INIT_XMM avx
UYVY_TO_YUV422
%endif
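
; Illustrative note (not in the original source): the packed->planar split this
; macro performs, written as plain C over one line (UYVY stores U Y0 V Y1 per
; pixel pair):
;
;     for (int i = 0; i < width / 2; i++) {
;         udst[i]         = src[4 * i + 0];
;         ydst[2 * i + 0] = src[4 * i + 1];
;         vdst[i]         = src[4 * i + 2];
;         ydst[2 * i + 1] = src[4 * i + 3];
;     }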
423
externals/ffmpeg/libswscale/x86/scale.asm
vendored
Executable file
@@ -0,0 +1,423 @@
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit width or 16-bit width
; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------

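; Illustrative note (not in the original source): a plain-C reference of the
; operation implemented below, for the filterSize == 4, 15-bit output case:
;
;     for (int i = 0; i < dstW; i++) {
;         int pos = filterPos[i], sum = 0;
;         for (int j = 0; j < 4; j++)
;             sum += src[pos + j] * filter[4 * i + j];
;         dst[i] = sum >> 7;   // 8-bit src * 14-bit filter -> 15-bit output
;     }
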
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova   m2, [max_19bit_int]
%elif cpuflag(sse4)
    mova   m2, [max_19bit_int]
%else ; ssse3/sse2
    mova   m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova   m6, [minshort]
    mova   m7, [unicoeff]
%elif %1 == 8
    pxor   m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl    wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea    filterq, [filterq+wq*8]
%if %2 == 15
    lea    dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea    dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea    fltposq, [fltposq+wq*(4>>wshr)]
    neg    wq

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32  pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
    mov32  pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
    movlh  m0, [srcq+pos0q*srcmul]        ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh  m1, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps m0, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd   m4, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32  pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
    mov32  pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
    movlh  m1, [srcq+pos0q*srcmul]        ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps m1, [srcq+pos1q*srcmul]        ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd   m5, [srcq+pos1q*srcmul]        ; src[filterPos[3] + {0,1,2,3}]
    punpckldq m0, m4
    punpckldq m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw m0, m3                      ; byte -> word
    punpcklbw m1, m3                      ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw  m0, m6
    psubw  m1, m6
%endif ; %1 == 16
    pmaddwd m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq   m4, m0
    punpckldq m0, m1
    punpckhdq m4, m1
    paddd  m0, m4
%elif notcpuflag(ssse3) ; sse2
    mova   m4, m0
    shufps m0, m1, 10001000b
    shufps m4, m1, 11011101b
    paddd  m0, m4
%else ; ssse3/sse4
    phaddd m0, m1                         ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                          ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                          ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                          ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32  pos0q, dword [fltposq+wq*2+0]  ; filterPos[0]
    mov32  pos1q, dword [fltposq+wq*2+4]  ; filterPos[1]
    movbh  m0, [srcq+ pos0q   *srcmul]    ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh  m1, [srcq+(pos0q+4)*srcmul]    ; src[filterPos[0] + {4,5,6,7}]
    movbh  m4, [srcq+ pos1q   *srcmul]    ; src[filterPos[1] + {0,1,2,3}]
    movbh  m5, [srcq+(pos1q+4)*srcmul]    ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh  m1, [srcq+ pos1q   *srcmul]    ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32  pos0q, dword [fltposq+wq*2+8]  ; filterPos[2]
    mov32  pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
    movbh  m4, [srcq+ pos0q   *srcmul]    ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh  m5, [srcq+ pos1q   *srcmul]    ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw m0, m3                      ; byte -> word
    punpcklbw m1, m3                      ; byte -> word
    punpcklbw m4, m3                      ; byte -> word
    punpcklbw m5, m3                      ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw  m0, m6
    psubw  m1, m6
    psubw  m4, m6
    psubw  m5, m6
%endif ; %1 == 16
    pmaddwd m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
    pmaddwd m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
    pmaddwd m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd  m0, m1
    paddd  m4, m5
    movq   m1, m0
    punpckldq m0, m4
    punpckhdq m1, m4
    paddd  m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova   mex, m0
    punpckldq m0, m1
    punpckhdq mex, m1
    paddd  m0, mex
    mova   m1, m4
    punpckldq m4, m5
    punpckhdq m1, m5
    paddd  m4, m1
    mova   m1, m0
    punpcklqdq m0, m4
    punpckhqdq m1, m4
    paddd  m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd m0, m1
    phaddd m4, m5
    phaddd m0, m4                         ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                          ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                          ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                          ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd fltsizeq, fltsized             ; filterSize
    lea    srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea    pos0q, [srcmemq+(fltsizeq-dlt)*srcmul]   ; &src[filterSize&~4]
    mov    srcendq, pos0q
%endif ; x86-32/64
    lea    fltposq, [fltposq+wq*4]
%if %2 == 15
    lea    dstq, [dstq+wq*2]
%else ; %2 == 19
    lea    dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn dstmp, dstq
    neg    wq

.loop:
    mov32  pos0q, dword [fltposq+wq*4+0]  ; filterPos[0]
    mov32  pos1q, dword [fltposq+wq*4+4]  ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor   m4, m4
    pxor   m5, m5
    mov    srcq, srcmemmp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh  m0, [srcq+ pos0q     *srcmul]  ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh  m1, [srcq+(pos1q+dlt)*srcmul]  ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw m0, m3
    punpcklbw m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw  m0, m6
    psubw  m1, m6
%endif ; %1 == 16
    pmaddwd m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd m1, [filterq+(fltsizeq+dlt)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd  m4, m0
    paddd  m5, m1
    add    filterq, mmsize
    add    srcq, srcmul*mmsize/2
    cmp    srcq, srcendq                  ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32  pos1q, dword [fltposq+wq*4+4]  ; filterPos[1]
    movlh  m0, [srcq+ pos0q     *srcmul]  ; split last 4 srcpx of dstpx[0]
    sub    pos1q, fltsizeq                ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd   m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw  m0, m6
%endif ; %1 == 16
    pmaddwd m0, [filterq]
%endif ; %4 == X4

    lea    filterq, [filterq+(fltsizeq+dlt)*2]

%if mmsize == 8 ; mmx
    movq   m0, m4
    punpckldq m4, m5
    punpckhdq m0, m5
    paddd  m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
    mova   m1, m4
    punpcklqdq m4, m5
    punpckhqdq m1, m5
    paddd  m4, m1
%else ; ssse3/sse4
    phaddd m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd  m4, m0
%endif ; %3 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd m4, m4, 11011000b
    movhlps m0, m4
    paddd  m0, m4
%else ; ssse3/sse4
    phaddd m4, m4
    SWAP    0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd  m0, m7
%endif ; %1 == 16

    ; clip, store
    psrad  m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw m0, m0
%ifnidn %3, X
    movh   [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd   [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
    PMINSD m0, m2, m4
%ifnidn %3, X
    mova   [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq   [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add    wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
                                ; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
    add    wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, 7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro

; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
583
externals/ffmpeg/libswscale/x86/swscale.c
vendored
Executable file
@@ -0,0 +1,583 @@
/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"

const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
    0x0103010301030103LL,
    0x0200020002000200LL,};

const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
    0x0602060206020602LL,
    0x0004000400040004LL,};

#if HAVE_INLINE_ASM

#define DITHER1XBPP

DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;

DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;

DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;

DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;

//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "swscale_template.c"
#endif

// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "swscale_template.c"
#endif

void ff_updateMMXDitherTables(SwsContext *c, int dstY)
{
    const int dstH= c->dstH;
    const int flags= c->flags;

    SwsPlane *lumPlane = &c->slice[c->numSlice-2].plane[0];
    SwsPlane *chrUPlane = &c->slice[c->numSlice-2].plane[1];
    SwsPlane *alpPlane = &c->slice[c->numSlice-2].plane[3];

    int hasAlpha = c->needAlpha;
    int32_t *vLumFilterPos= c->vLumFilterPos;
    int32_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int chrDstY= dstY>>c->chrDstVSubSample;
    const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
    const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input

    c->blueDither= ff_dither8[dstY&1];
    if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
        c->greenDither= ff_dither8[dstY&1];
    else
        c->greenDither= ff_dither4[dstY&1];
    c->redDither= ff_dither8[(dstY+1)&1];
    if (dstY < dstH - 2) {
        const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
        const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
        const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && hasAlpha) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;

        int i;
        if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
            const int16_t **tmpY = (const int16_t **) lumPlane->tmp;

            int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
            for (i = 0; i < neg; i++)
                tmpY[i] = lumSrcPtr[neg];
            for ( ; i < end; i++)
                tmpY[i] = lumSrcPtr[i];
            for ( ; i < vLumFilterSize; i++)
                tmpY[i] = tmpY[i-1];
            lumSrcPtr = tmpY;

            if (alpSrcPtr) {
                const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
                for (i = 0; i < neg; i++)
                    tmpA[i] = alpSrcPtr[neg];
                for ( ; i < end; i++)
                    tmpA[i] = alpSrcPtr[i];
                for ( ; i < vLumFilterSize; i++)
                    tmpA[i] = tmpA[i - 1];
                alpSrcPtr = tmpA;
            }
        }
        if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
            const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
            int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
            for (i = 0; i < neg; i++) {
                tmpU[i] = chrUSrcPtr[neg];
            }
            for ( ; i < end; i++) {
                tmpU[i] = chrUSrcPtr[i];
            }
            for ( ; i < vChrFilterSize; i++) {
                tmpU[i] = tmpU[i - 1];
            }
            chrUSrcPtr = tmpU;
        }

        if (flags & SWS_ACCURATE_RND) {
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2) {
                *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                lumMmxFilter[s*i+APCK_COEF/4  ]=
                lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1] * (1 << 16) : 0);
                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                    *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                    alpMmxFilter[s*i+APCK_COEF/4  ]=
                    alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                }
            }
            for (i=0; i<vChrFilterSize; i+=2) {
                *(const void**)&chrMmxFilter[s*i              ]= chrUSrcPtr[i  ];
                *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrUSrcPtr[i+(vChrFilterSize>1)];
                chrMmxFilter[s*i+APCK_COEF/4  ]=
                chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1] * (1 << 16) : 0);
            }
        } else {
            for (i=0; i<vLumFilterSize; i++) {
                *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                    *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
                    alpMmxFilter[4*i+2]=
                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                }
            }
            for (i=0; i<vChrFilterSize; i++) {
                *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
            }
        }
    }
}

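/* Illustrative sketch (not in the original source): the tmp-row clamping above
 * replicates edge rows so the vertical filter can read "outside" the slice.
 * In isolation the idea is:
 *
 *     // rows[j] must be valid for j in [0, filterSize); out-of-range source
 *     // rows are clamped to the first/last available line
 *     for (int j = 0; j < filterSize; j++) {
 *         int srcRow = av_clip(firstRow + j, 0, srcH - 1);
 *         rows[j] = src[srcRow];
 *     }
 */
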
#if HAVE_MMXEXT
static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset)
{
    if(((uintptr_t)dest) & 15){
        yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
        return;
    }
    filterSize--;
#define MAIN_FUNCTION \
        "pxor       %%xmm0, %%xmm0 \n\t" \
        "punpcklbw  %%xmm0, %%xmm3 \n\t" \
        "movd           %4, %%xmm1 \n\t" \
        "punpcklwd  %%xmm1, %%xmm1 \n\t" \
        "punpckldq  %%xmm1, %%xmm1 \n\t" \
        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
        "psllw          $3, %%xmm1 \n\t" \
        "paddw      %%xmm1, %%xmm3 \n\t" \
        "psraw          $4, %%xmm3 \n\t" \
        "movdqa     %%xmm3, %%xmm4 \n\t" \
        "movdqa     %%xmm3, %%xmm7 \n\t" \
        "movl           %3, %%ecx  \n\t" \
        "mov            %0, %%"FF_REG_d"       \n\t"\
        "mov            (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
        ".p2align       4          \n\t" /* FIXME Unroll? */\
        "1:                        \n\t"\
        "movddup        8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\
        "movdqa         (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
        "movdqa         16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
        "add            $16, %%"FF_REG_d"      \n\t"\
        "mov            (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
        "test           %%"FF_REG_S", %%"FF_REG_S"    \n\t"\
        "pmulhw         %%xmm0, %%xmm2 \n\t"\
        "pmulhw         %%xmm0, %%xmm5 \n\t"\
        "paddw          %%xmm2, %%xmm3 \n\t"\
        "paddw          %%xmm5, %%xmm4 \n\t"\
        " jnz           1b             \n\t"\
        "psraw          $3, %%xmm3     \n\t"\
        "psraw          $3, %%xmm4     \n\t"\
        "packuswb       %%xmm4, %%xmm3 \n\t"\
        "movntdq        %%xmm3, (%1, %%"FF_REG_c") \n\t"\
        "add            $16, %%"FF_REG_c"          \n\t"\
        "cmp            %2, %%"FF_REG_c"           \n\t"\
        "movdqa         %%xmm7, %%xmm3 \n\t" \
        "movdqa         %%xmm7, %%xmm4 \n\t" \
        "mov            %0, %%"FF_REG_d"           \n\t"\
        "mov            (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
        "jb             1b             \n\t"

    if (offset) {
        __asm__ volatile(
            "movq          %5, %%xmm3  \n\t"
            "movdqa    %%xmm3, %%xmm4  \n\t"
            "psrlq        $24, %%xmm3  \n\t"
            "psllq        $40, %%xmm4  \n\t"
            "por       %%xmm4, %%xmm3  \n\t"
            MAIN_FUNCTION
              :: "g" (filter),
              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
              "m"(filterSize), "m"(((uint64_t *) dither)[0])
              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
              );
    } else {
        __asm__ volatile(
            "movq          %5, %%xmm3   \n\t"
            MAIN_FUNCTION
              :: "g" (filter),
              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
              "m"(filterSize), "m"(((uint64_t *) dither)[0])
              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
              );
    }
}
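
/* Illustrative sketch (not in the original source): the scalar operation the
 * inline-asm loop above vectorizes, one output pixel at a time. This assumes
 * the usual swscale vertical-filter convention (12-bit dither headroom, 19-bit
 * downshift):
 *
 *     for (int i = 0; i < dstW; i++) {
 *         int val = dither[(i + offset) & 7] << 12;
 *         for (int j = 0; j < filterSize; j++)
 *             val += src[j][i] * filter[j];
 *         dest[i] = av_clip_uint8(val >> 19);
 *     }
 */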
#endif

#endif /* HAVE_INLINE_ASM */

#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
                                                SwsContext *c, int16_t *data, \
                                                int dstW, const uint8_t *src, \
                                                const int16_t *filter, \
                                                const int32_t *filterPos, int filterSize)

#define SCALE_FUNCS(filter_n, opt) \
    SCALE_FUNC(filter_n,  8, 15, opt); \
    SCALE_FUNC(filter_n,  9, 15, opt); \
    SCALE_FUNC(filter_n, 10, 15, opt); \
    SCALE_FUNC(filter_n, 12, 15, opt); \
    SCALE_FUNC(filter_n, 14, 15, opt); \
    SCALE_FUNC(filter_n, 16, 15, opt); \
    SCALE_FUNC(filter_n,  8, 19, opt); \
    SCALE_FUNC(filter_n,  9, 19, opt); \
    SCALE_FUNC(filter_n, 10, 19, opt); \
    SCALE_FUNC(filter_n, 12, 19, opt); \
    SCALE_FUNC(filter_n, 14, 19, opt); \
    SCALE_FUNC(filter_n, 16, 19, opt)

#define SCALE_FUNCS_MMX(opt) \
    SCALE_FUNCS(4, opt); \
    SCALE_FUNCS(8, opt); \
    SCALE_FUNCS(X, opt)

#define SCALE_FUNCS_SSE(opt) \
    SCALE_FUNCS(4, opt); \
    SCALE_FUNCS(8, opt); \
    SCALE_FUNCS(X4, opt); \
    SCALE_FUNCS(X8, opt)

#if ARCH_X86_32
SCALE_FUNCS_MMX(mmx);
#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);

#define VSCALEX_FUNC(size, opt) \
void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
                                        const int16_t **src, uint8_t *dest, int dstW, \
                                        const uint8_t *dither, int offset)
#define VSCALEX_FUNCS(opt) \
    VSCALEX_FUNC(8,  opt); \
    VSCALEX_FUNC(9,  opt); \
    VSCALEX_FUNC(10, opt)

#if ARCH_X86_32
VSCALEX_FUNCS(mmxext);
#endif
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
VSCALEX_FUNCS(avx);

#define VSCALE_FUNC(size, opt) \
void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
                                        const uint8_t *dither, int offset)
#define VSCALE_FUNCS(opt1, opt2) \
    VSCALE_FUNC(8,  opt1); \
    VSCALE_FUNC(9,  opt2); \
    VSCALE_FUNC(10, opt2); \
    VSCALE_FUNC(16, opt1)

#if ARCH_X86_32
VSCALE_FUNCS(mmx, mmxext);
#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);

#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_  ## opt(uint8_t *dst, const uint8_t *src, \
                                const uint8_t *unused1, const uint8_t *unused2, \
                                int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
                                const uint8_t *unused0, \
                                const uint8_t *src1, \
                                const uint8_t *src2, \
                                int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
    INPUT_Y_FUNC(fmt, opt); \
    INPUT_UV_FUNC(fmt, opt)
#define INPUT_FUNCS(opt) \
    INPUT_FUNC(uyvy, opt); \
    INPUT_FUNC(yuyv, opt); \
    INPUT_UV_FUNC(nv12, opt); \
    INPUT_UV_FUNC(nv21, opt); \
    INPUT_FUNC(rgba, opt); \
    INPUT_FUNC(bgra, opt); \
    INPUT_FUNC(argb, opt); \
    INPUT_FUNC(abgr, opt); \
    INPUT_FUNC(rgb24, opt); \
    INPUT_FUNC(bgr24, opt)

#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);

av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_MMX_INLINE
    if (INLINE_MMX(cpu_flags))
        sws_init_swscale_mmx(c);
#endif
#if HAVE_MMXEXT_INLINE
    if (INLINE_MMXEXT(cpu_flags))
        sws_init_swscale_mmxext(c);
    if (cpu_flags & AV_CPU_FLAG_SSE3){
        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
            c->yuv2planeX = yuv2yuvX_sse3;
    }
#endif

#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
    if (c->srcBpc == 8) { \
        hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
                                     ff_hscale8to19_ ## filtersize ## _ ## opt1; \
    } else if (c->srcBpc == 9) { \
        hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
                                     ff_hscale9to19_ ## filtersize ## _ ## opt1; \
    } else if (c->srcBpc == 10) { \
        hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
                                     ff_hscale10to19_ ## filtersize ## _ ## opt1; \
    } else if (c->srcBpc == 12) { \
        hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
                                     ff_hscale12to19_ ## filtersize ## _ ## opt1; \
    } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth<16)) { \
        hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
                                     ff_hscale14to19_ ## filtersize ## _ ## opt1; \
    } else { /* c->srcBpc == 16 */ \
        av_assert0(c->srcBpc == 16);\
        hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
                                     ff_hscale16to19_ ## filtersize ## _ ## opt1; \
    } \
} while (0)
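/* Illustrative note (not in the original source): for example,
 * ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx) below, with a
 * filter size of 4 and an 8-bit source, expands (conceptually) to:
 *
 *     c->hyScale = c->dstBpc <= 14 ? ff_hscale8to15_4_mmx
 *                                  : ff_hscale8to19_4_mmx;
 */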
#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
    switch (filtersize) { \
    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
    default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
    }
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
    switch(c->dstBpc){ \
    case 16: do_16_case; break; \
    case 10: if (!isBE(c->dstFormat) && c->dstFormat != AV_PIX_FMT_P010LE) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
    case 9:  if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
    case 8:  if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
    }
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
    switch(c->dstBpc){ \
    case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
    case 10: if (!isBE(c->dstFormat) && c->dstFormat != AV_PIX_FMT_P010LE && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
    case 9:  if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
    case 8:  vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
    default: av_assert0(c->dstBpc>8); \
    }
#define case_rgb(x, X, opt) \
        case AV_PIX_FMT_ ## X: \
            c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
            if (!c->chrSrcHSubSample) \
                c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
            break
#if ARCH_X86_32
    if (EXTERNAL_MMX(cpu_flags)) {
        ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
        ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);

        switch (c->srcFormat) {
        case AV_PIX_FMT_YA8:
            c->lumToYV12 = ff_yuyvToY_mmx;
            if (c->needAlpha)
                c->alpToYV12 = ff_uyvyToY_mmx;
            break;
        case AV_PIX_FMT_YUYV422:
            c->lumToYV12 = ff_yuyvToY_mmx;
            c->chrToYV12 = ff_yuyvToUV_mmx;
            break;
        case AV_PIX_FMT_UYVY422:
            c->lumToYV12 = ff_uyvyToY_mmx;
            c->chrToYV12 = ff_uyvyToUV_mmx;
            break;
        case AV_PIX_FMT_NV12:
            c->chrToYV12 = ff_nv12ToUV_mmx;
            break;
        case AV_PIX_FMT_NV21:
            c->chrToYV12 = ff_nv21ToUV_mmx;
            break;
        case_rgb(rgb24, RGB24, mmx);
        case_rgb(bgr24, BGR24, mmx);
        case_rgb(bgra,  BGRA,  mmx);
        case_rgb(rgba,  RGBA,  mmx);
        case_rgb(abgr,  ABGR,  mmx);
        case_rgb(argb,  ARGB,  mmx);
        default:
            break;
        }
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
    }
#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
    switch (filtersize) { \
    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
    default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
             else                ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
             break; \
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
                            HAVE_ALIGNED_STACK || ARCH_X86_64);
        ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);

        switch (c->srcFormat) {
        case AV_PIX_FMT_YA8:
            c->lumToYV12 = ff_yuyvToY_sse2;
            if (c->needAlpha)
                c->alpToYV12 = ff_uyvyToY_sse2;
            break;
        case AV_PIX_FMT_YUYV422:
            c->lumToYV12 = ff_yuyvToY_sse2;
            c->chrToYV12 = ff_yuyvToUV_sse2;
            break;
        case AV_PIX_FMT_UYVY422:
            c->lumToYV12 = ff_uyvyToY_sse2;
            c->chrToYV12 = ff_uyvyToUV_sse2;
            break;
        case AV_PIX_FMT_NV12:
            c->chrToYV12 = ff_nv12ToUV_sse2;
            break;
        case AV_PIX_FMT_NV21:
            c->chrToYV12 = ff_nv21ToUV_sse2;
            break;
        case_rgb(rgb24, RGB24, sse2);
        case_rgb(bgr24, BGR24, sse2);
        case_rgb(bgra,  BGRA,  sse2);
        case_rgb(rgba,  RGBA,  sse2);
        case_rgb(abgr,  ABGR,  sse2);
        case_rgb(argb,  ARGB,  sse2);
        default:
            break;
        }
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
        switch (c->srcFormat) {
        case_rgb(rgb24, RGB24, ssse3);
        case_rgb(bgr24, BGR24, ssse3);
        default:
            break;
        }
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        /* Xto15 don't need special sse4 functions */
        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
                            if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
                            HAVE_ALIGNED_STACK || ARCH_X86_64);
        if (c->dstBpc == 16 && !isBE(c->dstFormat))
            c->yuv2plane1 = ff_yuv2plane1_16_sse4;
    }

    if (EXTERNAL_AVX(cpu_flags)) {
        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
                            HAVE_ALIGNED_STACK || ARCH_X86_64);
        ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);

        switch (c->srcFormat) {
        case AV_PIX_FMT_YUYV422:
            c->chrToYV12 = ff_yuyvToUV_avx;
            break;
        case AV_PIX_FMT_UYVY422:
            c->chrToYV12 = ff_uyvyToUV_avx;
            break;
        case AV_PIX_FMT_NV12:
            c->chrToYV12 = ff_nv12ToUV_avx;
            break;
        case AV_PIX_FMT_NV21:
            c->chrToYV12 = ff_nv21ToUV_avx;
            break;
        case_rgb(rgb24, RGB24, avx);
        case_rgb(bgr24, BGR24, avx);
        case_rgb(bgra,  BGRA,  avx);
        case_rgb(rgba,  RGBA,  avx);
        case_rgb(abgr,  ABGR,  avx);
        case_rgb(argb,  ARGB,  avx);
        default:
            break;
        }
    }
}
1577
externals/ffmpeg/libswscale/x86/swscale_template.c
vendored
Executable file
File diff suppressed because it is too large
31
externals/ffmpeg/libswscale/x86/w64xmmtest.c
vendored
Executable file
@@ -0,0 +1,31 @@
/*
 * check XMM registers for clobbers on Win64
 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/w64xmmtest.h"
#include "libswscale/swscale.h"

wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
               const int srcStride[], int srcSliceY, int srcSliceH,
               uint8_t *const dst[], const int dstStride[]))
{
    testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
                    srcSliceH, dst, dstStride);
}
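
/* Illustrative note (not in the original source): wrap()/testxmmclobbers come
 * from libavutil/x86/w64xmmtest.h. The general idea, sketched under the
 * assumption that the header fills the Win64 callee-saved XMM registers with
 * canary values before the call and verifies them afterwards:
 *
 *     fill xmm6..xmm15 with a known pattern;
 *     ret = sws_scale(...);            // the wrapped call
 *     check the pattern is intact;     // abort if the ABI was violated
 */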
155
externals/ffmpeg/libswscale/x86/yuv2rgb.c
vendored
Executable file
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* software YUV to RGB converter
|
||||
*
|
||||
* Copyright (C) 2009 Konstantin Shishkov
|
||||
*
|
||||
* MMX/MMXEXT template stuff (needed for fast movntq support),
|
||||
* 1,4,8bpp support and context / deglobalize stuff
|
||||
* by Michael Niedermayer (michaelni@gmx.at)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "config.h"
|
||||
#include "libswscale/rgb2rgb.h"
|
||||
#include "libswscale/swscale.h"
|
||||
#include "libswscale/swscale_internal.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/x86/asm.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavutil/cpu.h"
|
||||
|
||||
#if HAVE_X86ASM
|
||||
|
||||
#define DITHER1XBPP // only for MMX
|
||||
|
||||
/* hope these constant values are cache line aligned */
|
||||
DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
|
||||
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
|
||||
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
|
||||
DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
|
||||
DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
|
||||
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
|
||||
|
||||
//MMX versions
|
||||
#if HAVE_MMX
|
||||
#undef RENAME
|
||||
#undef COMPILE_TEMPLATE_MMXEXT
|
||||
#define COMPILE_TEMPLATE_MMXEXT 0
|
||||
#define RENAME(a) a ## _mmx
|
||||
#include "yuv2rgb_template.c"
|
||||
#endif /* HAVE_MMX */
|
||||
|
||||
// MMXEXT versions
|
||||
#if HAVE_MMXEXT
|
||||
#undef RENAME
|
||||
#undef COMPILE_TEMPLATE_MMXEXT
|
||||
#define COMPILE_TEMPLATE_MMXEXT 1
|
||||
#define RENAME(a) a ## _mmxext
|
||||
#include "yuv2rgb_template.c"
|
||||
#endif /* HAVE_MMXEXT */
|
||||
|
||||
//SSSE3 versions
|
||||
#if HAVE_SSSE3
|
||||
#undef RENAME
|
||||
#undef COMPILE_TEMPLATE_MMXEXT
|
||||
#define COMPILE_TEMPLATE_MMXEXT 0
|
||||
#define RENAME(a) a ## _ssse3
|
||||
#include "yuv2rgb_template.c"
|
||||
#endif
|
||||
|
||||
#endif /* HAVE_X86ASM */
|
||||
|
||||
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSSE3(cpu_flags)) {
        switch (c->dstFormat) {
        case AV_PIX_FMT_RGB32:
            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
                return yuva420_rgb32_ssse3;
#endif
                break;
            } else
                return yuv420_rgb32_ssse3;
        case AV_PIX_FMT_BGR32:
            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
                return yuva420_bgr32_ssse3;
#endif
                break;
            } else
                return yuv420_bgr32_ssse3;
        case AV_PIX_FMT_RGB24:
            return yuv420_rgb24_ssse3;
        case AV_PIX_FMT_BGR24:
            return yuv420_bgr24_ssse3;
        case AV_PIX_FMT_RGB565:
            return yuv420_rgb16_ssse3;
        case AV_PIX_FMT_RGB555:
            return yuv420_rgb15_ssse3;
        }
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        switch (c->dstFormat) {
        case AV_PIX_FMT_RGB24:
            return yuv420_rgb24_mmxext;
        case AV_PIX_FMT_BGR24:
            return yuv420_bgr24_mmxext;
        }
    }

    if (EXTERNAL_MMX(cpu_flags)) {
        switch (c->dstFormat) {
        case AV_PIX_FMT_RGB32:
            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
                return yuva420_rgb32_mmx;
#endif
                break;
            } else
                return yuv420_rgb32_mmx;
        case AV_PIX_FMT_BGR32:
            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
                return yuva420_bgr32_mmx;
#endif
                break;
            } else
                return yuv420_bgr32_mmx;
        case AV_PIX_FMT_RGB24:
            return yuv420_rgb24_mmx;
        case AV_PIX_FMT_BGR24:
            return yuv420_bgr24_mmx;
        case AV_PIX_FMT_RGB565:
            return yuv420_rgb16_mmx;
        case AV_PIX_FMT_RGB555:
            return yuv420_rgb15_mmx;
        }
    }
#endif /* HAVE_X86ASM */

    return NULL;
}
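
/* The probes above run fastest-first: SSSE3, then MMXEXT, then plain MMX.
 * Returning NULL signals that no accelerated kernel fits, so the caller can
 * fall back to the generic C converter. */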
195
externals/ffmpeg/libswscale/x86/yuv2rgb_template.c
vendored
Executable file
@@ -0,0 +1,195 @@
/*
 * software YUV to RGB converter
 *
 * Copyright (C) 2001-2007 Michael Niedermayer
 * (c) 2010 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"

#define YUV2RGB_LOOP(depth)                                              \
    h_size = (c->dstW + 7) & ~7;                                         \
    if (h_size * depth > FFABS(dstStride[0]))                            \
        h_size -= 8;                                                     \
                                                                         \
    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                         \
                                                                         \
    for (y = 0; y < srcSliceH; y++) {                                    \
        uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0];     \
        const uint8_t *py = src[0] + y * srcStride[0];                   \
        const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1];       \
        const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2];       \
        x86_reg index = -h_size / 2;                                     \

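/* Note: YUV2RGB_LOOP opens the per-line for () loop but deliberately does
 * not close it; each RENAME(yuv420_*) function below supplies the loop body
 * (the call into the assembly kernel) and the closing brace. */
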
extern void RENAME(ff_yuv_420_rgb24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                     const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_bgr24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                     const uint8_t *py_2index);

#if !COMPILE_TEMPLATE_MMXEXT
extern void RENAME(ff_yuv_420_rgb15)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                     const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_rgb16)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                     const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                     const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                     const uint8_t *py_2index);
extern void RENAME(ff_yuva_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                      const uint8_t *py_2index, const uint8_t *pa_2index);
extern void RENAME(ff_yuva_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                      const uint8_t *py_2index, const uint8_t *pa_2index);

static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
                                       uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(2)

#ifdef DITHER1XBPP
        c->blueDither  = ff_dither8[y       & 1];
        c->greenDither = ff_dither8[y       & 1];
        c->redDither   = ff_dither8[(y + 1) & 1];
#endif

        RENAME(ff_yuv_420_rgb15)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
                                       uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(2)

#ifdef DITHER1XBPP
        c->blueDither  = ff_dither8[y       & 1];
        c->greenDither = ff_dither4[y       & 1];
        c->redDither   = ff_dither8[(y + 1) & 1];
#endif

        RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

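/* Dither tables: RGB565 (above) uses the finer ff_dither4 for green, since
 * its 6-bit green channel needs only half the dither amplitude of the 5-bit
 * channels, while RGB555 dithers all three channels with ff_dither8. */
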
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
                                       uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(4)

        RENAME(ff_yuv_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
                                       uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(4)

        RENAME(ff_yuv_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(4)

        const uint8_t *pa = src[3] + y * srcStride[3];
        RENAME(ff_yuva_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
    }
    return srcSliceH;
}

static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(4)

        const uint8_t *pa = src[3] + y * srcStride[3];
        RENAME(ff_yuva_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
    }
    return srcSliceH;
}
#endif /* !COMPILE_TEMPLATE_MMXEXT */

static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
                                       uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(3)

        RENAME(ff_yuv_420_rgb24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
                                       uint8_t *dst[], int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(3)

        RENAME(ff_yuv_420_bgr24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

383
externals/ffmpeg/libswscale/x86/yuv_2_rgb.asm
vendored
Executable file
@@ -0,0 +1,383 @@
;******************************************************************************
;* software YUV to RGB converter
;*
;* Copyright (C) 2001-2007 Michael Niedermayer
;* (c) 2010 Konstantin Shishkov
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; the variables below are named mask_dwXY, meaning they preserve dw (16-bit
; word) lanes No.X and No.Y and zero everything else
mask_dw036 : db -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0
mask_dw147 : db  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1
mask_dw25  : db  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0
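; e.g. ANDing with mask_dw036 keeps dw lanes 0, 3 and 6 only, which is how the
; R/G pairs of even pixels are isolated in the RGB24 packing further down
; (see the pand/por sequence using mask_dw036/147/25)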
rgb24_shuf1: db  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5, 10, 11
rgb24_shuf2: db 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5
rgb24_shuf3: db  4,  5, 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15
pw_00ff: times 8 dw 255
pb_f8:   times 16 db 248
pb_e0:   times 16 db 224
pb_03:   times 16 db 3
pb_07:   times 16 db 7

mask_1101: dw -1, -1,  0, -1
mask_0010: dw  0,  0, -1,  0
mask_0110: dw  0, -1, -1,  0
mask_1001: dw -1,  0,  0, -1
mask_0100: dw  0, -1,  0,  0

SECTION .text

;-----------------------------------------------------------------------------
;
; YUV420/YUVA420 to RGB/BGR 15/16/24/32
; R = Y + ((vrCoff * (v - 128)) >> 8)
; G = Y - ((ugCoff * (u - 128) + vgCoff * (v - 128)) >> 8)
; B = Y + ((ubCoff * (u - 128)) >> 8)
;
;-----------------------------------------------------------------------------

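; Illustrative example (coefficient value assumed, not read from the table):
; with vrCoff = 358 (~1.40 in 8.8 fixed point) and v = 200,
; R = Y + ((358 * (200 - 128)) >> 8) = Y + 100.
; The real coefficients live in the table behind pointer_c_dither and are
; loaded as y_coff/ub_coff/vr_coff/... below.
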
; move the high half of a register into its low half
%macro MOV_H2L 1
%if mmsize == 8
    psrlq %1, 32
%else ; mmsize == 16
    psrldq %1, 8
%endif
%endmacro

%macro yuv2rgb_fn 3

%if %3 == 32
%ifidn %1, yuva
%define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index, pa_2index
%define GPR_num 7
%endif
%else
%define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index
%define GPR_num 6
%endif

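; Note: for the plain yuv variant at depth 32 neither branch above defines
; parameters/GPR_num; NASM %defines persist across macro invocations, so the
; 6-GPR definitions left over from an earlier instantiation (each INIT_*
; block below instantiates a 24-bit function first) appear to be reused
; deliberately.
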
%define m_green m2
%define m_alpha m3
%define m_y m6
%define m_u m0
%define m_v m1
%ifidn %2, rgb
%define m_red m1
%define m_blue m0
%else
%define m_red m0
%define m_blue m1
%endif

%if mmsize == 8
%define time_num 1
%define reg_num 8
%define y_offset [pointer_c_ditherq + 8  * 8]
%define u_offset [pointer_c_ditherq + 9  * 8]
%define v_offset [pointer_c_ditherq + 10 * 8]
%define ug_coff  [pointer_c_ditherq + 7  * 8]
%define vg_coff  [pointer_c_ditherq + 6  * 8]
%define y_coff   [pointer_c_ditherq + 3  * 8]
%define ub_coff  [pointer_c_ditherq + 5  * 8]
%define vr_coff  [pointer_c_ditherq + 4  * 8]
%elif mmsize == 16
%define time_num 2
%if ARCH_X86_32
%define reg_num 8
%define my_offset [pointer_c_ditherq + 8  * 8]
%define mu_offset [pointer_c_ditherq + 9  * 8]
%define mv_offset [pointer_c_ditherq + 10 * 8]
%define mug_coff  [pointer_c_ditherq + 7  * 8]
%define mvg_coff  [pointer_c_ditherq + 6  * 8]
%define my_coff   [pointer_c_ditherq + 3  * 8]
%define mub_coff  [pointer_c_ditherq + 5  * 8]
%define mvr_coff  [pointer_c_ditherq + 4  * 8]
%else ; ARCH_X86_64
%define reg_num 16
%define y_offset m8
%define u_offset m9
%define v_offset m10
%define ug_coff  m11
%define vg_coff  m12
%define y_coff   m13
%define ub_coff  m14
%define vr_coff  m15
%endif ; ARCH_X86_32/64
%endif ; coeff define mmsize == 8/16

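; The 64-bit slots at indices 3..10 of the table behind pointer_c_dither hold
; the luma/chroma offsets and the per-channel coefficients; on x86-64 with XMM
; registers they are broadcast once into m8..m15 before the loop (see the
; VBROADCASTSD block below), while the 32-bit XMM path must re-broadcast them
; inside the loop.
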
cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters

%if ARCH_X86_64
    movsxd indexq, indexd
%if mmsize == 16
    VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8]
    VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8]
    VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
    VBROADCASTSD ug_coff,  [pointer_c_ditherq + 7  * 8]
    VBROADCASTSD vg_coff,  [pointer_c_ditherq + 6  * 8]
    VBROADCASTSD y_coff,   [pointer_c_ditherq + 3  * 8]
    VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8]
    VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8]
%endif
%endif
    movu m_y, [py_2indexq + 2 * indexq]
    movh m_u, [pu_indexq + indexq]
    movh m_v, [pv_indexq + indexq]
.loop0:
    pxor m4, m4          ; zero register for byte unpacking
    mova m7, m6
    punpcklbw m0, m4     ; U bytes -> words
    punpcklbw m1, m4     ; V bytes -> words
    mova m2, [pw_00ff]
    pand m6, m2          ; even luma samples (Y0 Y2 ...)
    psrlw m7, 8          ; odd luma samples (Y1 Y3 ...)
    psllw m0, 3          ; scale up for the fixed-point multiplies
    psllw m1, 3
    psllw m6, 3
    psllw m7, 3
%if (ARCH_X86_32 && mmsize == 16)
    VBROADCASTSD m2, mu_offset
    VBROADCASTSD m3, mv_offset
    VBROADCASTSD m4, my_offset
    psubsw m0, m2 ; U = U - 128
    psubsw m1, m3 ; V = V - 128
    psubw  m6, m4
    psubw  m7, m4
    VBROADCASTSD m2, mug_coff
    VBROADCASTSD m3, mvg_coff
    VBROADCASTSD m4, my_coff
    VBROADCASTSD m5, mub_coff
    pmulhw m2, m0
    pmulhw m3, m1
    pmulhw m6, m4
    pmulhw m7, m4
    pmulhw m0, m5
    VBROADCASTSD m4, mvr_coff
    pmulhw m1, m4
%else ; ARCH_X86_64 || mmsize == 8
    psubsw m0, u_offset ; U = U - 128
    psubsw m1, v_offset ; V = V - 128
    psubw  m6, y_offset
    psubw  m7, y_offset
    mova   m2, m0
    mova   m3, m1
    pmulhw m2, ug_coff
    pmulhw m3, vg_coff
    pmulhw m6, y_coff
    pmulhw m7, y_coff
    pmulhw m0, ub_coff
    pmulhw m1, vr_coff
%endif
    paddsw m2, m3
    mova   m3, m7
    mova   m5, m7
    paddsw m3, m0 ; B1 B3 B5 B7 ...
    paddsw m5, m1 ; R1 R3 R5 R7 ...
    paddsw m7, m2 ; G1 G3 G5 G7 ...
    paddsw m0, m6 ; B0 B2 B4 B6 ...
    paddsw m1, m6 ; R0 R2 R4 R6 ...
    paddsw m2, m6 ; G0 G2 G4 G6 ...

%if %3 == 24 ; PACK RGB24
%define depth 3
    packuswb m0, m3     ; R0 R2 R4 R6 ... R1 R3 R5 R7 ...
    packuswb m1, m5     ; B0 B2 B4 B6 ... B1 B3 B5 B7 ...
    packuswb m2, m7     ; G0 G2 G4 G6 ... G1 G3 G5 G7 ...
    mova m3, m_red
    mova m6, m_blue
    MOV_H2L m_red
    punpcklbw m3, m2    ; R0 G0 R2 G2 R4 G4 R6 G6 R8 G8 ...
    punpcklbw m6, m_red ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
    mova m5, m3
    punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
%if mmsize == 8
    punpcklwd m3, m6 ; R0 G0 B0 R1 R2 G2 B2 R3
    punpckhwd m5, m6 ; R4 G4 B4 R5 R6 G6 B6 R7
%if cpuflag(mmxext)
    pshufw m1, m2, 0xc6
    pshufw m6, m3, 0x84
    pshufw m7, m5, 0x38
    pand m6, [mask_1101] ; R0 G0 B0 R1 -- -- R2 G2
    movq m0, m1
    pand m7, [mask_0110] ; -- -- R6 G6 B6 R7 -- --
    movq m2, m1
    pand m1, [mask_0100] ; -- -- G3 B3 -- -- -- --
    psrlq m3, 48         ; B2 R3 -- -- -- -- -- --
    pand m0, [mask_0010] ; -- -- -- -- G1 B1 -- --
    psllq m5, 32         ; -- -- -- -- R4 G4 B4 R5
    pand m2, [mask_1001] ; G5 B5 -- -- -- -- G7 B7
    por m1, m3
    por m0, m6
    por m1, m5
    por m2, m7
    movntq [imageq], m0
    movntq [imageq + 8], m1
    movntq [imageq + 16], m2
%else ; cpuflag(mmx)
    movd [imageq], m3      ; R0 G0 B0 R1
    movd [imageq + 4], m2  ; G1 B1
    psrlq m3, 32
    psrlq m2, 16
    movd [imageq + 6], m3  ; R2 G2 B2 R3
    movd [imageq + 10], m2 ; G3 B3
    psrlq m2, 16
    movd [imageq + 12], m5 ; R4 G4 B4 R5
    movd [imageq + 16], m2 ; G5 B5
    psrlq m5, 32
    movd [imageq + 20], m2 ; -- -- G7 B7
    movd [imageq + 18], m5 ; R6 G6 B6 R7
%endif ; cpuflag(mmxext)
%else ; mmsize == 16
    pshufb m3, [rgb24_shuf1] ; r0 g0 r6 g6 r12 g12 r2 g2 r8 g8 r14 g14 r4 g4 r10 g10
    pshufb m6, [rgb24_shuf2] ; b10 r11 b0 r1 b6 r7 b12 r13 b2 r3 b8 r9 b14 r15 b4 r5
    pshufb m2, [rgb24_shuf3] ; g5 b5 g11 b11 g1 b1 g7 b7 g13 b13 g3 b3 g9 b9 g15 b15
    mova m7, [mask_dw036]
    mova m4, [mask_dw147]
    mova m5, [mask_dw25]
    pand m0, m7, m3 ; r0 g0 --- --- --- --- r2 g2 --- --- --- --- r4 g4 --- ---
    pand m1, m4, m6 ; --- --- b0 r1 --- --- --- --- b2 r3 --- --- --- --- b4 r5
    por  m0, m1
    pand m1, m5, m2 ; --- --- --- --- g1 b1 --- --- --- --- g3 b3 --- --- --- ---
    por  m0, m1     ; r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5
    pand m1, m7, m2 ; g5 b5 --- --- --- --- g7 b7 --- --- --- --- g9 b9 --- ---
    pand m7, m6     ; b10 r11 --- --- --- --- b12 r13 --- --- --- --- b14 r15 --- ---
    pand m6, m5     ; --- --- --- --- b6 r7 --- --- --- --- b8 r9 --- --- --- ---
    por  m1, m6
    pand m6, m4, m3 ; --- --- r6 g6 --- --- --- --- r8 g8 --- --- --- --- r10 g10
    pand m2, m4     ; --- --- g11 b11 --- --- --- --- g13 b13 --- --- --- --- g15 b15
    pand m3, m5     ; --- --- --- --- r12 g12 --- --- --- --- r14 g14 --- --- --- ---
    por  m2, m7
    por  m1, m6     ; g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 g9 b9 r10 g10
    por  m2, m3     ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
    mova [imageq], m0
    mova [imageq + 16], m1
    mova [imageq + 32], m2
%endif ; mmsize == 16
%else ; PACK RGB15/16/32
    packuswb m0, m1
    packuswb m3, m5
    packuswb m2, m2
    mova m1, m0
    packuswb m7, m7
    punpcklbw m0, m3 ; B0 B1 B2 B3 ... B7
    punpckhbw m1, m3 ; R0 R1 R2 R3 ... R7
    punpcklbw m2, m7 ; G0 G1 G2 G3 ... G7
%if %3 == 32 ; PACK RGB32
%define depth 4
%ifidn %1, yuv
    pcmpeqd m3, m3 ; Set alpha empty
%else
    mova m3, [pa_2indexq + 2 * indexq] ; Load alpha
%endif
    mova m5, m_blue
    mova m6, m_red
    punpckhbw m5,     m_green
    punpcklbw m_blue, m_green
    punpckhbw m6,     m_alpha
    punpcklbw m_red,  m_alpha
    mova m_green, m_blue
    mova m_alpha, m5
    punpcklwd m_blue,  m_red
    punpckhwd m_green, m_red
    punpcklwd m5,      m6
    punpckhwd m_alpha, m6
    mova [imageq + 0],             m_blue
    mova [imageq + 8  * time_num], m_green
    mova [imageq + 16 * time_num], m5
    mova [imageq + 24 * time_num], m_alpha
%else ; PACK RGB15/16
%define depth 2
%if cpuflag(ssse3)
%define red_dither   m3
%define green_dither m4
%define blue_dither  m5
    VBROADCASTSD red_dither,   [pointer_c_ditherq + 0 * 8]
    VBROADCASTSD green_dither, [pointer_c_ditherq + 1 * 8]
    VBROADCASTSD blue_dither,  [pointer_c_ditherq + 2 * 8]
%else ; cpuflag(mmx/mmxext)
%define blue_dither  [pointer_c_ditherq + 2 * 8]
%define green_dither [pointer_c_ditherq + 1 * 8]
%define red_dither   [pointer_c_ditherq + 0 * 8]
%endif
%if %3 == 15
%define gmask pb_03
%define isRGB15 1
%else
%define gmask pb_07
%define isRGB15 0
%endif
    paddusb m0, blue_dither
    paddusb m2, green_dither
    paddusb m1, red_dither
    pand m0, [pb_f8]
    pand m1, [pb_f8]
    mova m3, m2
    psllw m2, 3 - isRGB15
    psrlw m3, 5 + isRGB15
    psrlw m0, 3
    psrlw m1, isRGB15
    pand m2, [pb_e0]
    pand m3, [gmask]
    por m0, m2
    por m1, m3
    mova m2, m0
    punpcklbw m0, m1
    punpckhbw m2, m1
    mova [imageq],                m0
    mova [imageq + 8 * time_num], m2
%endif ; PACK RGB15/16
%endif ; PACK RGB15/16/32

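; index starts at -h_size/2 (set up by YUV2RGB_LOOP in the C template) and
; counts up toward zero; the C callers bias the pointer arguments
; (pu - index, py - 2 * index, ...) so that [ptr + index] walks the line,
; and js loops while index is still negative.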
    movu m_y, [py_2indexq + 2 * indexq + 8 * time_num]
    movh m_v, [pv_indexq + indexq + 4 * time_num]
    movh m_u, [pu_indexq + indexq + 4 * time_num]
    add imageq, 8 * depth * time_num
    add indexq, 4 * time_num
    js .loop0

    REP_RET

%endmacro

INIT_MMX mmx
yuv2rgb_fn yuv,  rgb, 24
yuv2rgb_fn yuv,  bgr, 24
yuv2rgb_fn yuv,  rgb, 32
yuv2rgb_fn yuv,  bgr, 32
yuv2rgb_fn yuva, rgb, 32
yuv2rgb_fn yuva, bgr, 32
yuv2rgb_fn yuv,  rgb, 15
yuv2rgb_fn yuv,  rgb, 16

INIT_MMX mmxext
yuv2rgb_fn yuv, rgb, 24
yuv2rgb_fn yuv, bgr, 24

INIT_XMM ssse3
yuv2rgb_fn yuv,  rgb, 24
yuv2rgb_fn yuv,  bgr, 24
yuv2rgb_fn yuv,  rgb, 32
yuv2rgb_fn yuv,  bgr, 32
yuv2rgb_fn yuva, rgb, 32
yuv2rgb_fn yuva, bgr, 32
yuv2rgb_fn yuv,  rgb, 15
yuv2rgb_fn yuv,  rgb, 16