early-access version 2864
externals/ffmpeg/ffmpeg/libswscale/Makefile (vendored)
@@ -13,4 +13,3 @@ X86ASM-OBJS += x86/input.o \
x86/scale.o \
x86/rgb_2_rgb.o \
x86/yuv_2_rgb.o \
x86/yuv2yuvX.o \

externals/ffmpeg/ffmpeg/libswscale/x86/hscale_fast_bilinear_simd.c (vendored)
@@ -21,7 +21,6 @@
#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

externals/ffmpeg/ffmpeg/libswscale/x86/output.asm (vendored, 128 changed lines)
@@ -2,7 +2,6 @@
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;* (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
@@ -23,7 +22,7 @@
%include "libavutil/x86/x86util.asm"

-SECTION_RODATA 32
+SECTION_RODATA

minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
@@ -35,20 +34,9 @@ pd_4: times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pd_255: times 8 dd 255
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024

yuv2nv12_shuffle_mask: times 2 db 0, 4, 8, 12, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1
yuv2nv21_shuffle_mask: times 2 db 4, 0, 12, 8, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1
yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7

SECTION .text

;-----------------------------------------------------------------------------
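The -1 entries in the shuffle masks above rely on pshufb's zeroing behavior: a mask byte with its high bit set writes zero, while indices 0, 4, 8, 12 pick the low byte of each dword. A hedged C model of that per-lane semantic (illustrative, not FFmpeg code):

#include <stdint.h>

/* Models one 16-byte lane of pshufb as used with yuv2nv12_shuffle_mask:
 * negative mask bytes produce zero, others select a source byte. */
static void pshufb_lane(const uint8_t src[16], const int8_t mask[16],
                        uint8_t dst[16])
{
    for (int i = 0; i < 16; i++)
        dst[i] = (mask[i] < 0) ? 0 : src[mask[i] & 15];
}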
@@ -435,117 +423,3 @@ yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

%undef movsx

;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------

%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth

    mov tmp1q, qword [ditherq]
    movq xm0, tmp1q
    ror tmp1q, 24
    movq xm1, tmp1q

    pmovzxbd m0, xm0
    pslld m0, m0, 12 ; ditherLo
    pmovzxbd m1, xm1
    pslld m1, m1, 12 ; ditherHi

    pxor m9, m9 ; uint8_min dwords
    mova m10, [pd_255] ; uint8_max dwords
    mova m11, [%1_shuffle_mask] ; shuffle_mask
    mova m12, [yuv2nv12_permute_mask] ; permute mask

    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth

    xor r8q, r8q

nv12_outer_%1:
    mova m2, m0 ; resultLo
    mova m3, m1 ; resultHi
    xor r9q, r9q

nv12_inner_%1:
    movsx r10d, word [filterq + (2 * r9q)]
    movd xm4, r10d
    vpbroadcastd m4, xm4 ; filter

    mov tmp1q, [uq + (gprsize * r9q)]
    mova xm7, oword [tmp1q + 2 * r8q]

    mov tmp2q, [vq + (gprsize * r9q)]
    mova xm8, oword [tmp2q + 2 * r8q]

    punpcklwd xm5, xm7, xm8
    pmovsxwd m5, xm5 ; multiplicandsLo
    punpckhwd xm6, xm7, xm8
    pmovsxwd m6, xm6 ; multiplicandsHi

    pmulld m7, m5, m4 ; mulResultLo
    pmulld m8, m6, m4 ; mulResultHi
    paddd m2, m2, m7 ; resultLo += mulResultLo
    paddd m3, m3, m8 ; resultHi += mulResultHi

    inc r9d
    cmp r9d, filterSized
    jl nv12_inner_%1
    ; end of inner loop

    psrad m2, m2, 19
    psrad m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd m2, m2, m9
    pmaxsd m3, m3, m9
    pminsd m2, m2, m10
    pminsd m3, m3, m10

    ; At this point we have clamped uint8s arranged in this order:
    ; m2: u1 0 0 0 v1 0 0 0 [...]
    ; m3: u5 0 0 0 v5 0 0 0 [...]
    ;
    ; First, we shuffle the bytes to make the bytes semi-contiguous.
    ; AVX-2 doesn't have cross-lane shuffling, so we'll end up with:
    ; m2: u1 v1 u2 v2 0 0 0 0 0 0 0 0 u3 v3 u4 v4
    ; m3: u5 v5 u6 v6 0 0 0 0 0 0 0 0 u7 v7 u8 v8
    pshufb m2, m2, m11
    pshufb m3, m3, m11

    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
    ; permutation to combine the two segments
    vpermd m2, m12, m2
    vpermd m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq [dstq], xm2
    movq [dstq + 8], xm3

    add r8d, 8
    add dstq, 16

    cmp r8d, dstWidthd
    jl nv12_outer_%1
    RET
%endmacro

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64

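For reference, a scalar C sketch of what the kernel above computes per output pair. It mirrors swscale's generic C fallback as I recall it; the function name and signature below are illustrative, not part of this diff:

#include <stdint.h>

static uint8_t clip_uint8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

/* Each output chroma sample is a dithered dot product of the vertical
 * filter with the per-line U/V sources, scaled back by 19 bits, clipped,
 * and stored interleaved (NV12: U first; the nv21 variant swaps). */
static void yuv2nv12cX_ref(const uint8_t *dither, const int16_t *filter,
                           int filterSize, const int16_t **u,
                           const int16_t **v, uint8_t *dst, int dstWidth)
{
    for (int i = 0; i < dstWidth; i++) {
        int U = dither[i & 7] << 12;        /* ditherLo term */
        int V = dither[(i + 3) & 7] << 12;  /* rotated dither ('ror tmp1q, 24') */
        for (int j = 0; j < filterSize; j++) {
            U += filter[j] * u[j][i];
            V += filter[j] * v[j][i];
        }
        dst[2 * i]     = clip_uint8(U >> 19);
        dst[2 * i + 1] = clip_uint8(V >> 19);
    }
}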
externals/ffmpeg/ffmpeg/libswscale/x86/rgb2rgb.c (vendored, 10 changed lines)
@@ -30,8 +30,6 @@
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libavutil/mem_internal.h"

#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
@@ -40,7 +38,12 @@
DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
@@ -51,6 +54,9 @@ DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;

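As a usage illustration, mask15b and mask15rg above split an RGB555 word (bit layout 0RRRRRGG GGGBBBBB) into its blue and red+green fields; the asm applies the same masks to four packed words at once. Hedged sketch, not FFmpeg code:

#include <stdint.h>

static void split_rgb555(uint16_t px, uint16_t *blue, uint16_t *redgreen)
{
    *blue     = px & 0x001F; /* mask15b pattern  */
    *redgreen = px & 0x7FE0; /* mask15rg pattern */
}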
externals/ffmpeg/ffmpeg/libswscale/x86/swscale.c (vendored, 185 changed lines)
@@ -27,7 +27,6 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/pixdesc.h"

const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
@@ -44,6 +43,15 @@ const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;

DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;

DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
@@ -187,56 +195,87 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
        }
    }
}

#if HAVE_MMXEXT
static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset)
{
    if(((uintptr_t)dest) & 15){
        yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
        return;
    }
    filterSize--;
#define MAIN_FUNCTION \
        "pxor %%xmm0, %%xmm0 \n\t" \
        "punpcklbw %%xmm0, %%xmm3 \n\t" \
        "movd %4, %%xmm1 \n\t" \
        "punpcklwd %%xmm1, %%xmm1 \n\t" \
        "punpckldq %%xmm1, %%xmm1 \n\t" \
        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
        "psllw $3, %%xmm1 \n\t" \
        "paddw %%xmm1, %%xmm3 \n\t" \
        "psraw $4, %%xmm3 \n\t" \
        "movdqa %%xmm3, %%xmm4 \n\t" \
        "movdqa %%xmm3, %%xmm7 \n\t" \
        "movl %3, %%ecx \n\t" \
        "mov %0, %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\
        "movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
        "movdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
        "add $16, %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
        "pmulhw %%xmm0, %%xmm2 \n\t"\
        "pmulhw %%xmm0, %%xmm5 \n\t"\
        "paddw %%xmm2, %%xmm3 \n\t"\
        "paddw %%xmm5, %%xmm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%xmm3 \n\t"\
        "psraw $3, %%xmm4 \n\t"\
        "packuswb %%xmm4, %%xmm3 \n\t"\
        "movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\
        "add $16, %%"FF_REG_c" \n\t"\
        "cmp %2, %%"FF_REG_c" \n\t"\
        "movdqa %%xmm7, %%xmm3 \n\t" \
        "movdqa %%xmm7, %%xmm4 \n\t" \
        "mov %0, %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "jb 1b \n\t"

    if (offset) {
        __asm__ volatile(
            "movq %5, %%xmm3 \n\t"
            "movdqa %%xmm3, %%xmm4 \n\t"
            "psrlq $24, %%xmm3 \n\t"
            "psllq $40, %%xmm4 \n\t"
            "por %%xmm4, %%xmm3 \n\t"
            MAIN_FUNCTION
            :: "g" (filter),
               "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
               "m"(filterSize), "m"(((uint64_t *) dither)[0])
            : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
              "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
        );
    } else {
        __asm__ volatile(
            "movq %5, %%xmm3 \n\t"
            MAIN_FUNCTION
            :: "g" (filter),
               "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
               "m"(filterSize), "m"(((uint64_t *) dither)[0])
            : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
              "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
        );
    }
}
#endif

#endif /* HAVE_INLINE_ASM */

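A scalar model of the vertical scaling these inline-asm paths implement (hedged: it mirrors swscale's generic C fallback as I recall it; the function name is local, not FFmpeg API):

#include <stdint.h>

static void yuv2planeX_ref(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;      /* dither term */
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];              /* 16x16->32 accumulate */
        val >>= 19;                                    /* scale back */
        dest[i] = val < 0 ? 0 : (val > 255 ? 255 : val); /* av_clip_uint8 */
    }
}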
#define YUV2YUVX_FUNC_MMX(opt, step) \
void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \
                        uint8_t *dest, int dstW, \
                        const uint8_t *dither, int offset); \
static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \
                            const int16_t **src, uint8_t *dest, int dstW, \
                            const uint8_t *dither, int offset) \
{ \
    if(dstW > 0) \
        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \
    return; \
}

#define YUV2YUVX_FUNC(opt, step) \
void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \
                        uint8_t *dest, int dstW, \
                        const uint8_t *dither, int offset); \
static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \
                            const int16_t **src, uint8_t *dest, int dstW, \
                            const uint8_t *dither, int offset) \
{ \
    int remainder = (dstW % step); \
    int pixelsProcessed = dstW - remainder; \
    if(((uintptr_t)dest) & 15){ \
        yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
        return; \
    } \
    if(pixelsProcessed > 0) \
        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \
    if(remainder > 0){ \
        ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \
    } \
    return; \
}

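A worked example of the split YUV2YUVX_FUNC performs (illustrative values only; step comes from the instantiations below, e.g. 32 for sse3):

#include <stdio.h>

int main(void)
{
    int dstW = 100, step = 32;
    int remainder = dstW % step;            /* 4 trailing pixels            */
    int pixelsProcessed = dstW - remainder; /* 96 pixels for the wide loop  */
    printf("wide loop: %d px, mmx tail: %d px\n", pixelsProcessed, remainder);
    return 0;
}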
#if HAVE_MMX_EXTERNAL
YUV2YUVX_FUNC_MMX(mmx, 16)
#endif
#if HAVE_MMXEXT_EXTERNAL
YUV2YUVX_FUNC_MMX(mmxext, 16)
#endif
#if HAVE_SSE3_EXTERNAL
YUV2YUVX_FUNC(sse3, 32)
#endif
#if HAVE_AVX2_EXTERNAL
YUV2YUVX_FUNC(avx2, 64)
#endif

#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
                                                SwsContext *c, int16_t *data, \
@@ -341,17 +380,6 @@ INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);

#if ARCH_X86_64
#define YUV2NV_DECL(fmt, opt) \
void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format, const uint8_t *dither, \
                                  const int16_t *filter, int filterSize, \
                                  const int16_t **u, const int16_t **v, \
                                  uint8_t *dst, int dstWidth)

YUV2NV_DECL(nv12, avx2);
YUV2NV_DECL(nv21, avx2);
#endif

av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();
@@ -363,25 +391,11 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
#if HAVE_MMXEXT_INLINE
    if (INLINE_MMXEXT(cpu_flags))
        sws_init_swscale_mmxext(c);
#endif
    if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
#if HAVE_MMX_EXTERNAL
        if (EXTERNAL_MMX(cpu_flags))
            c->yuv2planeX = yuv2yuvX_mmx;
#endif
#if HAVE_MMXEXT_EXTERNAL
        if (EXTERNAL_MMXEXT(cpu_flags))
            c->yuv2planeX = yuv2yuvX_mmxext;
#endif
#if HAVE_SSE3_EXTERNAL
        if (EXTERNAL_SSE3(cpu_flags))
    if (cpu_flags & AV_CPU_FLAG_SSE3){
        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
            c->yuv2planeX = yuv2yuvX_sse3;
#endif
#if HAVE_AVX2_EXTERNAL
        if (EXTERNAL_AVX2_FAST(cpu_flags))
            c->yuv2planeX = yuv2yuvX_avx2;
#endif
    }
#endif

#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
    if (c->srcBpc == 8) { \
@@ -566,21 +580,4 @@ switch(c->dstBpc){ \
            break;
        }
    }

#if ARCH_X86_64
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        switch (c->dstFormat) {
        case AV_PIX_FMT_NV12:
        case AV_PIX_FMT_NV24:
            c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
            break;
        case AV_PIX_FMT_NV21:
        case AV_PIX_FMT_NV42:
            c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
            break;
        default:
            break;
        }
    }
#endif
}

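The two AVX2 entry points assigned above differ only in chroma byte order; both emit a single interleaved chroma plane. A hedged sketch (not FFmpeg code):

#include <stdint.h>

/* NV12/NV24 store U first; NV21/NV42 swap the U/V byte order. */
static void interleave_chroma(const uint8_t *u, const uint8_t *v,
                              uint8_t *dst, int width, int swap_uv)
{
    for (int i = 0; i < width; i++) {
        dst[2 * i]     = swap_uv ? v[i] : u[i];
        dst[2 * i + 1] = swap_uv ? u[i] : v[i];
    }
}

externals/ffmpeg/ffmpeg/libswscale/x86/swscale_template.c (vendored)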
@@ -38,6 +38,88 @@
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#if !COMPILE_TEMPLATE_MMXEXT
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    }
}
#endif

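In plain C, the rot branch above amounts to a 64-bit rotate of the eight dither bytes before they are widened to 16-bit lanes (hedged sketch):

#include <stdint.h>

static uint64_t rotate_dither_right3(uint64_t d)
{
    return (d >> 24) | (d << 40); /* psrlq $24 + psllq $40 + por */
}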
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw $3, %%mm1\n\t"
        "paddw %%mm1, %%mm3\n\t"
        "paddw %%mm1, %%mm4\n\t"
        "psraw $4, %%mm3\n\t"
        "psraw $4, %%mm4\n\t"
        ::"m"(filterSize)
    );

    __asm__ volatile(\
        "movq %%mm3, %%mm6\n\t"
        "movq %%mm4, %%mm7\n\t"
        "movl %3, %%ecx\n\t"
        "mov %0, %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c")\n\t"
        "add $8, %%"FF_REG_c" \n\t"\
        "cmp %2, %%"FF_REG_c" \n\t"\
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"FF_REG_d" \n\t"\
        "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
        : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
    );
}

#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
@@ -1435,6 +1517,7 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
        }
    } else {
        c->use_mmx_vfilter= 1;
        c->yuv2planeX = RENAME(yuv2yuvX );
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;

externals/ffmpeg/ffmpeg/libswscale/x86/yuv2rgb.c (vendored, 12 changed lines)
@@ -41,6 +41,14 @@

#define DITHER1XBPP // only for MMX

/* hope these constant values are cache line aligned */
DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;

//MMX versions
#if HAVE_MMX
#undef RENAME
@@ -51,18 +59,22 @@
#endif /* HAVE_MMX */

// MMXEXT versions
#if HAVE_MMXEXT
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "yuv2rgb_template.c"
#endif /* HAVE_MMXEXT */

//SSSE3 versions
#if HAVE_SSSE3
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _ssse3
#include "yuv2rgb_template.c"
#endif

#endif /* HAVE_X86ASM */

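The RENAME() scheme above compiles the same template file once per instruction set, using token pasting to give each copy a distinct suffix. A minimal, self-contained illustration (not FFmpeg's actual template):

#include <stdio.h>

#define RENAME(a) a ## _ssse3
static int RENAME(convert)(void) { return 3; } /* expands to convert_ssse3 */

int main(void)
{
    printf("%d\n", convert_ssse3()); /* prints 3 */
    return 0;
}

externals/ffmpeg/ffmpeg/libswscale/x86/yuv_2_rgb.asm (vendored)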
@@ -268,9 +268,9 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
    por m2, m7
    por m1, m6 ; g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 g9 b9 r10 g10
    por m2, m3 ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
-    movu [imageq], m0
-    movu [imageq + 16], m1
-    movu [imageq + 32], m2
+    mova [imageq], m0
+    mova [imageq + 16], m1
+    mova [imageq + 32], m2
%endif ; mmsize = 16
%else ; PACK RGB15/16/32
    packuswb m0, m1
@@ -286,7 +286,7 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
%ifidn %1, yuv
    pcmpeqd m3, m3 ; Set alpha empty
%else
-    movu m3, [pa_2indexq + 2 * indexq] ; Load alpha
+    mova m3, [pa_2indexq + 2 * indexq] ; Load alpha
%endif
    mova m5, m_blue
    mova m6, m_red
@@ -300,10 +300,10 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
    punpckhwd m_green, m_red
    punpcklwd m5, m6
    punpckhwd m_alpha, m6
-    movu [imageq + 0], m_blue
-    movu [imageq + 8 * time_num], m_green
-    movu [imageq + 16 * time_num], m5
-    movu [imageq + 24 * time_num], m_alpha
+    mova [imageq + 0], m_blue
+    mova [imageq + 8 * time_num], m_green
+    mova [imageq + 16 * time_num], m5
+    mova [imageq + 24 * time_num], m_alpha
%else ; PACK RGB15/16
%define depth 2
%if cpuflag(ssse3)
@@ -342,8 +342,8 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
    mova m2, m0
    punpcklbw m0, m1
    punpckhbw m2, m1
-    movu [imageq], m0
-    movu [imageq + 8 * time_num], m2
+    mova [imageq], m0
+    mova [imageq + 8 * time_num], m2
%endif ; PACK RGB15/16
%endif ; PACK RGB15/16/32
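Context for the movu/mova swaps in these hunks (hedged): mova is the aligned SSE store and faults when the destination is not 16-byte aligned, while movu tolerates any address. This is the same property the C wrappers earlier test before choosing a code path; an equivalent check in C:

#include <stdint.h>

static int is_16byte_aligned(const void *p)
{
    return (((uintptr_t)p) & 15) == 0; /* cf. ((uintptr_t)dest) & 15 above */
}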