early-access version 1432

This commit is contained in:
pineappleEA
2021-02-09 04:25:58 +01:00
parent de64eab4b4
commit 3d5a9d908a
7336 changed files with 1773492 additions and 111 deletions

199
externals/ffmpeg/libavcodec/x86/Makefile vendored Executable file
View File

@@ -0,0 +1,199 @@
# Object that is always built for x86 (no CONFIG_* switch guards it).
# No trailing line continuation here: the next line is a comment, not a
# further list entry, so the assignment must end on this line.
OBJS += x86/constants.o
# subsystems
# One list entry per shared DSP subsystem switch; most entries are the C
# "*_init.o" glue objects, a few are plain C objects (e.g. x86/lpc.o).
# NOTE(review): the variable name is built from the value of CONFIG_*, so
# presumably only the list for enabled switches is consumed by the build
# rules -- those rules are not part of this file; confirm there.
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
# decoders/encoders
# Per-codec C objects. Some objects are listed under several switches
# (x86/pngdsp_init.o for APNG, LSCR and PNG; x86/g722dsp_init.o for the G.722
# decoder and encoder; x86/mlpdsp_init.o for MLP and TrueHD;
# x86/proresdsp_init.o for both ProRes decoders; x86/vp8dsp_init.o for WebP as
# well as the VP8DSP subsystem list).
# NOTE(review): duplicates are presumably removed by the build rules that
# consume these lists -- confirm there.
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
x86/vp9dsp_init_10bpp.o \
x86/vp9dsp_init_12bpp.o \
x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
# GCC inline assembly optimizations
# These go on the separate MMX-OBJS-* lists rather than OBJS-*.
# NOTE(review): presumably so they are only built when inline MMX assembly is
# usable with the selected compiler -- confirm in the rules that read MMX-OBJS.
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o
# decoders/encoders
# The Snow decoder and encoder both pull in the same object.
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
# Objects on the X86ASM-OBJS-* lists.
# NOTE(review): presumably these are the ones assembled from the standalone
# .asm sources by the external assembler -- confirm in the build rules.
# x86/fpel.o and x86/qpel.o are listed under more than one switch (H264QPEL,
# HPELDSP, QPELDSP), and CONFIG_IDCTDSP is appended to twice; both rely on
# "+=" accumulating into the same list.
# subsystems
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_deblock_10bit.o \
x86/h264_idct.o \
x86/h264_idct_10bit.o \
x86/h264_weight.o \
x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
x86/vp8dsp_loopfilter.o
# decoders/encoders
# Per-codec X86ASM-OBJS-* entries; they mirror the per-codec C list above
# (e.g. x86/pngdsp.o appears for APNG, LSCR and PNG just like
# x86/pngdsp_init.o does there).
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
# The FLAC encoder object is only listed when CONFIG_GPL is defined.
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
x86/hevc_mc.o \
x86/hevc_sao.o \
x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9itxfm_16bpp.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
x86/vp9mc.o \
x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o

86
externals/ffmpeg/libavcodec/x86/aacencdsp.asm vendored Executable file
View File

@@ -0,0 +1,86 @@
;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
float_abs_mask: times 4 dd 0x7fffffff
SECTION .text
;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;
; out[i] = sqrt(|in[i]| * sqrt(|in[i]|)), i.e. |in[i]|^(3/4).
; One full vector (mmsize bytes) is handled per iteration and the loop
; only stops on a whole-vector boundary; mova is used for the mask load
; and the store.
; NOTE(review): this presumably requires 16-byte aligned buffers whose
; size is a multiple of 4 floats -- confirm against the C callers.
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
mova m2, [float_abs_mask] ; 0x7fffffff per lane: clears the sign bit
; size is a 32-bit int, so the upper half of its 64-bit register is
; undefined on x86-64. Shift the dword view (which zero-extends into sizeq),
; exactly as aac_quantize_bands does, before using sizeq in pointer math.
shl sized, 2 ; element count -> byte count
add inq, sizeq ; point both buffers at their ends ...
add outq, sizeq
neg sizeq ; ... and index with a negative offset that counts up to 0
.loop:
andps m0, m2, [inq+sizeq] ; m0 = |in|
sqrtps m1, m0 ; m1 = |in|^(1/2)
mulps m0, m1 ; m0 = |in|^(3/2)
sqrtps m0, m0 ; m0 = |in|^(3/4)
mova [outq+sizeq], m0
add sizeq, mmsize
jl .loop ; loop while the offset is still negative
RET
;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
; int size, int is_signed, int maxval, const float Q34,
; const float rounding)
;
; Per element: t = min(scaled[i] * Q34 + rounding, (float)maxval); if bit 0
; of is_signed is set, the sign bit of in[i] is OR-ed into t; the result is
; converted to int with truncation (cvttps2dq) and stored to out[i].
; One full vector (4 elements) is processed per iteration.
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
; Only the first five arguments are loaded into registers by cglobal; the
; rest are fetched here. On UNIX64 Q34/rounding are not loaded at all.
; NOTE(review): presumably because they already arrive in xmm0/xmm1 (= m0/m1)
; as the first two float arguments under the SysV ABI -- confirm.
%if UNIX64 == 0
movss m0, Q34m
movss m1, roundingm
cvtsi2ss m3, dword maxvalm
%else
cvtsi2ss m3, maxvald
%endif
; Broadcast the three scalars to all four lanes.
shufps m0, m0, 0
shufps m1, m1, 0
shufps m3, m3, 0
; Move bit 0 of is_signed to bit 31 to build a per-lane sign-bit mask
; (0x80000000 when set, 0 otherwise).
shl is_signedd, 31
movd m4, is_signedd
shufps m4, m4, 0
shl sized, 2 ; element count -> byte count (dword op zero-extends)
add inq, sizeq ; point all three buffers at their ends ...
add outq, sizeq
add scaledq, sizeq
neg sizeq ; ... and walk a negative offset up to 0
.loop:
mulps m2, m0, [scaledq+sizeq] ; scaled * Q34
addps m2, m1 ; + rounding
minps m2, m3 ; clamp to maxval
andps m5, m4, [inq+sizeq] ; sign bit of in[] (or 0 when unsigned)
orps m2, m5 ; apply the sign
cvttps2dq m2, m2 ; float -> int32, truncating
mova [outq+sizeq], m2
add sizeq, mmsize
jl .loop
RET

View File

@@ -0,0 +1,43 @@
/*
* AAC encoder assembly optimizations
* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/float_dsp.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/aacenc.h"
void ff_abs_pow34_sse(float *out, const float *in, const int size);
void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
int size, int is_signed, int maxval, const float Q34,
const float rounding);
/**
 * Hook the x86 SIMD routines into the AAC encoder context.
 *
 * abs_pow34 is replaced when the CPU reports SSE, quant_bands when it
 * reports SSE2; fields are left untouched otherwise.
 */
av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
{
    const int flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(flags)) {
        s->abs_pow34 = ff_abs_pow34_sse;
    }

    if (EXTERNAL_SSE2(flags)) {
        s->quant_bands = ff_aac_quantize_bands_sse2;
    }
}

487
externals/ffmpeg/libavcodec/x86/aacpsdsp.asm vendored Executable file
View File

@@ -0,0 +1,487 @@
;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
SECTION .text
;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;
; dst[i] += src[i][0]^2 + src[i][1]^2
; Macro argument %1 is the XMM register count passed to cglobal.
; Each iteration consumes two vectors of src (4 pairs) and updates one
; vector of dst (4 floats), all with movaps.
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
shl nd, 3 ; n pairs -> bytes of src (8 bytes per pair)
add srcq, nq ; src is indexed from its end with a negative offset
neg nq
align 16
.loop:
movaps m0, [srcq+nq]
movaps m1, [srcq+nq+mmsize]
mulps m0, m0 ; square every component
mulps m1, m1
; Add adjacent lanes of m0/m1 so m0 holds four re^2+im^2 sums (m2 is handed
; in as the scratch register).
HADDPS m0, m1, m2
addps m0, [dstq] ; accumulate into dst
movaps [dstq], m0
add dstq, mmsize
add nq, mmsize*2
jl .loop
REP_RET
%endmacro
; NOTE(review): the SSE build declares 2 XMM registers and the SSE3 build 3,
; although m2 is passed to HADDPS as scratch in both -- verify against the
; HADDPS definition which variant really touches m2.
INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3
;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
; float *src1, int n);
;
; dst[i][0..1] = src0[i][0..1] * src1[i]
; (the register names src1/src2 below correspond to src0/src1 above).
; The pair input is read with unaligned loads (movu); the scalar input and
; dst use mova.
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
shl nd, 3 ; n pairs -> bytes (8 bytes per pair)
add src1q, nq ; pair input and dst are indexed from their ends
add dstq, nq
neg nq
align 16
.loop:
movu m0, [src1q+nq]
movu m1, [src1q+nq+mmsize]
mova m2, [src2q] ; four scalar factors a b c d
mova m3, m2
unpcklps m2, m2 ; a a b b
unpckhps m3, m3 ; c c d d
mulps m0, m2
mulps m1, m3
mova [dstq+nq], m0
mova [dstq+nq+mmsize], m1
add src2q, mmsize
add nq, mmsize*2
jl .loop
REP_RET
;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;
; Only the first four floats of h and h_step are read here. Per sample:
; h[k] += h_step[k] (k = 0..3, applied before use)
; l' = l*h[0] + r*h[2]
; r' = l*h[1] + r*h[3]
; with l/r being (re, im) pairs updated in place, one pair per iteration.
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [h_stepq]
unpcklps m4, m0, m0 ; m4 = h0 h0 h1 h1
unpckhps m0, m0 ; m0 = h2 h2 h3 h3
unpcklps m5, m1, m1 ; m5 = step0 step0 step1 step1
unpckhps m1, m1 ; m1 = step2 step2 step3 step3
shl nd, 3 ; samples -> bytes (8 bytes per pair)
add lq, nq ; both buffers are indexed from their ends
add rq, nq
neg nq
align 16
.loop:
addps m4, m5 ; advance the coefficients by one step
addps m0, m1
movddup m2, [lq+nq] ; l.re l.im l.re l.im
movddup m3, [rq+nq] ; r.re r.im r.re r.im
mulps m2, m4
mulps m3, m0
addps m2, m3 ; low half = new l, high half = new r
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;
; Variant of ps_stereo_interpolate that reads both rows of h/h_step:
; m0 holds h[0][0..3], m1 holds h[1][0..3]. Row 0 multiplies the samples as
; loaded, row 1 multiplies them with re/im swapped, and the row-1 products
; are merged with addsubps (subtract in even lanes, add in odd lanes).
; NOTE(review): presumably row 0 / row 1 are the real / imaginary parts of
; complex mixing coefficients -- confirm against the C reference.
; l and r are updated in place, one (re, im) pair per iteration.
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [hq+mmsize]
; With x86-64 the two step vectors are cached in m8/m9; otherwise they are
; used straight from memory as operands of addps.
%if ARCH_X86_64
movaps m8, [h_stepq]
movaps m9, [h_stepq+mmsize]
%define H_STEP0 m8
%define H_STEP1 m9
%else
%define H_STEP0 [h_stepq]
%define H_STEP1 [h_stepq+mmsize]
%endif
shl nd, 3 ; samples -> bytes (8 bytes per pair)
add lq, nq ; both buffers are indexed from their ends
add rq, nq
neg nq
align 16
.loop:
addps m0, H_STEP0 ; advance both coefficient rows by one step
addps m1, H_STEP1
movddup m2, [lq+nq] ; l.re l.im l.re l.im
movddup m3, [rq+nq] ; r.re r.im r.re r.im
shufps m4, m2, m2, q2301 ; l with re/im swapped
shufps m5, m3, m3, q2301 ; r with re/im swapped
unpcklps m6, m0, m0 ; row 0: h0 h0 h1 h1
unpckhps m7, m0, m0 ; row 0: h2 h2 h3 h3
mulps m2, m6
mulps m3, m7
unpcklps m6, m1, m1 ; row 1: h0 h0 h1 h1
unpckhps m7, m1, m1 ; row 1: h2 h2 h3 h3
mulps m4, m6
mulps m5, m7
addps m2, m3
addsubps m2, m4
addsubps m2, m5 ; low half = new l, high half = new r
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
; float L[2][38][64],
; int i, int len)
;
; (Argument types as in the C prototype used by the init code; in the
; register names below "in" is L.)
; For every column j in [i, 64) and row k in [0, 32):
; out[j][k][0] = L[0][k][j]
; out[j][k][1] = L[1][k][j]
; The len argument is not read: cglobal loads only three arguments and the
; len register is overwritten with the constant 32*8 = 256, which is both
; the byte size of one 64-float row of L and of one out[j] entry.
;
; Columns are handled 1, 2 or 4 at a time. One single column and/or one
; column pair is peeled first, so that once .loop16 runs the remaining
; count is a multiple of 4 and its movaps loads start at a column index
; that is a multiple of 4.
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
movsxdifnidn iq, id
mov lend, 32 << 3
lea inq, [inq+iq*4] ; &L[0][0][i]
mov tmpd, id
shl tmpd, 8
add outq, tmpq ; &out[i] (256 bytes per entry)
mov tmpd, 64
sub tmpd, id
mov id, tmpd ; i now holds the number of columns left
test id, 1
jne .loop4 ; odd count: peel one column
test id, 2
jne .loop8 ; count % 4 == 2: peel two columns
align 16
.loop16: ; four columns per pass
mov in0q, inq ; in0 walks plane L[0]
mov in1q, 38*64*4 ; byte size of one [38][64] float plane
add in1q, in0q ; in1 walks plane L[1]
mov tmpd, lend ; 256/mmsize inner iterations, 2 rows each
.inner_loop16:
movaps m0, [in0q]
movaps m1, [in1q]
movaps m2, [in0q+lenq]
movaps m3, [in1q+lenq]
; Turn 4 rows x 4 columns into one vector per column.
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [outq], m0
movaps [outq+lenq], m1
movaps [outq+lenq*2], m2
movaps [outq+3*32*2*4], m3
lea in0q, [in0q+lenq*2] ; two rows down
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add inq, 16 ; next four columns
add outq, 3*32*2*4 ; skip the three entries written via offsets
sub id, 4
jg .loop16
RET
align 16
.loop8: ; two columns per pass
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop8:
movlps m0, [in0q]
movlps m1, [in1q]
movhps m0, [in0q+lenq]
movhps m1, [in1q+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
movaps [outq], m0
movaps [outq+lenq], m1
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add inq, 8 ; next two columns
add outq, lenq ; skip the second entry written via offset
sub id, 2
jg .loop16 ; remainder is now a multiple of 4
RET
align 16
.loop4: ; a single column
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop4:
movss m0, [in0q]
movss m1, [in1q]
movss m2, [in0q+lenq]
movss m3, [in1q+lenq]
movlhps m0, m1
movlhps m2, m3
shufps m0, m2, q2020 ; L0[k] L1[k] L0[k+1] L1[k+1]
movaps [outq], m0
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add inq, 4 ; next column
sub id, 1
test id, 2
jne .loop8 ; still need to peel a pair
cmp id, 4
jge .loop16
RET
;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;
; Inverse layout change of ps_hybrid_analysis_ileave. For every column j in
; [i, 64) and row k in [0, 32):
; out[0][k][j] = in[j][k][0]
; out[1][k][j] = in[j][k][1]
; The len argument is not read: cglobal loads only three arguments and the
; len register is overwritten with the constant 32*8 = 256 (byte size of one
; 64-float row of out and of one in[j] entry).
; Columns are handled 1, 2 or 4 at a time; a single column and/or a pair is
; peeled first so that .loop16 only sees a multiple of 4 columns.
; The macro is expanded once for SSE and once for SSE4; they differ in the
; MOVH instruction and in how .inner_loop4 extracts single lanes.
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
movsxdifnidn iq, id
mov lend, 32 << 3
lea outq, [outq+iq*4] ; &out[0][0][i]
mov tmpd, id
shl tmpd, 8
add inq, tmpq ; &in[i] (256 bytes per entry)
mov tmpd, 64
sub tmpd, id
mov id, tmpd ; i now holds the number of columns left
test id, 1
jne .loop4 ; odd count: peel one column
test id, 2
jne .loop8 ; count % 4 == 2: peel two columns
align 16
.loop16: ; four columns per pass
mov out0q, outq ; out0 walks plane out[0]
mov out1q, 38*64*4 ; byte size of one [38][64] float plane
add out1q, out0q ; out1 walks plane out[1]
mov tmpd, lend ; 256/mmsize inner iterations, 2 rows each
.inner_loop16:
movaps m0, [inq]
movaps m1, [inq+lenq]
movaps m2, [inq+lenq*2]
movaps m3, [inq+3*32*2*4]
; Turn one vector per column into 4 rows x 4 columns.
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [out0q], m0
movaps [out1q], m1
movaps [out0q+lenq], m2
movaps [out1q+lenq], m3
lea out0q, [out0q+lenq*2] ; two rows down
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add outq, 16 ; next four columns
add inq, 3*32*2*4 ; skip the three entries read via offsets
sub id, 4
jg .loop16
RET
align 16
.loop8: ; two columns per pass
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop8:
movaps m0, [inq]
movaps m1, [inq+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
MOVH [out0q], m0
MOVH [out1q], m1
movhps [out0q+lenq], m0
movhps [out1q+lenq], m1
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add outq, 8 ; next two columns
add inq, lenq ; skip the second entry read via offset
sub id, 2
jg .loop16 ; remainder is now a multiple of 4
RET
align 16
.loop4: ; a single column
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop4:
; m0 = in[j][k][0], in[j][k][1], in[j][k+1][0], in[j][k+1][1]
movaps m0, [inq]
movss [out0q], m0
%if cpuflag(sse4)
extractps [out1q], m0, 1
extractps [out0q+lenq], m0, 2
extractps [out1q+lenq], m0, 3
%else
; No extractps: bring each remaining lane down to lane 0 and store it.
movhlps m1, m0 ; lane 2 -> lane 0
movss [out0q+lenq], m1
shufps m0, m0, 0xb1 ; swap lanes within each pair: 1 0 3 2
movss [out1q], m0 ; original lane 1
movhlps m1, m0 ; original lane 3 -> lane 0
movss [out1q+lenq], m1
%endif
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add outq, 4 ; next column
sub id, 1
test id, 2
jne .loop8 ; still need to peel a pair
cmp id, 4
jge .loop16
RET
%endmacro
INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
; ptrdiff_t stride, int n);
;*******************************************************************
; One step of the filter sum used by PS_HYBRID_ANALYSIS.
; %1, %2 = destination registers for the two partial products
; %3 = step index (0, 1 or 2); selects vector %3 of in[] and of the
; current filter row, plus the vector at in + mmsize*(5-%3) + 8
; Clobbers m1, m2 and m4; the non-SSE3 path expects the sign mask
; ps_p1m1p1m1 in m7. For %3 != 0 the products are also accumulated into
; m3 (from %2) and m0 (from %1), so step 0 must be expanded with
; %1 = m0 and %2 = m3 to seed the accumulators.
%macro PS_HYBRID_ANALYSIS_LOOP 3
movu %1, [inq+mmsize*%3]
movu m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
pshufd %2, %1, q2301
pshufd m4, m1, q0123
pshufd m1, m1, q1032
pshufd m2, [filterq+nq+mmsize*%3], q2301
addsubps %2, m4
addsubps %1, m1
%else
; Same computation without addsubps: flip the sign of alternate lanes with
; the m7 mask, then subtract.
mova m2, [filterq+nq+mmsize*%3]
mova %2, %1
mova m4, m1
shufps %2, %2, q2301
shufps m4, m4, q0123
shufps m1, m1, q1032
shufps m2, m2, q2301
xorps m4, m7
xorps m1, m7
subps %2, m4
subps %1, m1
%endif
mulps %2, m2
mulps %1, m2
%if %3
addps m3, %2
addps m0, %1
%endif
%endmacro
; Body of ff_ps_hybrid_analysis_<opt>, expanded once for SSE and once for
; SSE3. Each loop iteration consumes one 64-byte filter row (n is shifted
; left by 6) and writes one 8-byte result to out, then advances out by
; stride*8 bytes. in is not advanced, so every iteration filters the same
; input samples.
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
shl strideq, 3 ; stride in 8-byte units -> bytes
shl nd, 6 ; filter rows -> bytes (64 per row)
add filterq, nq ; filter is indexed from its end with a negative offset
neg nq
mova m7, [ps_p1m1p1m1] ; sign-flip mask for lanes 1 and 3
align 16
.loop:
; Three steps of the symmetric part of the sum; step 0 seeds the
; accumulators m0/m3, steps 1 and 2 add to them.
PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
; Reduce the two accumulators to a single result in the low half of m1.
%if cpuflag(sse3)
pshufd m3, m3, q2301
xorps m0, m7
hsubps m3, m0
pshufd m1, m3, q0020
pshufd m3, m3, q0031
addps m1, m3
movsd m2, [inq+6*8]
%else
mova m1, m3
mova m2, m0
shufps m1, m1, q2301
shufps m2, m2, q2301
subps m1, m3
addps m2, m0
unpcklps m3, m1, m2
unpckhps m1, m2
addps m1, m3
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
; Add in[6] scaled by the single float at byte offset 8*6 of the filter row.
movss m3, [filterq+nq+8*6]
SPLATD m3
mulps m2, m3
addps m1, m2
MOVH [outq], m1 ; store the low 8 bytes only
add outq, strideq
add nq, 64
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS

View File

@@ -0,0 +1,72 @@
/*
* SIMD optimized MPEG-4 Parametric Stereo decoding functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/aacpsdsp.h"
void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n);
void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
float *src1, int n);
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
int i, int len);
/**
 * Hook the x86 SIMD routines into the Parametric Stereo DSP context.
 *
 * The checks run from the oldest to the newest instruction set, so a later
 * block overrides pointers installed by an earlier one (SSE3 replaces the
 * SSE add_squares/hybrid_analysis, SSE4 replaces the SSE
 * hybrid_synthesis_deint).
 */
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
    const int flags = av_get_cpu_flags();

    /* Baseline: SSE */
    if (EXTERNAL_SSE(flags)) {
        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
        s->add_squares            = ff_ps_add_squares_sse;
        s->mul_pair_single        = ff_ps_mul_pair_single_sse;
    }

    /* SSE3 overrides and additions */
    if (EXTERNAL_SSE3(flags)) {
        s->hybrid_analysis       = ff_ps_hybrid_analysis_sse3;
        s->add_squares           = ff_ps_add_squares_sse3;
        s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
        s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
    }

    /* SSE4 override */
    if (EXTERNAL_SSE4(flags))
        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
}

552
externals/ffmpeg/libavcodec/x86/ac3dsp.asm vendored Executable file
View File

@@ -0,0 +1,552 @@
;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000
; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
; used in ff_ac3_extract_exponents()
cextern pd_1
pd_151: times 4 dd 151
; used in ff_apply_window_int16()
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
SECTION .text
;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For each of the nb_coefs positions, replaces exp[pos] with the unsigned
; byte minimum of exp[pos + 256*blk] for blk = 0..num_reuse_blocks (blocks
; are 256 bytes apart). Returns immediately when num_reuse_blocks is 0.
; One full vector of positions (mmsize bytes) is processed per outer pass.
;-----------------------------------------------------------------------------
%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
shl reuse_blksq, 8 ; block count -> byte offset of the last block
jz .end ; nothing to merge
LOOP_ALIGN
.nextexp:
mov offsetq, reuse_blksq
mova m0, [expq+offsetq] ; start from the last block ...
sub offsetq, 256
LOOP_ALIGN
.nextblk:
; ... and fold in every earlier block down to offset 0 (m1 is scratch).
PMINUB m0, [expq+offsetq], m1
sub offsetq, 256
jae .nextblk ; stop once the offset wraps below 0
mova [expq], m0
add expq, mmsize
sub expnq, mmsize
jg .nextexp
.end:
REP_RET
%endmacro
; LOOP_ALIGN expands to nothing for the MMX build and to "ALIGN 16" for the
; MMXEXT and SSE2 builds (the second %define stays in effect until %undef).
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN
;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
; This is used for ssse3 because of the pabsw instruction.
; It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------
; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
; %1 = source, also receives the result in its low word (the other words
; are left holding partial results); %2 = scratch register, clobbered.
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
; 8 words: fold the high qword, then the high dword, then the high word.
movhlps %2, %1
por %1, %2
pshuflw %2, %1, q0032
por %1, %2
pshuflw %2, %1, q0001
por %1, %2
%elif cpuflag(mmxext)
; 4 words: fold the high dword, then the high word.
pshufw %2, %1, q0032
por %1, %2
pshufw %2, %1, q0001
por %1, %2
%else ; mmx
; Same folding with shifts, since pshufw is not available.
movq %2, %1
psrlq %2, 32
por %1, %2
movq %2, %1
psrlq %2, 16
por %1, %2
%endif
%endmacro
; Body of ff_ac3_max_msb_abs_int16. %1 selects the method: "min_max" keeps a
; running per-lane minimum (m2) and maximum (m3) and ORs their absolute
; values at the end; anything else ("or_abs") ORs the absolute value of
; every element into m2. The horizontal OR of the lanes is returned in the
; low 16 bits of eax.
; Each iteration reads two full vectors, i.e. mmsize int16 elements, which
; is why len is decremented by mmsize.
%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
pxor m2, m2 ; accumulator (running min for min_max)
pxor m3, m3 ; running max for min_max; scratch for ABS2 in or_abs
.loop:
%ifidn %1, min_max
mova m0, [srcq]
mova m1, [srcq+mmsize]
pminsw m2, m0
pminsw m2, m1
pmaxsw m3, m0
pmaxsw m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
mova m0, [srcq]
mova m1, [srcq+mmsize]
ABS2 m0, m1, m3, m4
%else ; ssse3
; using memory args is faster for ssse3
pabsw m0, [srcq]
pabsw m1, [srcq+mmsize]
%endif
por m2, m0
por m2, m1
%endif
add srcq, mmsize*2
sub lend, mmsize
ja .loop
%ifidn %1, min_max
; Combine |min| and |max| into the same kind of OR mask the other method
; produces.
ABS2 m2, m3, m0, m1
por m2, m3
%endif
OR_WORDS_HORIZ m2, m0
movd eax, m2
and eax, 0xFFFF ; only the low word holds the result
RET
%endmacro
INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs
;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;
; %1 = l or r (only used to build the function name)
; %2 = element width in bits, 16 or 32 (function name and length step)
; %3 = packed shift instruction to apply
; The instruction set comes from the preceding INIT_MMX/INIT_XMM, not from a
; macro argument. Shifts the array in place, four full vectors per
; iteration; mmsize*4 bytes hold mmsize*32/%2 elements, which is the amount
; len is decremented by.
;-----------------------------------------------------------------------------
%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
movd m0, shiftd ; shift count, shared by all lanes
.loop:
mova m1, [srcq ]
mova m2, [srcq+mmsize ]
mova m3, [srcq+mmsize*2]
mova m4, [srcq+mmsize*3]
%3 m1, m0
%3 m2, m0
%3 m3, m0
%3 m4, m0
mova [srcq ], m1
mova [srcq+mmsize ], m2
mova [srcq+mmsize*2], m3
mova [srcq+mmsize*3], m4
add srcq, mmsize*4
sub lend, mmsize*32/%2
ja .loop
.end:
REP_RET
%endmacro
;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw
;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad
;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;
; dst[i] = (int32_t)(src[i] * 16777216.0f), i.e. scaling by 2^24.
;-----------------------------------------------------------------------------
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
; Handles 8 floats (four 8-byte MMX registers) per iteration.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
movq m0, [pf_1_24] ; two copies of 16777216.0f
.loop:
movq m1, [srcq ]
movq m2, [srcq+8 ]
movq m3, [srcq+16]
movq m4, [srcq+24]
pfmul m1, m0
pfmul m2, m0
pfmul m3, m0
pfmul m4, m0
pf2id m1, m1
pf2id m2, m2
pf2id m3, m3
pf2id m4, m4
movq [dstq ], m1
movq [dstq+8 ], m2
movq [dstq+16], m3
movq [dstq+24], m4
add srcq, 32
add dstq, 32
sub lend, 8
ja .loop
femms ; leave MMX/3DNow! state before returning
RET
; SSE version of ff_float_to_fixed24: the multiply is done in XMM registers,
; but the float->int conversion uses cvtps2pi, which converts two floats at
; a time into an MMX register. Handles 8 floats per iteration.
INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
movaps m0, [pf_1_24] ; four copies of 16777216.0f
.loop:
movaps m1, [srcq ]
movaps m2, [srcq+16]
mulps m1, m0
mulps m2, m0
cvtps2pi mm0, m1 ; low two floats of m1
movhlps m1, m1 ; bring the high two floats down
cvtps2pi mm1, m1
cvtps2pi mm2, m2 ; same for m2
movhlps m2, m2
cvtps2pi mm3, m2
movq [dstq ], mm0
movq [dstq+ 8], mm1
movq [dstq+16], mm2
movq [dstq+24], mm3
add srcq, 32
add dstq, 32
sub lend, 8
ja .loop
emms ; MMX registers were used: restore the x87 state
RET
; SSE2 version of ff_float_to_fixed24: multiplies by 16777216.0f and converts
; with cvtps2dq entirely in XMM registers, using movaps/movdqa for all
; memory accesses.
; When m8 is defined, eight vectors (32 floats) are handled per iteration,
; otherwise four vectors (16 floats).
; NOTE(review): presumably m8 is only defined on x86-64 builds -- confirm in
; x86inc.
INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
movaps m0, [pf_1_24] ; four copies of 16777216.0f
.loop:
movaps m1, [srcq ]
movaps m2, [srcq+16 ]
movaps m3, [srcq+32 ]
movaps m4, [srcq+48 ]
%ifdef m8
movaps m5, [srcq+64 ]
movaps m6, [srcq+80 ]
movaps m7, [srcq+96 ]
movaps m8, [srcq+112]
%endif
mulps m1, m0
mulps m2, m0
mulps m3, m0
mulps m4, m0
%ifdef m8
mulps m5, m0
mulps m6, m0
mulps m7, m0
mulps m8, m0
%endif
cvtps2dq m1, m1
cvtps2dq m2, m2
cvtps2dq m3, m3
cvtps2dq m4, m4
%ifdef m8
cvtps2dq m5, m5
cvtps2dq m6, m6
cvtps2dq m7, m7
cvtps2dq m8, m8
%endif
movdqa [dstq ], m1
movdqa [dstq+16 ], m2
movdqa [dstq+32 ], m3
movdqa [dstq+48 ], m4
; len is a 32-bit unsigned int, so the upper half of its 64-bit register is
; undefined on x86-64. Count down on the dword view, as the 3DNow! and SSE
; versions do, so garbage upper bits cannot keep the loop running.
%ifdef m8
movdqa [dstq+64 ], m5
movdqa [dstq+80 ], m6
movdqa [dstq+96 ], m7
movdqa [dstq+112], m8
add srcq, 128
add dstq, 128
sub lend, 32
%else
add srcq, 64
add dstq, 64
sub lend, 16
%endif
ja .loop
REP_RET
;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------
; Horizontal add of the four dwords of an XMM register.
; On exit the low dword of the first argument holds d0+d1+d2+d3; the other
; lanes of the first argument and the whole tmp register hold intermediate
; values.
%macro PHADDD4 2 ; xmm src, xmm tmp
movhlps %2, %1 ; tmp.low64 = src.high64
paddd %1, %2 ; src.d0 = d0+d2, src.d1 = d1+d3
pshufd %2, %1, 0x1 ; tmp.d0 = src.d1
paddd %1, %2 ; src.d0 = (d0+d2) + (d1+d3)
%endmacro
; Computes the return value in two parts and adds them together:
;   1) all twelve 16-byte rows of mant_cnt (6 blocks x 16 uint16 counters) are
;      summed with paddw - even rows into m0, odd rows into m1 - and the
;      totals are multiplied by the ac3_bap_bits table and summed (pmaddwd +
;      PHADDD4); the result is parked in sumd.
;   2) the 8 bytes at byte offset 2 of each 32-byte block (counters 1..4) are
;      loaded two blocks per register, scaled per block with
;      pmulhuw by pw_bap_mul1, accumulated with saturating adds, and combined
;      with pw_bap_mul2 (pmaddwd + PHADDD4).
; NOTE(review): ac3_bap_bits, pw_bap_mul1 and pw_bap_mul2 are defined outside
; this view; presumably bits-per-mantissa and grouping factors - confirm.
; movdqa/paddw memory operands require mant_cnt to be 16-byte aligned.
INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
movdqa m0, [mant_cntq ]
movdqa m1, [mant_cntq+ 1*16]
paddw m0, [mant_cntq+ 2*16]
paddw m1, [mant_cntq+ 3*16]
paddw m0, [mant_cntq+ 4*16]
paddw m1, [mant_cntq+ 5*16]
paddw m0, [mant_cntq+ 6*16]
paddw m1, [mant_cntq+ 7*16]
paddw m0, [mant_cntq+ 8*16]
paddw m1, [mant_cntq+ 9*16]
paddw m0, [mant_cntq+10*16]
paddw m1, [mant_cntq+11*16]
pmaddwd m0, [ac3_bap_bits ]
pmaddwd m1, [ac3_bap_bits+16]
paddd m0, m1
PHADDD4 m0, m1
movd sumd, m0 ; sumd = first partial result
movdqa m3, [pw_bap_mul1]
; each register receives the counters 1..4 of two blocks (high and low half)
movhpd m0, [mant_cntq +2]
movlpd m0, [mant_cntq+1*32+2]
movhpd m1, [mant_cntq+2*32+2]
movlpd m1, [mant_cntq+3*32+2]
movhpd m2, [mant_cntq+4*32+2]
movlpd m2, [mant_cntq+5*32+2]
pmulhuw m0, m3
pmulhuw m1, m3
pmulhuw m2, m3
paddusw m0, m1
paddusw m0, m2
pmaddwd m0, [pw_bap_mul2]
PHADDD4 m0, m1
movd eax, m0 ; eax = second partial result
add eax, sumd ; return value = sum of both parts
RET
;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------
; Per-dword absolute value of the first argument, in place.
; With SSSE3 the native pabsd is used and the second argument is ignored.
; Otherwise the second argument is a scratch register that is overwritten:
; it becomes an all-ones mask for negative lanes, and (x ^ mask) - mask
; negates exactly those lanes.
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
pabsd %1, %1
%else ; src/dst, tmp
pxor %2, %2
pcmpgtd %2, %1 ; tmp = (0 > x) ? -1 : 0
pxor %1, %2
psubd %1, %2
%endif
%endmacro
; Emits ff_ac3_extract_exponents for the currently selected instruction set.
;
; For every int32 coefficient, an exponent byte is derived by taking the
; absolute value, converting (2*|coef| | 1) to float, and subtracting the
; float's biased exponent field from the constant in pd_151. Four coefficients
; are processed per iteration and four bytes are written to exp.
; mova is used on coef, so it must be 16-byte aligned; the loop body always
; runs at least once and steps by 4, so callers presumably pass a non-zero
; multiple of 4.
; NOTE(review): pd_1 and pd_151 are defined outside this view; presumably
; dword vectors holding 1 and 151 - confirm.
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; nb_coefs is a 32-bit int; widen it before it is used in 64-bit pointer
    ; arithmetic, since the upper register half is unspecified on x86-64.
    ; Expands to nothing on x86-32.
    movsxdifnidn lenq, lend
    ; point both buffers at their ends and count a negative index up to zero
    add     expq, lenq
    lea     coefq, [coefq+4*lenq]
    neg     lenq
    mova    m2, [pd_1]
    mova    m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova    m0, [coefq+4*lenq]
    ; absolute value
    PABSD   m0, m1
    ; convert to float and extract exponents
    pslld   m0, 1
    por     m0, m2                  ; force a non-zero value so the exponent is defined
    cvtdq2ps m1, m0
    psrld   m1, 23                  ; keep the biased exponent field
    mova    m0, m3
    psubd   m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ; result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ; clips this to 0, which is the correct exponent.
    packssdw m0, m0
    packuswb m0, m0
    movd    [expq+lenq], m0
    add     lenq, 4
    jl .loop
    REP_RET
%endmacro
; Emit one copy of ac3_extract_exponents per instruction set, each guarded by
; the corresponding build-time HAVE_*_EXTERNAL switch. The SSSE3 copy differs
; only in that PABSD expands to the native pabsd instruction.
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
; Reverse the order of the 16-bit words in a register, in place.
;   arg 1: register to reverse
;   arg 2: byte-shuffle mask register, only read on the SSSE3 non-Atom path
; Expands to nothing if none of the listed CPU flags is active.
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
pshufb %1, %2
%elif cpuflag(sse2)
pshuflw %1, %1, 0x1B ; reverse the 4 low words
pshufhw %1, %1, 0x1B ; reverse the 4 high words
pshufd %1, %1, 0x4E ; swap the two 64-bit halves
%elif cpuflag(mmxext)
pshufw %1, %1, 0x1B ; reverse the 4 words of the MMX register
%endif
%endmacro
; Per-word 16x16 fixed-point multiply with a 15-bit right shift, in place.
;   arg 1: destination and first factor
;   arg 2: second factor
;   arg 3: scratch register, clobbered on the non-SSSE3 path only
; The SSSE3 path rounds (pmulhrsw); the fallback path truncates.
%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
mova %3, %1
pmulhw %1, %2
pmullw %3, %2
psrlw %3, 15
psllw %1, 1
por %1, %3
%endif
%endmacro
; Emits apply_window_int16 (argument = 1, bit-exact) or
; apply_window_int16_round (argument = 0) for the current instruction set.
;
; The 4th C argument (len) is named "offset" here and is used directly as a
; byte offset; offset2 starts one vector below it. Each iteration handles one
; vector at offset2 (walking down) and one at offset (walking up). The same
; window vector is used for both, word-reversed for the upper one. The loop
; ends when offset2 underflows.
; NOTE(review): offsetq is read by the lea below before any 32-bit write
; zero-extends it; confirm callers provide clean upper bits on x86-64.
; All loads/stores use mova, so buffers presumably must be mmsize-aligned.
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
lea offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
mova m5, [pb_revwords] ; m5 = shuffle mask consumed by REVERSE_WORDS
ALIGN 16
%elif %1
mova m5, [pd_16384] ; m5 = rounding constant for the 32-bit path
%endif
.loop:
%if cpuflag(ssse3)
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
pmulhrsw m1, m0
REVERSE_WORDS m0, m5
pmulhrsw m0, [ inputq+offsetq ]
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m0
%elif %1
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
; The unpacks place window/input words in the high word of each dword. m0/m2
; are zeroed first so their low words are 0, which makes the stale low words
; of m1 irrelevant to the pmaddwd result.
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offset2q], m0
; second half: same computation with the window words reversed
REVERSE_WORDS m3
mova m4, [ inputq+offsetq]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
MUL16FIXED m1, m0, m3
REVERSE_WORDS m0
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
%endif
add offsetd, mmsize
sub offset2d, mmsize
jae .loop ; continue until offset2 borrows (goes below zero)
REP_RET
%endmacro
; Instantiate APPLY_WINDOW_INT16 for each supported instruction set.
; Argument 0 emits the apply_window_int16_round symbols (truncating multiply,
; not bit-identical to C); argument 1 emits the apply_window_int16 symbols.
INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
; SSSE3 on Atom: the pshufb path in REVERSE_WORDS is disabled for this one
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1

View File

@@ -0,0 +1,187 @@
;*****************************************************************************
;* x86-optimized AC-3 downmixing
;* Copyright (c) 2012 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
;******************************************************************************
;* This is based on the channel mixing asm in libavresample, but it is
;* simplified for only float coefficients and only 3 to 6 channels.
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; functions to downmix from 3 to 6 channels to mono or stereo
; void ff_ac3_downmix_*(float **samples, float **matrix, int len);
;-----------------------------------------------------------------------------
; Emits ac3_downmix_<in>_to_<out> for the current instruction set.
;
; Register/stack plan:
;   m0 (and m1 for stereo) accumulate the output channel(s);
;   m2 (and m3 for stereo) are scratch for the current input vector / FMA temp;
;   the remaining vector registers hold broadcast matrix coefficients, and any
;   coefficients that do not fit are spilled to aligned stack slots.
; The first argument (samples) is reused as the src0 pointer, the second
; (matrix) as src1/matrix0, and the output is written in place over channel 0
; (and channel 1 for stereo).
; len is converted to a byte count and used as a negative index that counts
; up to zero, one vector (mmsize bytes) per iteration.
%macro AC3_DOWNMIX 2 ; %1 = in channels, %2 = out channels
; define some names to make the code clearer
%assign in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1
; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if stereo
%assign needed_mmregs 4
%else
%assign needed_mmregs 3
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
%assign matrix_elements_mm matrix_elements
%endif
%assign total_mmregs needed_mmregs+matrix_elements_mm
%if matrix_elements_mm < matrix_elements
%assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
%assign matrix_elements_stack 0
%endif
cglobal ac3_downmix_%1_to_%2, 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, len, src2, src3, src4, src5
; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
mov matrix1q, [matrix0q+gprsize]
%endif
mov matrix0q, [matrix0q]
; define matrix coeff names
; mx_stack_<row>_<i> is 1 when the coefficient lives on the stack, 0 when it
; lives in a register; mx_<row>_<i> is the operand to use in either case.
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
%if %%i >= matrix_elements_mm
CAT_XDEFINE mx_stack_0_, %%i, 1
CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
%else
CAT_XDEFINE mx_stack_0_, %%i, 0
CAT_XDEFINE mx_0_, %%i, m %+ %%j
%assign %%j %%j+1
%endif
%assign %%i %%i+1
%endrep
%if stereo
%assign %%i 0
%rep in_channels
%if in_channels + %%i >= matrix_elements_mm
CAT_XDEFINE mx_stack_1_, %%i, 1
CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
%else
CAT_XDEFINE mx_stack_1_, %%i, 0
CAT_XDEFINE mx_1_, %%i, m %+ %%j
%assign %%j %%j+1
%endif
%assign %%i %%i+1
%endrep
%endif
; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
%if mx_stack_0_ %+ %%i
VBROADCASTSS m0, [matrix0q+4*%%i]
mova mx_0_ %+ %%i, m0
%else
VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
%endif
%if stereo
%if mx_stack_1_ %+ %%i
VBROADCASTSS m0, [matrix1q+4*%%i]
mova mx_1_ %+ %%i, m0
%else
VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
%endif
%endif
%assign %%i %%i+1
%endrep
; len (sample count) -> byte count; the 32-bit source operand also discards
; whatever was in the upper half of the argument register
lea lenq, [4*r2d]
; load channel pointers to registers
%assign %%i 1
%rep (in_channels - 1)
mov src %+ %%i %+ q, [src0q+%%i*gprsize]
add src %+ %%i %+ q, lenq
%assign %%i %%i+1
%endrep
mov src0q, [src0q]
add src0q, lenq
neg lenq
.loop:
; channel 0 initialises the accumulator(s)
%if stereo || mx_stack_0_0
mova m0, [src0q+lenq]
%endif
%if stereo
mulps m1, m0, mx_1_0
%endif
%if stereo || mx_stack_0_0
mulps m0, m0, mx_0_0
%else
mulps m0, mx_0_0, [src0q+lenq]
%endif
; remaining channels are multiply-accumulated on top
%assign %%i 1
%rep (in_channels - 1)
%define src_ptr src %+ %%i %+ q
; avoid extra load for mono if matrix is in a mm register
%if stereo || mx_stack_0_ %+ %%i
mova m2, [src_ptr+lenq]
%endif
%if stereo
FMULADD_PS m1, m2, mx_1_ %+ %%i, m1, m3
%endif
%if stereo || mx_stack_0_ %+ %%i
FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m2
%else
FMULADD_PS m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
%endif
%assign %%i %%i+1
%endrep
; results overwrite the first (and, for stereo, second) input channel
mova [src0q+lenq], m0
%if stereo
mova [src1q+lenq], m1
%endif
add lenq, mmsize
jl .loop
RET
%endmacro
; Emits every downmix function: for each input channel count from 3 to 6, a
; to-mono and a to-stereo variant for SSE, AVX and - when the build enables
; it via HAVE_FMA3_EXTERNAL - FMA3.
%macro AC3_DOWNMIX_FUNCS 0
%assign %%i 3
%rep 4
INIT_XMM sse
AC3_DOWNMIX %%i, 1
AC3_DOWNMIX %%i, 2
INIT_YMM avx
AC3_DOWNMIX %%i, 1
AC3_DOWNMIX %%i, 2
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
AC3_DOWNMIX %%i, 1
AC3_DOWNMIX %%i, 2
%endif
%assign %%i %%i+1
%endrep
%endmacro
AC3_DOWNMIX_FUNCS

164
externals/ffmpeg/libavcodec/x86/ac3dsp_init.c vendored Executable file
View File

@@ -0,0 +1,164 @@
/*
* x86-optimized AC-3 DSP functions
* Copyright (c) 2011 Justin Ruggles
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"
void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
/**
 * Install the x86 SIMD implementations of the AC-3 DSP routines.
 *
 * The instruction sets are visited from oldest to newest, so a pointer set by
 * an earlier tier is overwritten whenever a later, supported tier provides its
 * own version.
 *
 * @param c         DSP context whose function pointers are updated
 * @param bit_exact non-zero selects the variants that reproduce the C
 *                  reference output exactly
 */
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
    int cpu_flags = av_get_cpu_flags();

    /* MMX baseline */
    if (EXTERNAL_MMX(cpu_flags)) {
        c->ac3_rshift_int32      = ff_ac3_rshift_int32_mmx;
        c->ac3_lshift_int16      = ff_ac3_lshift_int16_mmx;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
        c->ac3_exponent_min      = ff_ac3_exponent_min_mmx;
    }

    /* pf2id truncates, so the 3DNow! conversion is not bit-exact */
    if (EXTERNAL_AMD3DNOW(cpu_flags) && !bit_exact)
        c->float_to_fixed24 = ff_float_to_fixed24_3dnow;

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->apply_window_int16    = bit_exact ? ff_apply_window_int16_mmxext
                                             : ff_apply_window_int16_round_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
        c->ac3_exponent_min      = ff_ac3_exponent_min_mmxext;
    }

    if (EXTERNAL_SSE(cpu_flags))
        c->float_to_fixed24 = ff_float_to_fixed24_sse;

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->extract_exponents     = ff_ac3_extract_exponents_sse2;
        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
        c->float_to_fixed24      = ff_float_to_fixed24_sse2;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
        c->ac3_exponent_min      = ff_ac3_exponent_min_sse2;
        if (bit_exact)
            c->apply_window_int16 = ff_apply_window_int16_sse2;
    }

    /* routines only installed when SSE2 is flagged as fast on this CPU */
    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
        if (!bit_exact)
            c->apply_window_int16 = ff_apply_window_int16_round_sse2;
        c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
        c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
        if (!(cpu_flags & AV_CPU_FLAG_ATOM)) {
            c->apply_window_int16 = ff_apply_window_int16_ssse3;
            c->extract_exponents  = ff_ac3_extract_exponents_ssse3;
        } else {
            /* Atom keeps the SSE2 exponent extraction and gets its own window routine */
            c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
        }
    }
}
/* Declare the assembly downmix entry points.
 * DOWNMIX_FUNC_OPT(ch, opt) declares the to-mono and to-stereo functions for
 * one input channel count and one instruction-set suffix; DOWNMIX_FUNCS(opt)
 * does so for 3, 4, 5 and 6 input channels. */
#define DOWNMIX_FUNC_OPT(ch, opt) \
void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples, \
float **matrix, int len); \
void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples, \
float **matrix, int len);
#define DOWNMIX_FUNCS(opt) \
DOWNMIX_FUNC_OPT(3, opt) \
DOWNMIX_FUNC_OPT(4, opt) \
DOWNMIX_FUNC_OPT(5, opt) \
DOWNMIX_FUNC_OPT(6, opt)
DOWNMIX_FUNCS(sse)
DOWNMIX_FUNCS(avx)
DOWNMIX_FUNCS(fma3)
/**
 * Pick an assembly downmix routine matching the context's channel layout.
 *
 * c->downmix is only touched when c->in_channels is 3..6 and a supported
 * instruction set is available; the to-mono function is used when
 * c->out_channels is 1, the to-stereo function otherwise. SSE is tried first,
 * then AVX and FMA3 override it unless the CPU is flagged AVXSLOW.
 */
void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

/* Install the <ch>-channel routine for one instruction set, if applicable. */
#define SET_DOWNMIX(ch, suf, SUF)                                       \
    if (ch == c->in_channels && EXTERNAL_ ## SUF (cpu_flags)) {         \
        c->downmix = (c->out_channels == 1)                             \
                   ? ff_ac3_downmix_ ## ch ## _to_1_ ## suf             \
                   : ff_ac3_downmix_ ## ch ## _to_2_ ## suf;            \
    }

/* Try every supported input channel count for one instruction set. */
#define SET_DOWNMIX_ALL(suf, SUF)                                       \
    SET_DOWNMIX(3, suf, SUF)                                            \
    SET_DOWNMIX(4, suf, SUF)                                            \
    SET_DOWNMIX(5, suf, SUF)                                            \
    SET_DOWNMIX(6, suf, SUF)

    SET_DOWNMIX_ALL(sse, SSE)

    if ((cpu_flags & AV_CPU_FLAG_AVXSLOW) == 0) {
        SET_DOWNMIX_ALL(avx,  AVX)
        SET_DOWNMIX_ALL(fma3, FMA3)
    }
}

133
externals/ffmpeg/libavcodec/x86/alacdsp.asm vendored Executable file
View File

@@ -0,0 +1,133 @@
;******************************************************************************
;* ALAC DSP SIMD optimizations
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; ff_alac_decorrelate_stereo (SSE4, uses pmulld).
; Per sample, with a = buf0[i] and b = buf1[i] as loaded:
;   t       = a - ((b * weight) >> shift)
;   buf1[i] = t
;   buf0[i] = b + t
; buf0 initially points at an array of two channel pointers. Eight samples
; (2 * mmsize bytes) are processed per iteration through a negative byte index
; that counts up to zero; mova requires both channels to be 16-byte aligned.
; On x86-32 only 3 GPRs are requested, so buf1 shares register r2 with the
; shift argument; shift and weight are read through their argument operands
; (shiftm/weightm) before that register is overwritten.
INIT_XMM sse4
%if ARCH_X86_64
cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
%else
cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
%define buf1q r2q
%endif
movd m6, shiftm ; m6 = shift count for psrad
movd m7, weightm
SPLATD m7 ; m7 = weight broadcast to all four dwords
shl lend, 2 ; sample count -> byte count
mov buf1q, [buf0q + gprsize] ; second channel pointer
mov buf0q, [buf0q] ; first channel pointer
add buf1q, lenq
add buf0q, lenq
neg lenq
align 16
.loop:
mova m0, [buf0q + lenq]
mova m1, [buf0q + lenq + mmsize]
mova m2, [buf1q + lenq]
mova m3, [buf1q + lenq + mmsize]
pmulld m4, m2, m7
pmulld m5, m3, m7
psrad m4, m6
psrad m5, m6
psubd m0, m4
psubd m1, m5
paddd m2, m0
paddd m3, m1
mova [buf1q + lenq], m0
mova [buf1q + lenq + mmsize], m1
mova [buf0q + lenq], m2
mova [buf0q + lenq + mmsize], m3
add lenq, mmsize*2
jl .loop
RET
; ff_alac_append_extra_bits_stereo (SSE2).
; For both channels: buf[i] = (buf[i] << extra_bits) | exbuf[i].
; The first two arguments point at arrays of two channel pointers each. The
; third C argument (extra_bits) is read through r2m before its register is
; reused for buf1; the fourth argument's register is reused for exbuf1.
; Eight samples per channel are processed per iteration through a negative
; byte index that counts up to zero; mova/por memory operands require 16-byte
; aligned buffers.
INIT_XMM sse2
cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
movifnidn lend, lenm
movd m4, r2m ; exbits
shl lend, 2 ; sample count -> byte count
mov buf1q, [buf0q + gprsize]
mov buf0q, [buf0q]
mov exbuf1q, [exbuf0q + gprsize]
mov exbuf0q, [exbuf0q]
add buf1q, lenq
add buf0q, lenq
add exbuf1q, lenq
add exbuf0q, lenq
neg lenq
align 16
.loop:
mova m0, [buf0q + lenq]
mova m1, [buf0q + lenq + mmsize]
pslld m0, m4
pslld m1, m4
mova m2, [buf1q + lenq]
mova m3, [buf1q + lenq + mmsize]
pslld m2, m4
pslld m3, m4
por m0, [exbuf0q + lenq]
por m1, [exbuf0q + lenq + mmsize]
por m2, [exbuf1q + lenq]
por m3, [exbuf1q + lenq + mmsize]
mova [buf0q + lenq ], m0
mova [buf0q + lenq + mmsize], m1
mova [buf1q + lenq ], m2
mova [buf1q + lenq + mmsize], m3
add lenq, mmsize*2
jl .loop
REP_RET
; ff_alac_append_extra_bits_mono.
; Single-channel counterpart of the stereo routine:
;   buf[i] = (buf[i] << extra_bits) | exbuf[i]
; Only the first pointer of each pointer array is dereferenced. The sample
; count is the fifth C argument (r4m). On x86-32 only 3 GPRs are requested, so
; len lives in r2 and extra_bits is read from its argument slot (r2m).
; NOTE(review): no INIT_* precedes this function, so it presumably inherits
; the SSE2 setup of the preceding function - confirm.
%if ARCH_X86_64
cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
%else
cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
%define exbitsm r2m
%endif
movifnidn lend, r4m
movd m2, exbitsm ; m2 = shift count for pslld
shl lend, 2 ; sample count -> byte count
mov bufq, [bufq]
mov exbufq, [exbufq]
add bufq, lenq
add exbufq, lenq
neg lenq
align 16
.loop:
mova m0, [bufq + lenq]
mova m1, [bufq + lenq + mmsize]
pslld m0, m2
pslld m1, m2
por m0, [exbufq + lenq]
por m1, [exbufq + lenq + mmsize]
mova [bufq + lenq], m0
mova [bufq + lenq + mmsize], m1
add lenq, mmsize*2
jl .loop
REP_RET

View File

@@ -0,0 +1,44 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/alacdsp.h"
#include "config.h"
void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
int decorr_shift, int decorr_left_weight);
void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
int extra_bits, int channels, int nb_samples);
void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
int extra_bits, int channels, int nb_samples);
/**
 * Install the x86 SIMD implementations of the ALAC DSP routines.
 *
 * Does nothing when the build has no external x86 assembler support.
 *
 * @param c DSP context whose function pointers are updated
 */
av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
{
#if HAVE_X86ASM
    const int cpu_flags = av_get_cpu_flags();

    /* stereo decorrelation needs SSE4 */
    if (EXTERNAL_SSE4(cpu_flags))
        c->decorrelate_stereo = ff_alac_decorrelate_stereo_sse4;

    /* index 0 = mono, index 1 = stereo */
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
        c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
    }
#endif /* HAVE_X86ASM */
}

174
externals/ffmpeg/libavcodec/x86/audiodsp.asm vendored Executable file
View File

@@ -0,0 +1,174 @@
;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; Emits ff_scalarproduct_int16 for the current instruction set and
; instantiates it for MMXEXT and SSE2.
; The function returns, in eax, the sum of v1[i] * v2[i] accumulated in 32-bit
; lanes. Two vectors (2 * mmsize bytes) are consumed per iteration through a
; negative byte index that counts up to zero. v1 is loaded unaligned (movu);
; v2 is used as a pmaddwd memory operand.
%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
add orderd, orderd ; element count -> byte count (also zero-extends)
add v1q, orderq
add v2q, orderq
neg orderq
pxor m2, m2 ; m2 = running sum
.loop:
movu m0, [v1q + orderq]
movu m1, [v1q + orderq + mmsize]
pmaddwd m0, [v2q + orderq]
pmaddwd m1, [v2q + orderq + mmsize]
paddd m2, m0
paddd m2, m1
add orderq, mmsize*2
jl .loop
HADDD m2, m0 ; horizontal add of the dword lanes of m2
movd eax, m2
%if mmsize == 8
emms ; MMX build only: restore x87 state
%endif
RET
%endmacro
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
; Emits vector_clip_int32 with the optional suffix for the current instruction
; set. min and max are broadcast into m4 and m5 (converted to float first when
; the 4th macro argument is set) and every loaded vector is clamped with
; CLIPD, using m6 as its extra operand. Each unrolled body handles 4 vectors,
; or 8 when the 3rd macro argument is set (m7-m10 are used in that case).
; Loads and stores use mova. The loop body always runs at least once, so len
; is presumably a non-zero multiple of the per-iteration element count.
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
cvtsi2ss m4, minm
cvtsi2ss m5, maxm
%else
movd m4, minm
movd m5, maxm
%endif
SPLATD m4
SPLATD m5
.loop:
%assign %%i 0
%rep %2
mova m0, [srcq + mmsize * (0 + %%i)]
mova m1, [srcq + mmsize * (1 + %%i)]
mova m2, [srcq + mmsize * (2 + %%i)]
mova m3, [srcq + mmsize * (3 + %%i)]
%if %3
mova m7, [srcq + mmsize * (4 + %%i)]
mova m8, [srcq + mmsize * (5 + %%i)]
mova m9, [srcq + mmsize * (6 + %%i)]
mova m10, [srcq + mmsize * (7 + %%i)]
%endif
CLIPD m0, m4, m5, m6
CLIPD m1, m4, m5, m6
CLIPD m2, m4, m5, m6
CLIPD m3, m4, m5, m6
%if %3
CLIPD m7, m4, m5, m6
CLIPD m8, m4, m5, m6
CLIPD m9, m4, m5, m6
CLIPD m10, m4, m5, m6
%endif
mova [dstq + mmsize * (0 + %%i)], m0
mova [dstq + mmsize * (1 + %%i)], m1
mova [dstq + mmsize * (2 + %%i)], m2
mova [dstq + mmsize * (3 + %%i)], m3
%if %3
mova [dstq + mmsize * (4 + %%i)], m7
mova [dstq + mmsize * (5 + %%i)], m8
mova [dstq + mmsize * (6 + %%i)], m9
mova [dstq + mmsize * (7 + %%i)], m10
%endif
%assign %%i (%%i + 4 * (1 + %3))
%endrep
; advance by the bytes handled above; len counts 4-byte elements
add srcq, mmsize*4*(%2+%3)
add dstq, mmsize*4*(%2+%3)
sub lend, mmsize*(%2+%3)
jg .loop
REP_RET
%endmacro
; Instantiate VECTOR_CLIP_INT32:
;   mmx       - 4 vectors per iteration, integer clamp
;   sse2 _int - 4 vectors per iteration, integer clamp
;   sse2      - 2 x 4 vectors per iteration, min/max converted to float
;   sse4      - 8 vectors per iteration when m8 is defined (16 XMM registers),
;               otherwise 4 vectors per iteration
INIT_MMX mmx
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
; void ff_vector_clipf_sse(float *dst, const float *src,
; int len, float min, float max)
; ff_vector_clipf_sse: dst[i] = min(max(src[i], min), max).
; min and max are broadcast into m0 and m1. Where they come from depends on
; the ABI: stack operands on x86-32; on WIN64 min arrives in xmm3 (swapped
; into m0) and max is read from memory; on 64-bit SysV they arrive in
; xmm0/xmm1.
; The buffers are walked from the end towards the start, four vectors
; (16 floats) per iteration, with len as the remaining element count. mova
; requires 16-byte aligned buffers, and the body always runs at least once,
; so len is presumably a non-zero multiple of 16.
INIT_XMM sse
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
%if ARCH_X86_32
VBROADCASTSS m0, minm
VBROADCASTSS m1, maxm
%elif WIN64
SWAP 0, 3
VBROADCASTSS m0, m0
VBROADCASTSS m1, maxm
%else ; 64bit sysv
VBROADCASTSS m0, m0
VBROADCASTSS m1, m1
%endif
movsxdifnidn lenq, lend ; widen the 32-bit int length for addressing
.loop:
mova m2, [srcq + 4 * lenq - 4 * mmsize]
mova m3, [srcq + 4 * lenq - 3 * mmsize]
mova m4, [srcq + 4 * lenq - 2 * mmsize]
mova m5, [srcq + 4 * lenq - 1 * mmsize]
maxps m2, m0
maxps m3, m0
maxps m4, m0
maxps m5, m0
minps m2, m1
minps m3, m1
minps m4, m1
minps m5, m1
mova [dstq + 4 * lenq - 4 * mmsize], m2
mova [dstq + 4 * lenq - 3 * mmsize], m3
mova [dstq + 4 * lenq - 2 * mmsize], m4
mova [dstq + 4 * lenq - 1 * mmsize], m5
sub lenq, mmsize ; mmsize floats (4 * mmsize bytes) were processed
jg .loop
RET

View File

@@ -0,0 +1,66 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/audiodsp.h"
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clipf_sse(float *dst, const float *src,
int len, float min, float max);
/**
 * Install the x86 SIMD implementations of the generic audio DSP routines.
 *
 * For each routine the tiers are applied from oldest to newest, so the newest
 * supported instruction set wins.
 *
 * @param c DSP context whose function pointers are updated
 */
av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
{
    const int cpu_flags = av_get_cpu_flags();

    /* float clipping only has an SSE version */
    if (EXTERNAL_SSE(cpu_flags)) {
        c->vector_clipf = ff_vector_clipf_sse;
    }

    if (EXTERNAL_MMX(cpu_flags)) {
        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        /* Atom gets the integer-compare variant instead of the float one */
        c->vector_clip_int32   = (cpu_flags & AV_CPU_FLAG_ATOM)
                               ? ff_vector_clip_int32_int_sse2
                               : ff_vector_clip_int32_sse2;
        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
    }

    if (EXTERNAL_SSE4(cpu_flags)) {
        c->vector_clip_int32 = ff_vector_clip_int32_sse4;
    }
}

88
externals/ffmpeg/libavcodec/x86/blockdsp.asm vendored Executable file
View File

@@ -0,0 +1,88 @@
;******************************************************************************
;* SIMD-optimized clear block functions
;* Copyright (c) 2002 Michael Niedermayer
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2009 Fiona Glaser
;*
;* AVX version by Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;----------------------------------------
; void ff_clear_block(int16_t *blocks);
;----------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline store loops
; Emits clear_block for the current instruction set: m0 is zeroed with the
; instruction named by the ZERO define, then stored 4 times per unrolled
; repetition. Every instantiation below writes 128 bytes in total
; (mmx: 4 reps x 4 x 8, sse: 2 x 4 x 16, avx: 1 x 4 x 32).
; mova is used, so blocks must be aligned to the vector size.
%macro CLEAR_BLOCK 2
cglobal clear_block, 1, 1, %1, blocks
ZERO m0, m0, m0
%assign %%i 0
%rep %2
mova [blocksq+mmsize*(0+%%i)], m0
mova [blocksq+mmsize*(1+%%i)], m0
mova [blocksq+mmsize*(2+%%i)], m0
mova [blocksq+mmsize*(3+%%i)], m0
%assign %%i %%i+4
%endrep
RET
%endmacro
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCK 0, 4
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCK 1, 2
; the AVX build keeps the xorps definition of ZERO from above
INIT_YMM avx
CLEAR_BLOCK 1, 1
;-----------------------------------------
; void ff_clear_blocks(int16_t *blocks);
;-----------------------------------------
; %1 = number of xmm registers used
; Emits clear_blocks for the current instruction set: zeroes 768 bytes
; starting at blocks. The pointer is advanced to the end of the region and a
; negative byte index counts up to zero, with 8 vector stores per iteration.
; mova is used, so blocks must be aligned to the vector size.
%macro CLEAR_BLOCKS 1
cglobal clear_blocks, 1, 2, %1, blocks, len
add blocksq, 768
mov lenq, -768
ZERO m0, m0, m0
.loop:
mova [blocksq+lenq+mmsize*0], m0
mova [blocksq+lenq+mmsize*1], m0
mova [blocksq+lenq+mmsize*2], m0
mova [blocksq+lenq+mmsize*3], m0
mova [blocksq+lenq+mmsize*4], m0
mova [blocksq+lenq+mmsize*5], m0
mova [blocksq+lenq+mmsize*6], m0
mova [blocksq+lenq+mmsize*7], m0
add lenq, mmsize*8
js .loop ; keep going while the index is still negative
RET
%endmacro
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCKS 0
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1
; the AVX build keeps the xorps definition of ZERO from above
INIT_YMM avx
CLEAR_BLOCKS 1

View File

@@ -0,0 +1,60 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/blockdsp.h"
#include "libavcodec/version.h"
void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_block_avx(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_clear_blocks_avx(int16_t *blocks);
/**
 * Install the x86 SIMD implementations of the block-clearing routines.
 *
 * The MMX versions are always eligible. The SSE and AVX versions are skipped
 * when XvMC hardware decoding is active, because its block buffers may not be
 * 16-byte aligned.
 *
 * @param c     DSP context whose function pointers are updated
 * @param avctx codec context, consulted for the active hwaccel
 */
av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
                                  AVCodecContext *avctx)
{
#if HAVE_X86ASM
    const int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->clear_blocks = ff_clear_blocks_mmx;
        c->clear_block  = ff_clear_block_mmx;
    }

    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
    if (!CONFIG_XVMC || !avctx->hwaccel || !avctx->hwaccel->decode_mb) {
        if (EXTERNAL_SSE(cpu_flags)) {
            c->clear_blocks = ff_clear_blocks_sse;
            c->clear_block  = ff_clear_block_sse;
        }
        if (EXTERNAL_AVX_FAST(cpu_flags)) {
            c->clear_blocks = ff_clear_blocks_avx;
            c->clear_block  = ff_clear_block_avx;
        }
    }
#endif /* HAVE_X86ASM */
}

159
externals/ffmpeg/libavcodec/x86/bswapdsp.asm vendored Executable file
View File

@@ -0,0 +1,159 @@
;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; pshufb control mask: inside every group of four bytes the order is reversed
; (3,2,1,0), i.e. a 32-bit byte swap applied to four dwords at once.
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
; NOTE(review): pb_80 is not referenced by the code visible in this file.
cextern pb_80
SECTION .text
; BSWAP_LOOPS: vectorised part of bswap32_buf, expanded once per alignment case.
; %1 = aligned/unaligned ('a' or 'u': appended to "mov" to select mova/movu and
;      to the local labels so that both expansions can live in one function)
; On entry (set up by BSWAP32_BUF): r0 = dst, r1 = src, r2d = w (dword count);
; on SSSE3/AVX2 builds m2 holds the pb_bswap32 shuffle mask.
; On exit: r0/r1 point past everything handled here and r2d = w again, so the
; code at the caller's .left label only has to deal with the last (w & 3) dwords.
; Uses r3d (copy of w) and m0-m3.
%macro BSWAP_LOOPS 1
mov r3d, r2d ; keep the full count for the tail tests below
sar r2d, 3 ; r2d = w / 8
jz .left4_%1 ; fewer than 8 dwords: no main loop
%if cpuflag(avx2)
sar r2d, 1 ; the ymm loop consumes 16 dwords per iteration
jz .left8_%1
%endif
.loop8_%1: ; main loop: two vector registers (2*mmsize bytes) per iteration
mov%1 m0, [r1 + 0]
mov%1 m1, [r1 + mmsize]
%if cpuflag(ssse3)||cpuflag(avx2)
pshufb m0, m2
pshufb m1, m2
mov%1 [r0 + 0], m0
mov%1 [r0 + mmsize], m1
%else
; SSE2 fallback: 10110001b swaps the two 16-bit halves of every dword ...
pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b
pshufhw m0, m0, 10110001b
pshufhw m1, m1, 10110001b
; ... and the shift/or pair swaps the two bytes inside every 16-bit word.
mova m2, m0
mova m3, m1
psllw m0, 8
psllw m1, 8
psrlw m2, 8
psrlw m3, 8
por m2, m0
por m3, m1
mov%1 [r0 + 0], m2
mov%1 [r0 + 16], m3
%endif
add r0, mmsize*2
add r1, mmsize*2
dec r2d
jnz .loop8_%1
%if cpuflag(avx2)
.left8_%1: ; AVX2 only: one more full register if bit 3 of w is set
mov r2d, r3d
test r3d, 8
jz .left4_%1
mov%1 m0, [r1]
pshufb m0, m2
mov%1 [r0 + 0], m0
add r1, mmsize
add r0, mmsize
%endif
.left4_%1: ; one 16-byte chunk (4 dwords) if bit 2 of w is set
mov r2d, r3d ; restore w for the caller's tail code
test r3d, 4
jz .left ; .left is defined by the macro that expands this one
mov%1 xm0, [r1]
%if cpuflag(ssse3)
pshufb xm0, xm2
mov%1 [r0], xm0
%else
pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b
mova m2, m0
psllw m0, 8
psrlw m2, 8
por m2, m0
mov%1 [r0], m2
%endif
add r1, 16
add r0, 16
%endmacro
; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
; (exported with the instruction-set suffix of the current INIT_*)
; Writes to dst the w 32-bit words of src with their bytes reversed.
; r0 = dst, r1 = src, r2d = w, r3 = scratch.
; The bulk of the work is done by BSWAP_LOOPS, expanded twice: once with
; unaligned and once with aligned loads/stores; the aligned copy is used only
; when both pointers are multiples of mmsize.
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
mov r3, r1
VBROADCASTI128 m2, [pb_bswap32] ; m2 = byte-reversal mask for pshufb
%else
cglobal bswap32_buf, 3,4,5
mov r3, r1
%endif
or r3, r0 ; low bits are set if either pointer is misaligned
test r3, mmsize - 1
jz .start_align
BSWAP_LOOPS u
jmp .left
.start_align:
BSWAP_LOOPS a
.left: ; r2d = w here; only the last (w & 3) dwords are left
%if cpuflag(ssse3)
test r2d, 2 ; two dwords left: handle them with one 8-byte shuffle
jz .left1
movq xm0, [r1]
pshufb xm0, xm2
movq [r0], xm0
add r1, 8
add r0, 8
.left1:
test r2d, 1 ; last single dword: scalar bswap
jz .end
mov r2d, [r1]
bswap r2d
mov [r0], r2d
%else
and r2d, 3 ; scalar loop over the remaining 0..3 dwords
jz .end
.loop2:
mov r3d, [r1]
bswap r3d
mov [r0], r3d
add r1, 4
add r0, 4
dec r2d
jnz .loop2
%endif
.end:
RET
%endmacro
; One copy of bswap32_buf per instruction set; the AVX2 copy is only assembled
; when the build has HAVE_AVX2_EXTERNAL enabled.
INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BSWAP32_BUF
%endif

View File

@@ -0,0 +1,40 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/bswapdsp.h"
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
/**
 * Install the x86 assembly implementation of bswap_buf into @p c.
 *
 * The candidates are tested from the oldest to the newest instruction set
 * and each match overwrites the previous one, so the last match wins.
 */
av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{
    const int flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(flags)) {
        c->bswap_buf = ff_bswap32_buf_sse2;
    }
    if (EXTERNAL_SSSE3(flags)) {
        c->bswap_buf = ff_bswap32_buf_ssse3;
    }
    if (EXTERNAL_AVX2_FAST(flags)) {
        c->bswap_buf = ff_bswap32_buf_avx2;
    }
}

301
externals/ffmpeg/libavcodec/x86/cabac.h vendored Executable file
View File

@@ -0,0 +1,301 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CABAC_H
#define AVCODEC_X86_CABAC_H
#include "libavcodec/cabac.h"
#include "libavutil/attributes.h"
#include "libavutil/macros.h"
#include "libavutil/x86/asm.h"
#include "config.h"
/*
 * BROKEN_COMPILER is 1 for toolchains on which the inline-asm readers in this
 * header are compiled out (they are guarded by !BROKEN_COMPILER):
 *  - clang older than 2.10 when targeting i386,
 *  - a non-clang LLVM front end that reports GCC 4.2.0 or 4.2.1,
 *  - the Intel compiler when _MSC_VER is also defined.
 */
#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
|| ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
|| (defined(__INTEL_COMPILER) && defined(_MSC_VER))
# define BROKEN_COMPILER 1
#else
# define BROKEN_COMPILER 0
#endif
#if HAVE_INLINE_ASM
/* Unless the includer chose otherwise, the reader is unchecked exactly when
 * CONFIG_SAFE_BITSTREAM_READER is off. */
#ifndef UNCHECKED_BITSTREAM_READER
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
#endif
/*
 * END_CHECK(end): asm fragment placed by BRANCHLESS_GET_CABAC right before the
 * instruction that advances the bytestream pointer. It compares the pointer
 * held in FF_REG_c with the operand `end` and jumps to local label 1 when the
 * pointer is >= end (signed jge), which skips the advance.
 * It expands to an empty string for the unchecked reader.
 */
#if UNCHECKED_BITSTREAM_READER
#define END_CHECK(end) ""
#else
#define END_CHECK(end) \
"cmp "end" , %%"FF_REG_c" \n\t"\
"jge 1f \n\t"
#endif
#ifdef BROKEN_RELOCATIONS
/* Extra asm input operand: the local variable `tables` passed in a register. */
#define TABLES_ARG , "r"(tables)
/*
 * BRANCHLESS_GET_CABAC_UPDATE, BROKEN_RELOCATIONS variants.
 *
 * State on entry, as prepared by BRANCHLESS_GET_CABAC:
 *   "range" = value read from the LPS range table (RangeLPS),
 *   %ecx    = old range - RangeLPS,
 *   "tmp"   = %ecx << 17.
 *
 * HAVE_FAST_CMOV version: compares "tmp" with "low"; cmova replaces "range"
 * with %ecx when tmp > low (unsigned); sbb turns the borrow into a mask in
 * %rcx that is all ones when tmp < low; that mask gates the subtraction of
 * "tmp" from "low" and is xor-ed into "retq".
 *
 * Fallback version: the mask is the sign of (tmp - low) spread over the
 * register (sar $31); "range" becomes %ecx + ((RangeLPS - %ecx) & mask),
 * "low" loses ((%ecx << 17) & mask), the mask is xor-ed into "ret" and the
 * result is sign-extended into "retq" so it can be used as a 64-bit index.
 */
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%rcx , %%rcx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%rcx , "retq" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
/* P4 Prescott has crappy cmov,sbb,64-bit shift so avoid them */ \
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t"\
"sub %%ecx , "range" \n\t"\
"and "tmp" , "range" \n\t"\
"add %%ecx , "range" \n\t"\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"\
"movslq "ret" , "retq" \n\t"
#endif /* HAVE_FAST_CMOV */
/*
 * BRANCHLESS_GET_CABAC, BROKEN_RELOCATIONS variant: asm body used by
 * get_cabac_inline_x86(). All table accesses are relative to the register
 * passed as "tables". %ecx/%rcx is scratch throughout.
 *
 * Steps, in program order:
 *  1. ret = *statep; the byte at lps_off + ret + 2*(range & 0xC0) in the
 *     tables becomes the new "range" (RangeLPS); tmp = old range - RangeLPS.
 *  2. %ecx = tmp, tmp <<= 17, then BRANCHLESS_GET_CABAC_UPDATE selects the
 *     resulting range/low and conditionally flips ret.
 *  3. The byte at norm_off + range gives a shift count; range and low are
 *     both shifted left by it.
 *  4. *statep = byte at mlps_off + 128 + ret.
 *  5. If the low 16 bits of low are non-zero the macro is done (label 2).
 *     Otherwise two bytes are read at the bytestream pointer ("byte"), the
 *     pointer is advanced by 2 unless END_CHECK jumps over the advance, and
 *     the byte-swapped value, minus 0xFFFF and shifted by an amount looked up
 *     through norm_off, is added to low.
 */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"lea ("ret", "range", 2), %%ecx \n\t"\
"movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#else /* BROKEN_RELOCATIONS */
/* Without BROKEN_RELOCATIONS the tables are referenced by symbol name inside
 * the asm text, so the symbol is passed through the named-constraints helper
 * instead of a register operand. */
#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
/* NOTE(review): RIP_ARG expands to nothing and is not used in the visible
 * part of this header. */
#define RIP_ARG
/*
 * BRANCHLESS_GET_CABAC_UPDATE, variants used without BROKEN_RELOCATIONS.
 *
 * State on entry: "range" = value read from the LPS range table (RangeLPS),
 * "tmp" = old range - RangeLPS. Both versions start by copying "tmp" to %ecx
 * and shifting "tmp" left by 17.
 *
 * HAVE_FAST_CMOV version: compares "tmp" with "low"; cmova replaces "range"
 * with %ecx when tmp > low (unsigned); sbb builds a mask in %ecx that is all
 * ones when tmp < low; the mask gates the subtraction of "tmp" from "low" and
 * is xor-ed into "ret".
 *
 * Fallback version: the inline comments give the intermediate values; the
 * mask (lps_mask) is the sign of (tmp - low) spread over the register.
 */
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%ecx , %%ecx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t" /*lps_mask*/\
"sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
"and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
"add %%ecx , "range" \n\t" /*new range*/\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
/*
 * BRANCHLESS_GET_CABAC, variant used without BROKEN_RELOCATIONS: asm body of
 * get_cabac_inline_x86(). The tables are addressed through the symbol
 * ff_h264_cabac_tables plus a stringified offset; the retq, rangeq and tables
 * parameters are accepted for signature compatibility but not referenced.
 * %ecx is scratch throughout.
 *
 * Steps, in program order:
 *  1. ret = *statep; the byte at lps_off + ret + 2*(range & 0xC0) becomes the
 *     new "range" (RangeLPS); tmp = old range - RangeLPS.
 *  2. BRANCHLESS_GET_CABAC_UPDATE selects the resulting range/low and
 *     conditionally flips ret.
 *  3. The byte at norm_off + range gives a shift count; range and low are
 *     both shifted left by it.
 *  4. *statep = byte at mlps_off + 128 + ret.
 *  5. If the low 16 bits of low are non-zero the macro is done (label 2).
 *     Otherwise two bytes are read at the bytestream pointer ("byte"), the
 *     pointer is advanced by 2 unless END_CHECK jumps over the advance, and
 *     the byte-swapped value, minus 0xFFFF and shifted by an amount looked up
 *     through norm_off, is added to low.
 */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#endif /* BROKEN_RELOCATIONS */
#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
/**
 * x86 inline-asm replacement for get_cabac_inline().
 *
 * Runs BRANCHLESS_GET_CABAC on the context @p c and the state byte @p state.
 * The asm updates c->low, c->range, *state and, through the "memory" clobber,
 * the bytestream pointer stored inside @p c.
 *
 * @return the lowest bit of the value the asm leaves in `bit`
 */
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
{
int bit, tmp;
#ifdef BROKEN_RELOCATIONS
/* The table address is materialised in a register first because the main
 * asm addresses the tables relative to that register in this configuration. */
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
/* Operand map: %0 = bit, %1 = c->low, %2 = c->range, %3 = tmp (byte
 * addressable, "q"), %4 = state, %5 = c, %6/%7 = offsets of bytestream and
 * bytestream_end inside CABACContext, %8 = the operand added by TABLES_ARG.
 * The trailing "1"/"2" inputs preload %1 and %2 with the current values. */
__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
"%2", "%q2", "%3", "%b3",
"%c6(%5)", "%c7(%5)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
: "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
,"1"(c->low), "2"(c->range)
: "%"FF_REG_c, "memory"
);
return bit & 1;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
/**
 * x86 inline-asm replacement for get_cabac_bypass_sign().
 *
 * Operand map: %0 = val (pinned to %ecx), %1 = tmp, %2 = c,
 * %c3/%c4/%c5/%c6 = offsets of low / bytestream / bytestream_end / range
 * inside CABACContext.
 *
 * Sequence:
 *  - tmp = range << 17; %eax = 2*low - tmp; cdq spreads the sign of %eax
 *    into %edx (all ones when the subtraction went negative).
 *  - %eax += tmp & mask, which undoes the subtraction in the negative case.
 *  - val = (val ^ mask) - mask, i.e. val is negated when the mask is set.
 *  - If the low 16 bits of %eax are zero, two bytes are read from the
 *    bytestream, byte-swapped, shifted right by 15 and added to
 *    %eax - 0xFFFF; the stored bytestream pointer is advanced by 2 (in the
 *    checked build only while the pointer is below bytestream_end).
 *  - %eax is written back to c->low.
 *
 * @return val, negated or unchanged as described above
 */
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"xor %%edx, %%ecx \n\t"
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
#if UNCHECKED_BITSTREAM_READER
"add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
"mov %1, %c4(%2) \n\t"
#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "+c"(val), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
return val;
}
#define get_cabac_bypass get_cabac_bypass_x86
/**
 * x86 inline-asm replacement for get_cabac_bypass().
 *
 * Operand map: %0 = res (pinned to %edx), %1 = tmp, %2 = c,
 * %c3/%c4/%c5/%c6 = offsets of low / bytestream / bytestream_end / range
 * inside CABACContext.
 *
 * Sequence:
 *  - tmp = range << 17; %eax = 2*low - tmp; cdq spreads the sign of %eax
 *    into %edx (all ones when the subtraction went negative).
 *  - %eax += tmp & mask, which undoes the subtraction in the negative case.
 *  - inc %edx turns the mask into the result: 0 when the subtraction went
 *    negative, 1 otherwise.
 *  - If the low 16 bits of %eax are zero, two bytes are read from the
 *    bytestream, byte-swapped, shifted right by 15 and added to
 *    %eax - 0xFFFF; the stored bytestream pointer is advanced by 2 only
 *    while the pointer is below bytestream_end. Unlike the sign variant,
 *    this end check is not conditional on UNCHECKED_BITSTREAM_READER.
 *  - %eax is written back to c->low.
 *
 * @return 0 or 1 as described above
 */
static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
{
x86_reg tmp;
int res;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"inc %%edx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%ecx \n\t"
"bswap %%ecx \n\t"
"shrl $15, %%ecx \n\t"
"addl %%ecx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "=&d"(res), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%ecx", "memory"
);
return res;
}
#endif /* !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */

463
externals/ffmpeg/libavcodec/x86/cavsdsp.c vendored Executable file
View File

@@ -0,0 +1,463 @@
/*
* Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
*
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/cavsdsp.h"
#include "libavcodec/idctdsp.h"
#include "constants.h"
#include "fpel.h"
#include "idctdsp.h"
#include "config.h"
#if HAVE_MMX_EXTERNAL
void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
LOCAL_ALIGNED(16, int16_t, b2, [64]);
ff_cavs_idct8_mmx(b2, block);
ff_add_pixels_clamped_mmx(b2, dst, stride);
}
void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
LOCAL_ALIGNED(16, int16_t, b2, [64]);
ff_cavs_idct8_sse2(b2, block);
ff_add_pixels_clamped_sse2(b2, dst, stride);
}
#endif /* HAVE_MMX_EXTERNAL */
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
/*****************************************************************************
*
* motion compensation
*
****************************************************************************/
/* vertical filter [-1 -2 96 42 -7 0] */
/*
 * Asm text for one output row. A..E are MMX registers holding five
 * consecutive input rows as 16-bit words; F receives the next row, loaded
 * from (%0) and widened to words.
 * Result: (-A - 2*B + MUL1*C + MUL2*D - 7*E + ADD) >> 7, packed to bytes and
 * written as 4 bytes to (%1) through OP, which may use A as its temporary.
 * Afterwards %0 has advanced by %2 and %1 by %3.
 * B and E are scaled in place and shifted back before the macro ends.
 * mm6 is the accumulator; mm7 is used as scratch and is zero on exit.
 */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
"psllw $3, "#E" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $3, "#E" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#E", %%mm6 \n\t"\
"paddw "#B", "#B" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $1, "#B" \n\t"\
"psubw "#A", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -1 5 5 -1 0] */
/*
 * Asm text for one output row. B..E are MMX registers holding consecutive
 * input rows as 16-bit words; F receives the next row, loaded from (%0).
 * Result: (MUL1*(C + D) - B - E + ADD) >> 3, packed to bytes and written as
 * 4 bytes to (%1) through OP, which may use A as its temporary.
 * Afterwards %0 has advanced by %2 and %1 by %3.
 * mm7 must already be zero on entry (it is only read here); MUL2 is unused.
 */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -7 42 96 -2 -1] */
/*
 * Asm text for one output row. B..E are MMX registers holding consecutive
 * input rows as 16-bit words; F receives the next row, loaded from (%0) and
 * widened to words, and takes part in the sum.
 * Result: (-7*B + MUL2*C + MUL1*D - 2*E - F + ADD) >> 7, packed to bytes and
 * written as 4 bytes to (%1) through OP, which may use A as its temporary.
 * Afterwards %0 has advanced by %2 and %1 by %3.
 * B and E are scaled in place and shifted back before the macro ends.
 * mm6 is the accumulator; mm7 is used as scratch and is zero on exit.
 */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm7\n\t"\
"psllw $3, "#B" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $3, "#B" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#B", %%mm6 \n\t"\
"paddw "#E", "#E" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $1, "#E" \n\t"\
"psubw "#F", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/*
 * Function body for the vertical filters. Expects dst, src, dstStride,
 * srcStride and h to be in scope (it modifies src and dst).
 *
 * The block is processed as two 4-pixel-wide columns (w = 2). For each column:
 *  - src starts two rows above the block; five rows are loaded and widened
 *    to 16-bit words in mm0..mm4;
 *  - eight VOP expansions, with the register roles rotated by one each time,
 *    produce eight output rows;
 *  - when h == 16 a second asm statement produces eight more rows, relying on
 *    the MMX registers left by the first statement;
 *  - src and dst are then rewound by the rows consumed ((h+5) source rows,
 *    h destination rows) and moved 4 pixels to the right.
 */
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
}\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}
/*
 * QPEL_CAVS(OPNAME, OP, MMX): defines the filter functions for one
 * combination of store operator and instruction-set suffix:
 *   <OPNAME>cavs_qpel8_h_<MMX>            8x8 horizontal filter
 *   <OPNAME>cavs_qpel8or16_v{1,2,3}_<MMX> vertical filters, 8 pixels wide,
 *                                         h rows (bodies from QPEL_CAVSVNUM)
 *   <OPNAME>cavs_qpel{8,16}_v{1,2,3}_<MMX> and <OPNAME>cavs_qpel16_h_<MMX>,
 *                                         built from the functions above
 *
 * The horizontal filter computes, for each of 8 pixels in each of 8 rows,
 * (5*(p[0] + p[1]) - (p[-1] + p[2]) + 4) >> 3, saturated to a byte, and
 * writes the row through OP (which may use mm5 as its temporary).
 *
 * The last line of the definition deliberately carries no line continuation,
 * so the macro ends with its own text whatever line follows it.
 */
#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 1(%0), %%mm2 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "pmullw %%mm6, %%mm0 \n\t"\
        "pmullw %%mm6, %%mm1 \n\t"\
        "movq -1(%0), %%mm2 \n\t"\
        "movq 2(%0), %%mm4 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "movq %%mm4, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm5, %%mm1 \n\t"\
        "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
        "paddw %%mm5, %%mm0 \n\t"\
        "paddw %%mm5, %%mm1 \n\t"\
        "psraw $3, %%mm0 \n\t"\
        "psraw $3, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm5, q) \
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
          NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
        : "memory"\
    );\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
    QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
    QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
    QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}
/*
 * CAVS_MC(OPNAME, SIZE, MMX): defines the single-stride entry points
 * <OPNAME>cavs_qpel<SIZE>_mc{20,01,02,03}_<MMX>, which forward to the _h_,
 * _v1_, _v2_ and _v3_ filters respectively, passing `stride` as both the
 * destination and the source stride.
 *
 * The last line of the definition deliberately carries no line continuation,
 * so the macro ends with its own text whatever line follows it.
 */
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}
/* Store operator: a plain mov<size> of a to b; temp is unused. */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* Store operator: loads b into temp, combines it into a with pavgusb, then
 * stores a back to b. Clobbers temp and a. */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
/* Store operator: loads b into temp, combines it into a with pavgb, then
 * stores a back to b. Clobbers temp and a. */
#define AVG_MMXEXT_OP(a, b, temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
#if HAVE_MMX_EXTERNAL
/* mc00 entry for 8-wide put: forwards to ff_put_pixels8_mmx() with 8 rows. */
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    const int rows = 8;

    ff_put_pixels8_mmx(dst, src, stride, rows);
}
/* mc00 entry for 8-wide avg: forwards to ff_avg_pixels8_mmx() with 8 rows. */
static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    const int rows = 8;

    ff_avg_pixels8_mmx(dst, src, stride, rows);
}
/* mc00 entry for 8-wide avg: forwards to ff_avg_pixels8_mmxext() with 8 rows. */
static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    const int rows = 8;

    ff_avg_pixels8_mmxext(dst, src, stride, rows);
}
/* mc00 entry for 16-wide put: forwards to ff_put_pixels16_mmx() with 16 rows. */
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
{
    const int rows = 16;

    ff_put_pixels16_mmx(dst, src, stride, rows);
}
/* mc00 entry for 16-wide avg: forwards to ff_avg_pixels16_mmx() with 16 rows. */
static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
{
    const int rows = 16;

    ff_avg_pixels16_mmx(dst, src, stride, rows);
}
/* mc00 entry for 16-wide avg: forwards to ff_avg_pixels16_mmxext() with 16 rows. */
static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride)
{
    const int rows = 16;

    ff_avg_pixels16_mmxext(dst, src, stride, rows);
}
/* mc00 entry for 16-wide put: forwards to ff_put_pixels16_sse2() with 16 rows. */
static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
{
    const int rows = 16;

    ff_put_pixels16_sse2(dst, src, stride, rows);
}
/* mc00 entry for 16-wide avg: forwards to ff_avg_pixels16_sse2() with 16 rows. */
static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
{
    const int rows = 16;

    ff_avg_pixels16_sse2(dst, src, stride, rows);
}
#endif
/* Installs the MMX mc00 copy/average functions and the MMX IDCT into `c`.
 * Does nothing when the build lacks external MMX assembly; avctx is unused. */
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                     AVCodecContext *avctx)
{
#if HAVE_MMX_EXTERNAL
    /* inverse transform and the coefficient permutation it is paired with */
    c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
    c->cavs_idct8_add = cavs_idct8_add_mmx;

    /* table row 0 takes the 16-wide functions, row 1 the 8-wide ones */
    c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
    c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
    c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
/*
 * DSPFUNC(PFX, IDX, NUM, EXT): expands to four assignments on the context `c`
 * that fill entries 2, 4, 8 and 12 of row IDX of c-><PFX>_cavs_qpel_pixels_tab
 * with the <PFX>_cavs_qpel<NUM>_mc{20,01,02,03}_<EXT> functions.
 *
 * The last line of the definition deliberately carries no line continuation,
 * so the macro ends with its own text whatever line follows it.
 */
#define DSPFUNC(PFX, IDX, NUM, EXT) \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT;
#if HAVE_MMXEXT_INLINE
/* MMXEXT instantiations: filter functions for put (plain store) and avg
 * (pavgb store), then the mc20/mc01/mc02/mc03 entry points for both block
 * sizes. */
QPEL_CAVS(put_, PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
/* 3DNow! instantiations: filter functions for put (plain store) and avg
 * (pavgusb store), then the mc20/mc01/mc02/mc03 entry points for both block
 * sizes. */
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
/* Installs the 3DNow! sub-pel functions (table entries 2, 4, 8 and 12) for
 * both block sizes; row 0 is the 16-wide set, row 1 the 8-wide set.
 * avctx is unused. */
static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
                                       AVCodecContext *avctx)
{
    /* 16x16 */
    DSPFUNC(put, 0, 16, 3dnow);
    DSPFUNC(avg, 0, 16, 3dnow);
    /* 8x8 */
    DSPFUNC(put, 1, 8, 3dnow);
    DSPFUNC(avg, 1, 8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */
/**
 * Fill @p c with the x86-optimised CAVS DSP functions supported by the
 * running CPU and enabled in the build.
 *
 * The stages run in a fixed order and later stages overwrite entries set by
 * earlier ones: MMX baseline, 3DNow! inline, MMXEXT inline, MMXEXT external
 * (avg mc00 only), SSE2 external (16-wide mc00 and the IDCT).
 */
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
    av_unused int flags = av_get_cpu_flags();

    if (X86_MMX(flags)) {
        cavsdsp_init_mmx(c, avctx);
    }

#if HAVE_AMD3DNOW_INLINE
    if (INLINE_AMD3DNOW(flags)) {
        cavsdsp_init_3dnow(c, avctx);
    }
#endif /* HAVE_AMD3DNOW_INLINE */

#if HAVE_MMXEXT_INLINE
    if (INLINE_MMXEXT(flags)) {
        DSPFUNC(put, 0, 16, mmxext);
        DSPFUNC(put, 1, 8, mmxext);
        DSPFUNC(avg, 0, 16, mmxext);
        DSPFUNC(avg, 1, 8, mmxext);
    }
#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_MMX_EXTERNAL
    if (EXTERNAL_MMXEXT(flags)) {
        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
    }
#endif /* HAVE_MMX_EXTERNAL */

#if HAVE_SSE2_EXTERNAL
    if (EXTERNAL_SSE2(flags)) {
        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
        c->cavs_idct8_add = cavs_idct8_add_sse2;
        c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
    }
#endif /* HAVE_SSE2_EXTERNAL */
}

211
externals/ffmpeg/libavcodec/x86/cavsidct.asm vendored Executable file
View File

@@ -0,0 +1,211 @@
; Chinese AVS video (AVS1-P2, JiZhun profile) decoder
; Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
;
; MMX-optimized DSP functions, based on H.264 optimizations by
; Michael Niedermayer and Loren Merritt
; Conversion from gcc syntax to x264asm syntax with modifications
; by Ronald S. Bultje <rsbultje@gmail.com>
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with FFmpeg; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
%include "libavutil/x86/x86util.asm"
cextern pw_4
cextern pw_64
SECTION .text
; CAVS_IDCT8_1D: one 1-D pass of the 8-point inverse transform on 16-bit lanes.
; %1 = base address of the eight source rows (row stride 16 bytes)
; %2 = operand added as rounding bias to the even part
; %3 = init_load (default 1). When 0, the odd rows are expected in registers
;      instead of memory: src1 in m5, src3 in m1, src5 in m2, src7 in m6
;      (the two SWAPs rename m1/m6 to the m7/m4 the code below uses).
;      Rows 0, 2, 4 and 6 are always read from memory.
; Output (not yet shifted): m7 = dst0, m5 = dst1, m3 = dst2, m1 = dst3,
;                           m0 = dst4, m2 = dst5, m4 = dst6, m6 = dst7.
; All eight registers m0-m7 are used.
%macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
%if %3 == 1
mova m4, [%1+7*16] ; m4 = src7
mova m5, [%1+1*16] ; m5 = src1
mova m2, [%1+5*16] ; m2 = src5
mova m7, [%1+3*16] ; m7 = src3
%else
SWAP 1, 7
SWAP 4, 6
%endif
mova m0, m4
mova m3, m5
mova m6, m2
mova m1, m7
paddw m4, m4 ; m4 = 2*src7
paddw m3, m3 ; m3 = 2*src1
paddw m6, m6 ; m6 = 2*src5
paddw m1, m1 ; m1 = 2*src3
paddw m0, m4 ; m0 = 3*src7
paddw m5, m3 ; m5 = 3*src1
paddw m2, m6 ; m2 = 3*src5
paddw m7, m1 ; m7 = 3*src3
psubw m5, m4 ; m5 = 3*src1 - 2*src7 = a0
paddw m7, m6 ; m7 = 3*src3 + 2*src5 = a1
psubw m1, m2 ; m1 = 2*src3 - 3*src5 = a2
paddw m3, m0 ; m3 = 2*src1 + 3*src7 = a3
mova m4, m5
mova m6, m7
mova m0, m3
mova m2, m1
SUMSUB_BA w, 7, 5 ; m7 = a0 + a1, m5 = a0 - a1
paddw m7, m3 ; m7 = a0 + a1 + a3
paddw m5, m1 ; m5 = a0 - a1 + a2
paddw m7, m7
paddw m5, m5
paddw m7, m6 ; m7 = b4
paddw m5, m4 ; m5 = b5
SUMSUB_BA w, 1, 3 ; m1 = a3 + a2, m3 = a3 - a2
psubw m4, m1 ; m4 = a0 - a2 - a3
mova m1, m4 ; m1 = a0 - a2 - a3
psubw m3, m6 ; m3 = a3 - a2 - a1
paddw m1, m1
paddw m3, m3
psubw m1, m2 ; m1 = b7
paddw m3, m0 ; m3 = b6
mova m2, [%1+2*16] ; m2 = src2
mova m6, [%1+6*16] ; m6 = src6
mova m4, m2
mova m0, m6
psllw m4, 2 ; m4 = 4*src2
psllw m6, 2 ; m6 = 4*src6
paddw m2, m4 ; m2 = 5*src2
paddw m0, m6 ; m0 = 5*src6
paddw m2, m2
paddw m0, m0
psubw m4, m0 ; m4 = 4*src2 - 10*src6 = a7
paddw m6, m2 ; m6 = 4*src6 + 10*src2 = a6
mova m2, [%1+0*16] ; m2 = src0
mova m0, [%1+4*16] ; m0 = src4
SUMSUB_BA w, 0, 2 ; m0 = src0 + src4, m2 = src0 - src4
psllw m0, 3
psllw m2, 3
paddw m0, %2 ; add rounding bias
paddw m2, %2 ; add rounding bias
SUMSUB_BA w, 6, 0 ; m6 = a4 + a6, m0 = a4 - a6
SUMSUB_BA w, 4, 2 ; m4 = a5 + a7, m2 = a5 - a7
SUMSUB_BA w, 7, 6 ; m7 = dst0, m6 = dst7
SUMSUB_BA w, 5, 4 ; m5 = dst1, m4 = dst6
SUMSUB_BA w, 3, 2 ; m3 = dst2, m2 = dst5
SUMSUB_BA w, 1, 0 ; m1 = dst3, m0 = dst4
%endmacro
; void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in)
; Two-pass 8x8 inverse transform with 64-bit registers. A register holds four
; 16-bit lanes, so each pass runs twice (cnt = 2), once per half of the block.
; 8*16 bytes of stack hold the intermediate block.
INIT_MMX mmx
cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
mov cntd, 2
mov tmpq, rsp
.loop_1: ; pass 1: bias pw_4, shift 3, result stored transposed in tmp
CAVS_IDCT8_1D inq, [pw_4]
psraw m7, 3
psraw m6, 3
psraw m5, 3
psraw m4, 3
psraw m3, 3
psraw m2, 3
psraw m1, 3
psraw m0, 3
mova [tmpq], m7 ; spill dst0 so m7 can serve as transpose scratch
TRANSPOSE4x4W 0, 2, 4, 6, 7 ; dst4..dst7
mova [tmpq+1*8], m0 ; odd 8-byte slots receive the dst4..dst7 quarter
mova [tmpq+3*8], m2
mova [tmpq+5*8], m4
mova [tmpq+7*8], m6
mova m7, [tmpq] ; reload dst0
TRANSPOSE4x4W 7, 5, 3, 1, 0 ; dst0..dst3
mova [tmpq+0*8], m7 ; even 8-byte slots receive the dst0..dst3 quarter
mova [tmpq+2*8], m5
mova [tmpq+4*8], m3
mova [tmpq+6*8], m1
add inq, mmsize ; next four input lanes
add tmpq, 64 ; next four 16-byte rows of the intermediate block
dec cntd
jg .loop_1
mov cntd, 2
mov tmpq, rsp
.loop_2: ; pass 2: bias pw_64, shift 7, rows written straight to out
CAVS_IDCT8_1D tmpq, [pw_64]
psraw m7, 7
psraw m6, 7
psraw m5, 7
psraw m4, 7
psraw m3, 7
psraw m2, 7
psraw m1, 7
psraw m0, 7
mova [outq+0*16], m7 ; registers hold dst0..dst7 in the order 7,5,3,1,0,2,4,6
mova [outq+1*16], m5
mova [outq+2*16], m3
mova [outq+3*16], m1
mova [outq+4*16], m0
mova [outq+5*16], m2
mova [outq+6*16], m4
mova [outq+7*16], m6
add outq, mmsize ; next four lanes of every row
add tmpq, mmsize
dec cntd
jg .loop_2
RET
; void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in)
; Two-pass 8x8 inverse transform with 128-bit registers: a register holds a
; full row of eight 16-bit lanes, so each pass runs once.
; 8*16 bytes of stack are requested; the size is written as a negative value.
; NOTE(review): per the x86inc cglobal documentation a negative stack size
; asks it not to reserve an extra register for the original stack pointer -
; confirm against the x86inc.asm in this tree.
; x86-64 declares a ninth vector register (m8) as transpose scratch; 32-bit
; builds pass stack slots to TRANSPOSE8x8W instead.
INIT_XMM sse2
cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in
CAVS_IDCT8_1D inq, [pw_4]
psraw m7, 3
psraw m6, 3
psraw m5, 3
psraw m4, 3
psraw m3, 3
psraw m2, 3
psraw m1, 3
psraw m0, 3
%if ARCH_X86_64
TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, 8
mova [rsp+4*16], m0 ; row 4 for the second pass
%else
mova [rsp+0*16], m4
; NOTE(review): this form presumably leaves row 4 in [rsp+4*16] - confirm
; against the TRANSPOSE8x8W definition in x86util.asm.
TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1
%endif
; The second pass reads rows 0, 2, 4, 6 from the stack and takes the odd rows
; from registers (init_load = 0), so only the even rows are stored here.
mova [rsp+0*16], m7
mova [rsp+2*16], m3
mova [rsp+6*16], m4
CAVS_IDCT8_1D rsp, [pw_64], 0
psraw m7, 7
psraw m6, 7
psraw m5, 7
psraw m4, 7
psraw m3, 7
psraw m2, 7
psraw m1, 7
psraw m0, 7
mova [outq+0*16], m7 ; registers hold dst0..dst7 in the order 7,5,3,1,0,2,4,6
mova [outq+1*16], m5
mova [outq+2*16], m3
mova [outq+3*16], m1
mova [outq+4*16], m0
mova [outq+5*16], m2
mova [outq+6*16], m4
mova [outq+7*16], m6
RET

View File

@@ -0,0 +1,43 @@
/*
* Opus encoder assembly optimizations
* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/opus_pvq.h"
/* Assembly implementations of the PVQ search (see the matching .asm file). */
extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N);

/**
 * Install the best available x86 PVQ search routine into the CeltPVQ context.
 *
 * Preference order: exact AVX, then approximate SSE4, then approximate SSE2.
 * When none is available s->pvq_search is left untouched.
 */
av_cold void ff_celt_pvq_init_x86(CeltPVQ *s)
{
    const int flags = av_get_cpu_flags();

    if (EXTERNAL_AVX_FAST(flags)) {
        s->pvq_search = ff_pvq_search_exact_avx;
    } else if (EXTERNAL_SSE4(flags)) {
        s->pvq_search = ff_pvq_search_approx_sse4;
    } else if (EXTERNAL_SSE2(flags)) {
        s->pvq_search = ff_pvq_search_approx_sse2;
    }
}

View File

@@ -0,0 +1,385 @@
;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "config.asm"
%include "libavutil/x86/x86util.asm"
%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif
SECTION_RODATA 64
; Clears the sign bit of every 32-bit lane (abs() for floats).
const_float_abs_mask: times 8 dd 0x7fffffff
; All-zero vector that must stay directly after const_float_abs_mask:
; an unaligned load from [const_align_abs_edge + pad - mmsize] returns the abs
; mask with its last "pad" bytes zeroed. PVQ_FAST_SEARCH uses that to mask
; away the elements it over-reads past the end of the input.
const_align_abs_edge: times 8 dd 0
const_float_0_5: times 8 dd 0.5
const_float_1: times 8 dd 1.0
; Only the sign bit of every 32-bit lane.
const_float_sign_mask: times 8 dd 0x80000000
; Eight dwords 0, 4, 8, ... 28: each entry is its own byte offset from the label,
; i.e. the per-lane byte offset that is OR-ed into max_idx in PULSES_SEARCH.
const_int32_offsets:
%rep 8
dd $-const_int32_offsets
%endrep
SECTION .text
;
; Setup High Register to be used
; for holding memory constants
;
; %1 - the move instruction used to load the constant (e.g. movaps, movdqa)
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Defines the alias mm_<const_name>.
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" when no more than 8 vector registers are available (32 bit arch) or
; "addps m0, m8" when more than 8 are available (64 bit arch)
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
%define mm_%3 %2
%{1} %2, [%3] ; movaps m8, [const_name]
%else
%define mm_%3 [%3]
%endif
%endmacro
;
; Set Position Independent Code
; Base address of a constant
; %1 - the instruction used to load the address (e.g. lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Defines the alias pic_base_<constant_name>.
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if text relocations (textrel) are used
%macro SET_PIC_BASE 3; op, reg, const_label
%ifdef PIC
%{1} %2, [%3] ; lea r5, [rip+const]
%define pic_base_%3 %2
%else
%define pic_base_%3 %3
%endif
%endmacro
;
; Add or remove one pulse: find the position whose change gives the best
; distortion metric p, then update Sxy, Syy and Y[] for that position.
;
; %1 - "add" or "sub"; selects the search variant and is also pasted into
; the update instructions (addps/subps).
;
; In/out: m6 = Syy_norm, m7 = Sxy_norm (both broadcast to all lanes).
; Uses tmpX, tmpY, Nd and the mm_const_* aliases of the enclosing function.
; Clobbers: m0-m5, r4.
;
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
pxor m1, m1 ; max_idx
xorps m3, m3 ; p_max
xor r4d, r4d
align 16
%%distortion_search:
movd xm2, dword r4d ; movd zero extends
%ifidn %1,add
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
%if USE_APPROXIMATION == 1
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
%endif
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
%if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
%endif
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
xorps m0, m0 ; m0 = 0;
cmpps m0, m0, m5, 1 ; m0 = (0<y)
subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
andps m5, m0 ; (0<y)?m5:0
%endif
%if USE_APPROXIMATION == 1
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
mulps m5, m5
divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)
cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
maxps m3, m5 ; m3=max(p_max,p)
; maxps here is faster than blendvps, despite blend having lower latency.
pand m2, m0 ; This version seems faster than sse41 pblendvb
pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4
add r4d, mmsize
cmp r4d, Nd
jb %%distortion_search
por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing
%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)
vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
BLENDVPS xm3, xm5, xm0 ; p = m0 ? p[1x128] : p[0x128]
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
%endif
; Merge parallel maximums round 4 (2 vs 2)
; m3=p[3210]
movhlps xm5, xm3 ; m5=p[xx32]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
pshufd xm2, xm1, q3232
BLENDVPS xm3, xm5, xm0 ; p = m0 ? p[3,2] : p[1,0]
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
; Merge parallel maximums final round (1 vs 1)
shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
cmpss xm0, xm3, 5 ; predicate 5 = NLT: m0 = !(m0 < m3) = !( p[1] < p[0] )
pshufd xm2, xm1, q1111
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[1] : max_idx[0]
movd dword r4d, xm1 ; zero extends to the rest of r4q
VBROADCASTSS m3, [tmpX + r4]
%{1}ps m7, m3 ; Sxy += X[max_idx]
VBROADCASTSS m5, [tmpY + r4]
%{1}ps m6, m5 ; Syy += Y[max_idx]
; We have to update a single element in Y[i]
; However writing 4 bytes and then doing 16 byte load in the inner loop
; could cause a stall due to breaking write forwarding.
VPBROADCASTD m1, xm1
pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it
and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
%endmacro
;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
; (it is added to the general purpose register count of pvq_search).
;
%ifdef PIC
%define num_pic_regs 1
%else
%define num_pic_regs 0
%endif
;
; Pyramid Vector Quantization Search implementation
;
; %1 - suffix appended to the function name (pvq_search%1).
;
; float * inX - Unaligned (SIMD) access, it will be overread,
; but extra data is masked away.
; int32 * outY - Should be aligned and padded buffer.
; It is used as temp buffer.
; uint32 K - Number of pulses to have after quantizations.
; uint32 N - Number of vector elements. Must be 0 < N < 256
; (K and N are declared as int in the C prototypes.)
;
; Returns (float) Syy, the sum of y[i]*y[i]; 1.0 when the input is all zero.
; abs(X) is kept in a 256*4 byte stack buffer (tmpX); Y is built as floats
; in outY (tmpY) and converted to signed integers at the end.
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq
movaps m0, [const_float_abs_mask]
shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
mov r4d, Nd
neg r4d
and r4d, mmsize-1 ; r4 = number of padding bytes needed to reach a multiple of mmsize
SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
movups m2, [pic_base_const_align_abs_edge + r4 - mmsize] ; abs mask with the padding lanes zeroed
add Nd, r4d ; N = align(N, mmsize)
lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
movups m1, [inXq + r4]
andps m1, m2
movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )
align 16
%%loop_abs_sum:
sub r4d, mmsize
jc %%end_loop_abs_sum
movups m2, [inXq + r4]
andps m2, m0
movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
addps m1, m2 ; Sx += abs(X[i])
jmp %%loop_abs_sum
align 16
%%end_loop_abs_sum:
HSUMPS m1, m2 ; m1 = Sx
xorps m0, m0
comiss xm0, xm1 ;
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
divss xm0, xm1 ; b = K/Sx
; b = K/max_x
%endif
VBROADCASTSS m0, xm0
lea r4d, [Nd - mmsize]
pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
movaps m1, [tmpX + r4] ; m1 = X[i]
mulps m2, m0, m1 ; m2 = res*X[i]
cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
paddd m5, m2 ; Sy += yt
cvtdq2ps m2, m2 ; yt = (float)yt
mulps m1, m2 ; m1 = X[i]*yt
movaps [tmpY + r4], m2 ; y[i] = m2
addps m7, m1 ; Sxy += m1;
mulps m2, m2 ; m2 = yt*yt
addps m6, m2 ; Syy += m2
sub r4d, mmsize
jnc %%loop_guess
HSUMPS m6, m1 ; Syy_norm
HADDD m5, m4 ; pulses
movd dword r4d, xm5 ; zero extends to the rest of r4q
sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
jz %%finish ; K - pulses == 0
SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in distortion parameter calculations.
; Saves pre and post-calculation to correct Y[] values.
; Same precision, since float mantissa is normalized.
; The SQRT approximation does differ.
HSUMPS m7, m0 ; Sxy_norm
mulps m6, mm_const_float_0_5
; The carry tested below is still the borrow of "sub Kd, r4d" above; this
; relies on the vector instructions in between leaving EFLAGS untouched.
jc %%remove_pulses_loop ; K - pulses < 0
align 16 ; K - pulses > 0
%%add_pulses_loop:
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd, 1
jnz %%add_pulses_loop
addps m6, m6 ; Syy*=2
jmp %%finish
align 16
%%remove_pulses_loop:
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd, 1
jnz %%remove_pulses_loop
addps m6, m6 ; Syy*=2
align 16
%%finish:
lea r4d, [Nd - mmsize]
movaps m2, [const_float_sign_mask]
align 16
%%restore_sign_loop:
movaps m0, [tmpY + r4] ; m0 = Y[i]
movups m1, [inXq + r4] ; m1 = X[i]
andps m1, m2 ; m1 = sign(X[i])
orps m0, m1 ; m0 = Y[i]*sign
cvtps2dq m3, m0 ; m3 = (int)m0
movaps [outYq + r4], m3
sub r4d, mmsize
jnc %%restore_sign_loop
%%return:
%if ARCH_X86_64 == 0 ; sbrdsp
movss r0m, xm6 ; return (float)Syy_norm
fld dword r0m ; x86-32: push the result onto the x87 stack
%else
movaps m0, m6 ; return (float)Syy_norm
%endif
RET
align 16
%%zero_input:
; All-zero input: clear the output and return 1.0.
lea r4d, [Nd - mmsize]
xorps m0, m0
%%zero_loop:
movaps [outYq + r4], m0
sub r4d, mmsize
jnc %%zero_loop
movaps m6, [const_float_1]
jmp %%return
%endmacro
; USE_APPROXIMATION selects the arithmetic used inside the search macros.
; If 1, use the reciprocal (rcpss/rsqrtps) approximations, which give about
; half precision but execute in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
; Instantiates pvq_search_approx for sse2 and sse4.
%define USE_APPROXIMATION 1
INIT_XMM sse2
PVQ_FAST_SEARCH _approx
INIT_XMM sse4
PVQ_FAST_SEARCH _approx
; Full precision variant: instantiates pvq_search_exact for avx.
%define USE_APPROXIMATION 0
INIT_XMM avx
PVQ_FAST_SEARCH _exact

94
externals/ffmpeg/libavcodec/x86/constants.c vendored Executable file
View File

@@ -0,0 +1,94 @@
/*
* MMX/SSE/AVX constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
/*
 * Naming: ff_pw_N holds the value N in every 16-bit word, ff_pb_N in every
 * byte and ff_pd_N in every 32-bit dword. xmm_reg constants carry two 64-bit
 * initializers, ymm_reg constants four, and uint64_t constants one.
 */

/* 16-bit word constants */
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
/* NOTE(review): ff_pw_17 has no extern in constants.h; presumably referenced only from assembly. */
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
0x0100010001000100ULL, 0x0100010001000100ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
0x0200020002000200ULL, 0x0200020002000200ULL };
/* NOTE(review): ff_pw_1019 has no extern in constants.h; presumably referenced only from assembly. */
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
0x0400040004000400ULL, 0x0400040004000400ULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
0x0800080008000800ULL, 0x0800080008000800ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
0x2000200020002000ULL, 0x2000200020002000ULL };
/* All bits set, i.e. -1 in every word. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
/* Byte constants (the suffix of ff_pb_15, ff_pb_80, ff_pb_FE and ff_pb_FC matches the hex byte value or its decimal equivalent as written). */
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL,
0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL,
0x0202020202020202ULL, 0x0202020202020202ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
0x0303030303030303ULL, 0x0303030303030303ULL };
/* NOTE(review): ff_pb_15 is an xmm_reg with 32-byte alignment and has no extern in constants.h; presumably referenced only from assembly. */
DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL,
0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
/* Bit 31 set in every 32-bit lane (the sign bit of a single-precision float). */
DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
/* 32-bit dword constants */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
0x0000000100000001ULL, 0x0000000100000001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
0x0000001000000010ULL, 0x0000001000000010ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
0x0000002000000020ULL, 0x0000002000000020ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
0x0000200000002000ULL, 0x0000200000002000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };

72
externals/ffmpeg/libavcodec/x86/constants.h vendored Executable file
View File

@@ -0,0 +1,72 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H
#include <stdint.h>
#include "libavutil/x86/asm.h"
/* Constants defined in constants.c; the type gives the vector width of each. */

/* 16-bit word constants */
extern const ymm_reg ff_pw_1;
extern const ymm_reg ff_pw_2;
extern const xmm_reg ff_pw_3;
extern const ymm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
extern const xmm_reg ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const ymm_reg ff_pw_255;
extern const ymm_reg ff_pw_256;
extern const ymm_reg ff_pw_512;
extern const ymm_reg ff_pw_1023;
extern const ymm_reg ff_pw_1024;
extern const ymm_reg ff_pw_2048;
extern const ymm_reg ff_pw_4095;
extern const ymm_reg ff_pw_4096;
extern const ymm_reg ff_pw_8192;
extern const ymm_reg ff_pw_m1;
/* byte constants */
extern const ymm_reg ff_pb_0;
extern const ymm_reg ff_pb_1;
extern const ymm_reg ff_pb_2;
extern const ymm_reg ff_pb_3;
extern const ymm_reg ff_pb_80;
extern const ymm_reg ff_pb_FE;
extern const uint64_t ff_pb_FC;
/* bit 31 set in every 32-bit lane */
extern const xmm_reg ff_ps_neg;
/* 32-bit dword constants */
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
extern const ymm_reg ff_pd_8192;
extern const ymm_reg ff_pd_65535;
#endif /* AVCODEC_X86_CONSTANTS_H */

301
externals/ffmpeg/libavcodec/x86/dcadsp.asm vendored Executable file
View File

@@ -0,0 +1,301 @@
;******************************************************************************
;* SIMD-optimized functions for the DCA decoder
;* Copyright (C) 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
%define sizeof_float 4
; Extra byte step per inner iteration for the fma3 build: that variant stores
; 16 bytes (movaps) per iteration instead of 8 (movlps).
%define FMA3_OFFSET (8 * cpuflag(fma3))
;
; lfe_fir0_float(samples, lfe, coeff, nblocks)
;
; Register arguments: samples (output floats), lfe (int32 input samples),
; coeff (filter coefficients, read with aligned loads), nblocks.
; cnt1 and cnt2 are scratch counters. Uses 12 vector registers (16 for fma3).
;
; The outer loop runs nblocks/2 times. Each iteration converts 8 consecutive
; lfe samples to float, writes 64 output floats and advances lfe by one sample.
; The inner loop fills the output from both ends: cnt1 walks up from
; -32*sizeof_float to 0 while cnt2 walks down from the top of the first half.
;
%macro LFE_FIR0_FLOAT 0
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
shr nblocksd, 1
sub lfeq, 7*sizeof_float
mov cnt1d, 32*sizeof_float
mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
lea coeffq, [coeffq+cnt1q*8]
add samplesq, cnt1q
neg cnt1q
.loop:
; Load 8 lfe samples as floats: m5 = low four, m4 = high four;
; m6 / m7 are the lane-reversed copies of m5 / m4.
%if cpuflag(avx)
cvtdq2ps m4, [lfeq+16]
cvtdq2ps m5, [lfeq ]
shufps m7, m4, m4, q0123
shufps m6, m5, m5, q0123
%elif cpuflag(sse2)
movu m4, [lfeq+16]
movu m5, [lfeq ]
cvtdq2ps m4, m4
cvtdq2ps m5, m5
pshufd m7, m4, q0123
pshufd m6, m5, q0123
%else
; Plain SSE: convert two samples at a time with cvtpi2ps and merge the halves.
cvtpi2ps m4, [lfeq+16]
cvtpi2ps m0, [lfeq+24]
cvtpi2ps m5, [lfeq ]
cvtpi2ps m1, [lfeq+8 ]
shufps m4, m0, q1010
shufps m5, m1, q1010
shufps m7, m4, m4, q0123
shufps m6, m5, m5, q0123
%endif
.inner_loop:
; First half: reversed samples (m7, m6) times the coefficients, stored at samples+cnt1.
%if ARCH_X86_64
; x86-64 keeps the coefficients in m8+ so the second half can reuse them.
movaps m8, [coeffq+cnt1q*8 ]
movaps m9, [coeffq+cnt1q*8+16]
movaps m10, [coeffq+cnt1q*8+32]
movaps m11, [coeffq+cnt1q*8+48]
%if cpuflag(fma3)
movaps m12, [coeffq+cnt1q*8+64]
movaps m13, [coeffq+cnt1q*8+80]
movaps m14, [coeffq+cnt1q*8+96]
movaps m15, [coeffq+cnt1q*8+112]
mulps m0, m7, m8
mulps m1, m7, m10
mulps m2, m7, m12
mulps m3, m7, m14
fmaddps m0, m6, m9, m0
fmaddps m1, m6, m11, m1
fmaddps m2, m6, m13, m2
fmaddps m3, m6, m15, m3
haddps m0, m1
haddps m2, m3
haddps m0, m2
movaps [samplesq+cnt1q], m0
%else
mulps m0, m7, m8
mulps m1, m6, m9
mulps m2, m7, m10
mulps m3, m6, m11
addps m0, m1
addps m2, m3
; Horizontal sums of m0 and m2 without haddps; two results are stored.
unpckhps m3, m0, m2
unpcklps m0, m2
addps m3, m0
movhlps m2, m3
addps m2, m3
movlps [samplesq+cnt1q], m2
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
mulps m0, m7, [coeffq+cnt1q*8 ]
mulps m1, m7, [coeffq+cnt1q*8+32 ]
mulps m2, m7, [coeffq+cnt1q*8+64 ]
mulps m3, m7, [coeffq+cnt1q*8+96 ]
fmaddps m0, m6, [coeffq+cnt1q*8+16 ], m0
fmaddps m1, m6, [coeffq+cnt1q*8+48 ], m1
fmaddps m2, m6, [coeffq+cnt1q*8+80 ], m2
fmaddps m3, m6, [coeffq+cnt1q*8+112], m3
haddps m0, m1
haddps m2, m3
haddps m0, m2
movaps [samplesq+cnt1q], m0
%else
mulps m0, m7, [coeffq+cnt1q*8 ]
mulps m1, m6, [coeffq+cnt1q*8+16]
mulps m2, m7, [coeffq+cnt1q*8+32]
mulps m3, m6, [coeffq+cnt1q*8+48]
addps m0, m1
addps m2, m3
unpckhps m3, m0, m2
unpcklps m0, m2
addps m3, m0
movhlps m2, m3
addps m2, m3
movlps [samplesq+cnt1q], m2
%endif
%endif; ARCH
; Second half: non-reversed samples (m5, m4) times the same coefficients,
; combined in the opposite order and stored at samples+cnt2.
%if ARCH_X86_64
%if cpuflag(fma3)
mulps m8, m5
mulps m10, m5
mulps m12, m5
mulps m14, m5
fmaddps m8, m4, m9, m8
fmaddps m10, m4, m11, m10
fmaddps m12, m4, m13, m12
fmaddps m14, m4, m15, m14
haddps m10, m8
haddps m14, m12
haddps m14, m10
movaps [samplesq+cnt2q], m14
%else
mulps m8, m5
mulps m9, m4
mulps m10, m5
mulps m11, m4
addps m8, m9
addps m10, m11
unpckhps m11, m10, m8
unpcklps m10, m8
addps m11, m10
movhlps m8, m11
addps m8, m11
movlps [samplesq+cnt2q], m8
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
mulps m0, m5, [coeffq+cnt1q*8 ]
mulps m1, m5, [coeffq+cnt1q*8+32 ]
mulps m2, m5, [coeffq+cnt1q*8+64 ]
mulps m3, m5, [coeffq+cnt1q*8+96 ]
fmaddps m0, m4, [coeffq+cnt1q*8+16 ], m0
fmaddps m1, m4, [coeffq+cnt1q*8+48 ], m1
fmaddps m2, m4, [coeffq+cnt1q*8+80 ], m2
fmaddps m3, m4, [coeffq+cnt1q*8+112], m3
haddps m1, m0
haddps m3, m2
haddps m3, m1
movaps [samplesq+cnt2q], m3
%else
mulps m0, m5, [coeffq+cnt1q*8 ]
mulps m1, m4, [coeffq+cnt1q*8+16]
mulps m2, m5, [coeffq+cnt1q*8+32]
mulps m3, m4, [coeffq+cnt1q*8+48]
addps m0, m1
addps m2, m3
unpckhps m3, m2, m0
unpcklps m2, m0
addps m3, m2
movhlps m0, m3
addps m0, m3
movlps [samplesq+cnt2q], m0
%endif
%endif; ARCH
sub cnt2d, 8 + FMA3_OFFSET
add cnt1q, 8 + FMA3_OFFSET
jl .inner_loop
; Next block: advance lfe by one sample and the output by 64 floats, reset the counters.
add lfeq, 4
add samplesq, 64*sizeof_float
mov cnt1q, -32*sizeof_float
mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
sub nblocksd, 1
jg .loop
RET
%endmacro
; Instantiate lfe_fir0_float per instruction set.
; The plain SSE version is only built for x86-32.
%if ARCH_X86_32
INIT_XMM sse
LFE_FIR0_FLOAT
%endif
INIT_XMM sse2
LFE_FIR0_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR0_FLOAT
%endif
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif
;
; lfe_fir1_float(samples, lfe, coeff, nblocks)
;
; Same register arguments as lfe_fir0_float; cnt1 and cnt2 are scratch
; counters. Uses 10 vector registers.
;
; The outer loop runs nblocks/4 times. Each iteration converts 4 consecutive
; lfe samples to float, writes 128 output floats and advances lfe by one sample.
; The inner loop stores 4 floats at samples+cnt1 (walking up to 0) and
; 4 floats at samples+cnt2 (walking down) per iteration.
;
%macro LFE_FIR1_FLOAT 0
cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
shr nblocksd, 2
sub lfeq, 3*sizeof_float
mov cnt1d, 64*sizeof_float
mov cnt2d, 64*sizeof_float-16
lea coeffq, [coeffq+cnt1q*4]
add samplesq, cnt1q
neg cnt1q
.loop:
; m4 = four lfe samples as floats, m5 = the same samples lane-reversed.
%if cpuflag(avx)
cvtdq2ps m4, [lfeq]
shufps m5, m4, m4, q0123
%elif cpuflag(sse2)
movu m4, [lfeq]
cvtdq2ps m4, m4
pshufd m5, m4, q0123
%endif
.inner_loop:
; First half: reversed samples (m5) times four coefficient vectors.
movaps m6, [coeffq+cnt1q*4 ]
movaps m7, [coeffq+cnt1q*4+16]
mulps m0, m5, m6
mulps m1, m5, m7
%if ARCH_X86_64
; x86-64 keeps the last two coefficient vectors in m8/m9 for reuse below.
movaps m8, [coeffq+cnt1q*4+32]
movaps m9, [coeffq+cnt1q*4+48]
mulps m2, m5, m8
mulps m3, m5, m9
%else
mulps m2, m5, [coeffq+cnt1q*4+32]
mulps m3, m5, [coeffq+cnt1q*4+48]
%endif
; Reduce the four products to one vector holding the four horizontal sums.
haddps m0, m1
haddps m2, m3
haddps m0, m2
movaps [samplesq+cnt1q], m0
; Second half: non-reversed samples (m4) times the same coefficients.
mulps m6, m4
mulps m7, m4
%if ARCH_X86_64
mulps m8, m4
mulps m9, m4
haddps m6, m7
haddps m8, m9
haddps m6, m8
%else
mulps m2, m4, [coeffq+cnt1q*4+32]
mulps m3, m4, [coeffq+cnt1q*4+48]
haddps m6, m7
haddps m2, m3
haddps m6, m2
%endif
movaps [samplesq+cnt2q], m6
sub cnt2d, 16
add cnt1q, 16
jl .inner_loop
; Next block: advance lfe by one sample and the output by 128 floats, reset the counters.
add lfeq, sizeof_float
add samplesq, 128*sizeof_float
mov cnt1q, -64*sizeof_float
mov cnt2d, 64*sizeof_float-16
sub nblocksd, 1
jg .loop
RET
%endmacro
; Instantiate lfe_fir1_float. SSE3 is the baseline because the macro uses haddps.
INIT_XMM sse3
LFE_FIR1_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR1_FLOAT
%endif

52
externals/ffmpeg/libavcodec/x86/dcadsp_init.c vendored Executable file
View File

@@ -0,0 +1,52 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
/* Declares the fir0 and fir1 assembly prototypes for one instruction set suffix. */
#define LFE_FIR_FLOAT_FUNC(opt) \
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks); \
void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks);
LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)

/**
 * Install the best available x86 LFE FIR routines into the DCA DSP context.
 *
 * Slots for which no suitable instruction set is available are left untouched.
 */
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
    const int flags = av_get_cpu_flags();

    /* lfe_fir_float[0]: FMA3 > AVX > SSE2 > SSE (the SSE version is x86-32 only). */
    if (EXTERNAL_FMA3(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
    } else if (EXTERNAL_AVX(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
    } else if (EXTERNAL_SSE2(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
    } else if (ARCH_X86_32 && EXTERNAL_SSE(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
    }

    /* lfe_fir_float[1]: AVX > SSE3. */
    if (EXTERNAL_AVX(flags)) {
        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
    } else if (EXTERNAL_SSE3(flags)) {
        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
    }
}

491
externals/ffmpeg/libavcodec/x86/dct32.asm vendored Executable file
View File

@@ -0,0 +1,491 @@
;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; Read-only constants; the section is requested with 32-byte alignment.
SECTION_RODATA 32
; XOR mask: sign bit set in elements 2 and 3 of every group of four floats.
; It is passed to BUTTERFLY2/BUTTERFLY3 (directly or after a shuffle) and
; applied with xorps to negate the selected lanes.
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
; Multiplier table for the butterfly passes. One row is 16 bytes; the code
; addresses it by byte offset: +0/+16/+32/+48 in pass 1, +64/+80 in pass 2,
; +96 in pass 3, +128 in pass 4, +160 in pass 5 and +192 in BUTTERFLY3V.
; The rows at +96, +128 and +160 are each stored twice in a row, so a 32-byte
; load from those offsets yields the same four values in both halves.
; NOTE(review): the values look like 0.5/cos(...) DCT factors - confirm
; against the C reference implementation.
ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
dd 0.553104, 0.582935, 0.622504, 0.674808
dd -10.190008, -3.407609, -2.057781, -1.484165
dd -1.169440, -0.972568, -0.839350, -0.744536
dd 0.502419, 0.522499, 0.566944, 0.646822
dd 0.788155, 1.060678, 1.722447, 5.101149
dd 0.509796, 0.601345, 0.899976, 2.562916
dd 0.509796, 0.601345, 0.899976, 2.562916
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 0.707107, 1.000000, -0.707107
dd 1.000000, 0.707107, 1.000000, -0.707107
dd 0.707107, 0.707107, 0.707107, 0.707107
; BUTTERFLY a, b, c, tmp
;   tmp = a - b
;   b   = b + a
;   a   = tmp * c
; %1/%2/%4 are registers, %3 is the multiplier (register or memory).
; Written in 3-operand form.
%macro BUTTERFLY 4
subps %4, %1, %2
addps %2, %2, %1
mulps %1, %4, %3
%endmacro
; BUTTERFLY0 a, mask, c, tmp, imm
;   a = ((a XOR mask) + shuffle(a, imm)) * c
; Both branches leave the same value in %1 (a); they differ only in what is
; left in the temporary %4:
;   - SSE2 without AVX: pshufd writes the shuffled copy to %4 and the sum is
;     accumulated in %1, so %4 ends up holding the shuffled input.
;   - otherwise (plain SSE or AVX): the sum is accumulated in %4.
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
pshufd %4, %1, %5
xorps %1, %2
addps %1, %4
mulps %1, %3
%else
shufps %4, %1, %1, %5
xorps %1, %1, %2
addps %4, %4, %1
mulps %1, %4, %3
%endif
%endmacro
; BUTTERFLY2 a, mask, c, tmp: BUTTERFLY0 with shuffle 0x1b, which reverses
; the order of the four elements (3,2,1,0).
%macro BUTTERFLY2 4
BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro
; BUTTERFLY3 a, mask, c, tmp: BUTTERFLY0 with shuffle 0xb1, which swaps the
; two elements of each adjacent pair (1,0,3,2).
%macro BUTTERFLY3 4
BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro
; BUTTERFLY3V a, b, c, d, tmp   (arguments are register NUMBERS, not names)
; With k = the four floats at ps_cos_vec+192, after the macro:
;   m<a> = a + b
;   m<b> = (a - b) * k   (computed in the tmp register, then renamed via SWAP)
;   m<c> = c + d
;   m<d> = (d - c) * k
; The register originally named m<b> becomes the temporary after the SWAP.
%macro BUTTERFLY3V 5
movaps m%5, m%1
addps m%1, m%2
subps m%5, m%2
SWAP %2, %5
mulps m%2, [ps_cos_vec+192]
movaps m%5, m%3
addps m%3, m%4
subps m%4, m%5
mulps m%4, [ps_cos_vec+192]
%endmacro
; PASS6_AND_PERMUTE
; Scalar last pass: works on single floats (movss/addss) read from and
; written back to the 128-byte buffer at outq, using byte offsets 4..124.
; tmpd is used as an integer scratch register to copy 32-bit words between
; output slots.
; Live-in registers: m0, m1, m4, m5 and m6 are read (addss) before anything
; is loaded into them, so they must hold the values left by the caller.
; m2, m3 and m7 are loaded from memory before their first use.
; The order of loads and stores matters: many slots are read after other
; slots have already been overwritten.
%macro PASS6_AND_PERMUTE 0
mov tmpd, [outq+4]
movss m7, [outq+72]
addss m7, [outq+76]
movss m3, [outq+56]
addss m3, [outq+60]
addss m4, m3
movss m2, [outq+52]
addss m2, m3
movss m3, [outq+104]
addss m3, [outq+108]
addss m1, m3
addss m5, m4
movss [outq+ 16], m1
movss m1, [outq+100]
addss m1, m3
movss m3, [outq+40]
movss [outq+ 48], m1
addss m3, [outq+44]
movss m1, [outq+100]
addss m4, m3
addss m3, m2
addss m1, [outq+108]
movss [outq+ 40], m3
addss m2, [outq+36]
movss m3, [outq+8]
movss [outq+ 56], m2
addss m3, [outq+12]
movss [outq+ 32], m3
movss m3, [outq+80]
movss [outq+ 8], m5
movss [outq+ 80], m1
movss m2, [outq+52]
movss m5, [outq+120]
addss m5, [outq+124]
movss m1, [outq+64]
addss m2, [outq+60]
addss m0, m5
addss m5, [outq+116]
mov [outq+64], tmpd
addss m6, m0
addss m1, m6
mov tmpd, [outq+12]
mov [outq+ 96], tmpd
movss [outq+ 4], m1
movss m1, [outq+24]
movss [outq+ 24], m4
movss m4, [outq+88]
addss m4, [outq+92]
addss m3, m4
addss m4, [outq+84]
mov tmpd, [outq+108]
addss m1, [outq+28]
addss m0, m1
addss m1, m5
addss m6, m3
addss m3, m0
addss m0, m7
addss m5, [outq+20]
addss m7, m1
movss [outq+ 12], m6
mov [outq+112], tmpd
movss m6, [outq+28]
movss [outq+ 28], m0
movss m0, [outq+36]
movss [outq+ 36], m7
addss m1, m4
movss m7, [outq+116]
addss m0, m2
addss m7, [outq+124]
movss [outq+ 72], m0
movss m0, [outq+44]
addss m2, m0
movss [outq+ 44], m1
movss [outq+ 88], m2
addss m0, [outq+60]
mov tmpd, [outq+60]
mov [outq+120], tmpd
movss [outq+104], m0
addss m4, m5
addss m5, [outq+68]
movss [outq+52], m4
movss [outq+60], m5
movss m4, [outq+68]
movss m5, [outq+20]
movss [outq+ 20], m3
addss m5, m7
addss m7, m6
addss m4, m5
movss m2, [outq+84]
addss m2, [outq+92]
addss m5, m2
movss [outq+ 68], m4
addss m2, m7
movss m4, [outq+76]
movss [outq+ 84], m2
movss [outq+ 76], m5
addss m7, m4
addss m6, [outq+124]
addss m4, m6
addss m6, [outq+92]
movss [outq+100], m4
movss [outq+108], m6
movss m6, [outq+92]
movss [outq+92], m7
addss m6, [outq+124]
movss [outq+116], m6
%endmacro
; AVX version: passes 1-5 run on 256-bit registers (8 floats each), the last
; pass is the shared scalar macro PASS6_AND_PERMUTE. Only assembled when
; HAVE_AVX_EXTERNAL is set.
; in  is read with aligned 32-byte and 16-byte loads, out is written with
; aligned stores, so both must be suitably aligned.
INIT_YMM avx
SECTION .text
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
; pass 1
vmovaps m4, [inq+0]
; Both 128-bit halves of m5 are replaced by the two inserts, so its previous
; contents do not matter. Together with the in-lane reversal (0x1b) this
; loads the last 8 input floats in reverse order.
vinsertf128 m5, m5, [inq+96], 1
vinsertf128 m5, m5, [inq+112], 0
vshufps m5, m5, m5, 0x1b
BUTTERFLY m4, m5, [ps_cos_vec], m6
vmovaps m2, [inq+64]
; Same trick for the floats at in+32..in+63.
vinsertf128 m6, m6, [inq+32], 1
vinsertf128 m6, m6, [inq+48], 0
vshufps m6, m6, m6, 0x1b
BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; pass 2
BUTTERFLY m5, m6, [ps_cos_vec+64], m0
BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; pass 3
vperm2f128 m3, m6, m4, 0x31
vperm2f128 m1, m6, m4, 0x20
vshufps m3, m3, m3, 0x1b
BUTTERFLY m1, m3, [ps_cos_vec+96], m6
vperm2f128 m4, m5, m2, 0x20
vperm2f128 m5, m5, m2, 0x31
vshufps m5, m5, m5, 0x1b
BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; pass 4
; m6 = sign mask, m2 = multipliers for this pass
vmovaps m6, [ps_p1p1m1m1+0]
vmovaps m2, [ps_cos_vec+128]
BUTTERFLY2 m5, m6, m2, m7
BUTTERFLY2 m4, m6, m2, m7
BUTTERFLY2 m1, m6, m2, m7
BUTTERFLY2 m3, m6, m2, m7
; pass 5
; rearrange the sign mask so that elements 1 and 3 of each lane are negated
vshufps m6, m6, m6, 0xcc
vmovaps m2, [ps_cos_vec+160]
BUTTERFLY3 m5, m6, m2, m7
BUTTERFLY3 m4, m6, m2, m7
BUTTERFLY3 m1, m6, m2, m7
BUTTERFLY3 m3, m6, m2, m7
; Store the vectors to out and copy the high lanes of m3 and m1 into m6 and
; m0: PASS6_AND_PERMUTE reads m0, m1, m4, m5 and m6 as 128-bit registers.
vperm2f128 m6, m3, m3, 0x31
vmovaps [outq], m3
vextractf128 [outq+64], m5, 1
vextractf128 [outq+32], m5, 0
vextractf128 [outq+80], m4, 1
vextractf128 [outq+48], m4, 0
vperm2f128 m0, m1, m1, 0x31
vmovaps [outq+96], m1
; clear the upper YMM halves before switching to non-VEX XMM code
vzeroupper
; pass 6, no SIMD...
; INIT_XMM makes m0..m7 refer to the XMM registers for the scalar pass
INIT_XMM
PASS6_AND_PERMUTE
RET
%endif
%if ARCH_X86_64
; x86-64 build: DCT32_FUNC requests 16 XMM registers, so SPILL/UNSPILL are
; plain register renames (SWAP) and generate no memory traffic.
%define SPILL SWAP
%define UNSPILL SWAP
; PASS5 (x86-64): renames the registers holding the pass-4 results into
; m8..m15, transposes them in two 4x4 groups and applies BUTTERFLY3V to each
; group, followed by a few extra vector additions.
; The order-dependent SWAP/PERMUTE renames must not be rearranged.
%macro PASS5 0
nop ; FIXME code alignment
SWAP 5, 8
SWAP 4, 12
SWAP 6, 14
SWAP 7, 13
SWAP 0, 15
PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
TRANSPOSE4x4PS 8, 9, 10, 11, 0
BUTTERFLY3V 8, 9, 10, 11, 0
addps m10, m11
TRANSPOSE4x4PS 12, 13, 14, 15, 0
BUTTERFLY3V 12, 13, 14, 15, 0
addps m14, m15
addps m12, m14
addps m14, m13
addps m13, m15
%endmacro
; PASS6 (x86-64): final pass working from the vectors in m8..m15.
;  1. Element 0 of m8..m14 is stored at out+0x00, +0x10, ..., +0x60; all of
;     m15 is stored (16 bytes) at out+0x70. pshuflw ..., 0xe brings element 1
;     of each vector into the low position of m0..m7.
;  2. Sums of neighbouring m0..m7 values are stored at out+0x08, +0x18, ...,
;     +0x78.
;  3. After renaming registers, the remaining elements are extracted with
;     movhlps/pshufd and combined; the final %rep 15 loop stores 15 scalar
;     sums at out+4, out+12, ..., out+116 (i starts at 4 and advances by 8).
; The SWAP/PERMUTE lines only rename registers at assembly time.
%macro PASS6 0
SWAP 9, 12
SWAP 11, 14
movss [outq+0x00], m8
pshuflw m0, m8, 0xe
movss [outq+0x10], m9
pshuflw m1, m9, 0xe
movss [outq+0x20], m10
pshuflw m2, m10, 0xe
movss [outq+0x30], m11
pshuflw m3, m11, 0xe
movss [outq+0x40], m12
pshuflw m4, m12, 0xe
movss [outq+0x50], m13
pshuflw m5, m13, 0xe
movss [outq+0x60], m14
pshuflw m6, m14, 0xe
movaps [outq+0x70], m15
pshuflw m7, m15, 0xe
addss m0, m1
addss m1, m2
movss [outq+0x08], m0
addss m2, m3
movss [outq+0x18], m1
addss m3, m4
movss [outq+0x28], m2
addss m4, m5
movss [outq+0x38], m3
addss m5, m6
movss [outq+0x48], m4
addss m6, m7
movss [outq+0x58], m5
movss [outq+0x68], m6
movss [outq+0x78], m7
PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
movhlps m0, m1
pshufd m1, m1, 3
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
movhlps m0, m1
pshufd m1, m1, 3
addss m15, m1
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
addss m0, m1
movss [outq+i], m0
SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%assign i i+8
%endrep
%endmacro
%else ; ARCH_X86_32
; x86-32 build: intermediate vectors that do not fit in registers are parked
; in the output buffer. "Slot" numbers 8..15 map to out + (slot-8)*16, i.e.
; the eight 16-byte rows of the 128-byte output.
; SPILL reg_number, slot: store m<reg> to the slot (aligned store).
%macro SPILL 2 ; xmm#, mempos
movaps [outq+(%2-8)*16], m%1
%endmacro
; UNSPILL reg_number, slot: reload m<reg> from the slot (aligned load).
%macro UNSPILL 2
movaps m%1, [outq+(%2-8)*16]
%endmacro
; On x86-32 the last pass is the shared scalar macro.
%define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86-32): applies BUTTERFLY3 to eight vectors using the multipliers
; at ps_cos_vec+160 (m2) and the sign mask in m3. On entry m3 still holds
; the pass-4 mask loaded by DCT32_FUNC; the shufps 0xcc rearranges it so
; that elements 1 and 3 carry the sign bit.
; Every result is written to its output-buffer slot with SPILL; vectors that
; were parked earlier are fetched with UNSPILL first. m0 and m6 are consumed
; last and come straight from pass 4.
%macro PASS5 0
movaps m2, [ps_cos_vec+160]
shufps m3, m3, 0xcc
BUTTERFLY3 m5, m3, m2, m1
SPILL 5, 8
UNSPILL 1, 9
BUTTERFLY3 m1, m3, m2, m5
SPILL 1, 14
BUTTERFLY3 m4, m3, m2, m5
SPILL 4, 12
BUTTERFLY3 m7, m3, m2, m5
SPILL 7, 13
UNSPILL 5, 10
BUTTERFLY3 m5, m3, m2, m7
SPILL 5, 10
UNSPILL 4, 11
BUTTERFLY3 m4, m3, m2, m7
SPILL 4, 11
BUTTERFLY3 m6, m3, m2, m7
SPILL 6, 9
BUTTERFLY3 m0, m3, m2, m7
SPILL 0, 15
%endmacro
%endif
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
; DCT32_FUNC emits one dct32_float function for the instruction set selected
; by the preceding INIT_XMM. It reads the 32 input floats with aligned
; 16-byte loads (in+0 .. in+112) and produces its result in the 128-byte
; buffer at out.
; Passes 1-4 are interleaved below to limit register pressure; SPILL/UNSPILL
; park vectors in "slots" 8..15 (registers on x86-64, the output buffer on
; x86-32). Passes 5 and 6 come from the architecture-specific PASS5/PASS6
; macros. m2 holds the current multipliers, m3 is the BUTTERFLY temporary
; during passes 1-3 and the sign mask from pass 4 on.
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
; pass 1
movaps m0, [inq+0]
LOAD_INV m1, [inq+112]
BUTTERFLY m0, m1, [ps_cos_vec], m3
movaps m7, [inq+64]
LOAD_INV m4, [inq+48]
BUTTERFLY m7, m4, [ps_cos_vec+32], m3
; pass 2
movaps m2, [ps_cos_vec+64]
BUTTERFLY m1, m4, m2, m3
SPILL 1, 11
SPILL 4, 8
; pass 1
movaps m1, [inq+16]
LOAD_INV m6, [inq+96]
BUTTERFLY m1, m6, [ps_cos_vec+16], m3
movaps m4, [inq+80]
LOAD_INV m5, [inq+32]
BUTTERFLY m4, m5, [ps_cos_vec+48], m3
; pass 2
BUTTERFLY m0, m7, m2, m3
movaps m2, [ps_cos_vec+80]
BUTTERFLY m6, m5, m2, m3
BUTTERFLY m1, m4, m2, m3
; pass 3
; shufps ..., 0x1b reverses the element order of one operand before each
; butterfly
movaps m2, [ps_cos_vec+96]
shufps m1, m1, 0x1b
BUTTERFLY m0, m1, m2, m3
SPILL 0, 15
SPILL 1, 14
UNSPILL 0, 8
shufps m5, m5, 0x1b
BUTTERFLY m0, m5, m2, m3
UNSPILL 1, 11
shufps m6, m6, 0x1b
BUTTERFLY m1, m6, m2, m3
SPILL 1, 11
shufps m4, m4, 0x1b
BUTTERFLY m7, m4, m2, m3
; pass 4
; m3 = sign mask, m2 = multipliers, m1 = temporary
movaps m3, [ps_p1p1m1m1+0]
movaps m2, [ps_cos_vec+128]
BUTTERFLY2 m5, m3, m2, m1
BUTTERFLY2 m0, m3, m2, m1
SPILL 0, 9
BUTTERFLY2 m6, m3, m2, m1
SPILL 6, 10
UNSPILL 0, 11
BUTTERFLY2 m0, m3, m2, m1
SPILL 0, 11
BUTTERFLY2 m4, m3, m2, m1
BUTTERFLY2 m7, m3, m2, m1
UNSPILL 6, 14
BUTTERFLY2 m6, m3, m2, m1
UNSPILL 0, 15
BUTTERFLY2 m0, m3, m2, m1
PASS5
PASS6
RET
%endmacro
; LOAD_INV dst, src: load four floats from src into dst in reversed element
; order (shuffle 0x1b). With SSE2 this is a single pshufd; with plain SSE it
; is an aligned load followed by an in-place shufps.
%macro LOAD_INV 2
%if cpuflag(sse2)
pshufd %1, %2, 0x1b
%elif cpuflag(sse)
movaps %1, %2
shufps %1, %1, 0x1b
%endif
%endmacro
; Instantiate the function: the plain SSE version is only built for 32-bit
; targets, the SSE2 version is always built.
%if ARCH_X86_32
INIT_XMM sse
DCT32_FUNC
%endif
INIT_XMM sse2
DCT32_FUNC

41
externals/ffmpeg/libavcodec/x86/dct_init.c vendored Executable file
View File

@@ -0,0 +1,41 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
/* Prototypes of the 32-point DCT routines; their definitions are not in
 * this file. Each takes an output and an input sample array. */
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
/**
 * Pick the x86 SIMD implementation of the 32-point DCT for this CPU.
 *
 * The checks run from the oldest to the newest instruction set and each
 * match overwrites s->dct32, so the most capable supported version wins.
 * The SSE variant is only considered on 32-bit builds. If nothing matches,
 * s->dct32 is left untouched.
 */
av_cold void ff_dct_init_x86(DCTContext *s)
{
    const int flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_SSE(flags)) {
        s->dct32 = ff_dct32_float_sse;
    }
#endif
    if (EXTERNAL_SSE2(flags)) {
        s->dct32 = ff_dct32_float_sse2;
    }
    if (EXTERNAL_AVX_FAST(flags)) {
        s->dct32 = ff_dct32_float_avx;
    }
}

307
externals/ffmpeg/libavcodec/x86/dirac_dwt.asm vendored Executable file
View File

@@ -0,0 +1,307 @@
;******************************************************************************
;* x86 optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Four (9, -1) word pairs. Used as the pmaddwd multiplier so that each pair
; of interleaved 16-bit inputs (a, b) yields the 32-bit value 9*a - b.
pw_1991: times 4 dw 9,-1
; Word constants defined elsewhere.
; NOTE(review): presumably each is a vector of words equal to the number in
; its name - confirm against the defining file.
cextern pw_1
cextern pw_2
cextern pw_8
cextern pw_16
SECTION .text
; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
; Operates on packed 16-bit words with an arithmetic shift.
; %2 is used as the accumulator and is clobbered; %3 and %4 are only read.
%macro COMPOSE_53iL0 4
paddw %2, %3
paddw %2, %4
psraw %2, 2
psubw %1, %2
%endm
; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered m3: pw_8 m4: pw_1991
; m0 is clobbered as well (it accumulates m0 + %3 - m3).
; The sums are interleaved word by word and multiplied with the (9, -1)
; pairs in m4 by pmaddwd, so the weighted sum is formed in 32 bits before
; the shift; packssdw then narrows back to 16 bits with signed saturation.
; When %4 is given, %1 must be a register: it is overwritten by the load.
%macro COMPOSE_DD97iH0 3-4
paddw m0, %3
paddw m1, %2
psubw m0, m3
mova m2, m1
punpcklwd m1, m0
punpckhwd m2, m0
pmaddwd m1, m4
pmaddwd m2, m4
%if %0 > 3
movu %1, %4
%endif
psrad m1, 4
psrad m2, 4
packssdw m1, m2
paddw m1, %1
%endm
; COMPOSE_VERTICAL suffix
; Emits the five vertical compose functions below with the given name
; suffix, for the register size selected by the preceding INIT_MMX/INIT_XMM.
; All of them share the same loop shape: width is decremented by mmsize/2
; elements first, one vector is processed at that index, and the loop
; repeats while the index is still positive. The body therefore always runs
; at least once, so callers must pass a positive multiple of mmsize/2.
; All loads and stores are aligned (mova).
%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; int width)
; b1[i] -= (b0[i] + b2[i] + 2) >> 2
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
mova m2, [pw_2]
%if ARCH_X86_64
; zero-extend the 32-bit width so that widthq can be used for addressing
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m1, [b0q+2*widthq]
mova m0, [b1q+2*widthq]
COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
mova [b1q+2*widthq], m0
; flags are still those of the sub above: the vector ops do not touch them
jg .loop
REP_RET
; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; int width)
; b1[i] += (b0[i] + b2[i] + 1) >> 1
; The loop body runs before the exit test, so width must be a positive
; multiple of mmsize/2.
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
mova m1, [pw_1]
%if ARCH_X86_64
; zero-extend the 32-bit width so that widthq can be used for addressing
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
paddw m0, [b2q+2*widthq]
paddw m0, m1
psraw m0, 1
paddw m0, [b1q+2*widthq]
mova [b1q+2*widthq], m0
; flags are still those of the sub above
jg .loop
REP_RET
; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] += (9*(b1[i] + b3[i]) - (b0[i] + b4[i]) + 8) >> 4
; m3 = pw_8 and m4 = pw_1991 as required by COMPOSE_DD97iH0.
; The loop body runs before the exit test, so width must be a positive
; multiple of mmsize/2.
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
mova m3, [pw_8]
mova m4, [pw_1991]
%if ARCH_X86_64
; zero-extend the 32-bit width so that widthq can be used for addressing
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
mova m1, [b1q+2*widthq]
COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
mova [b2q+2*widthq], m1
; flags are still those of the sub above
jg .loop
REP_RET
; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] -= (9*(b1[i] + b3[i]) - (b0[i] + b4[i]) + 16) >> 5
; Same structure as COMPOSE_DD97iH0, written out inline with pw_16 as the
; rounding constant, a shift by 5 and a final subtraction.
; The loop body runs before the exit test, so width must be a positive
; multiple of mmsize/2.
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
mova m3, [pw_16]
mova m4, [pw_1991]
%if ARCH_X86_64
; zero-extend the 32-bit width so that widthq can be used for addressing
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
mova m1, [b1q+2*widthq]
mova m5, [b2q+2*widthq]
paddw m0, [b4q+2*widthq]
paddw m1, [b3q+2*widthq]
; fold the rounding term into the operand that gets multiplied by -1
psubw m0, m3
mova m2, m1
punpcklwd m1, m0
punpckhwd m2, m0
; 32-bit 9*(b1+b3) - (b0+b4-16) for the low and high halves
pmaddwd m1, m4
pmaddwd m2, m4
psrad m1, 5
psrad m2, 5
packssdw m1, m2
psubw m5, m1
mova [b2q+2*widthq], m5
; flags are still those of the sub above
jg .loop
REP_RET
; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
; b0[i] -= (b1[i] + 1) >> 1
; b1[i] += b0[i]            (using the updated b0[i])
; The loop body runs before the exit test, so width must be a positive
; multiple of mmsize/2.
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
mova m3, [pw_1]
%if ARCH_X86_64
; zero-extend the 32-bit width so that widthq can be used for addressing
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m1, [b1q+2*widthq]
mova m0, [b0q+2*widthq]
; keep the original b1 in m2 for the second update
mova m2, m1
paddw m1, m3
psraw m1, 1
psubw m0, m1
mova [b0q+2*widthq], m0
paddw m2, m0
mova [b1q+2*widthq], m2
; flags are still those of the sub above
jg .loop
REP_RET
%endmacro
; extend the left and right edges of the tmp array by %1 and %2 respectively
; %3 is a 16-bit scratch register.
; Left:  tmp[0] is copied to tmp[-1] .. tmp[-%1].
; Right: tmp[w2-1] is copied to tmp[w2] .. tmp[w2+%2-1].
; Uses tmpq and w2q from the enclosing function; w2q is the element count.
%macro EDGE_EXTENSION 3
mov %3, [tmpq]
%assign %%i 1
%rep %1
mov [tmpq-2*%%i], %3
%assign %%i %%i+1
%endrep
mov %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
mov [tmpq+2*w2q+2*%%i], %3
%assign %%i %%i+1
%endrep
%endmacro
; HAAR_HORIZONTAL suffix, shift
; Emits horizontal_compose_haar<shift>i_<suffix>:
;   void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
;
; With w2 = width >> 1 and the high band starting at b + width bytes:
;   low-pass : tmp[x] = b[x] - ((b[x + w2] + 1) >> 1)
;   high-pass: b[2x]   = tmp[x]
;              b[2x+1] = b[x + w2] + tmp[x]
;              with shift == 1 both values become (v + 1) >> 1
;
; The low-pass loop runs in whole vectors until x >= w2, so it may compute a
; few tmp[] entries past w2. The high-pass loop only covers w2 rounded down
; to a multiple of mmsize/2; the C wrapper finishes the remaining columns.
; b and tmp are accessed with aligned loads/stores, the high band with movu.
%macro HAAR_HORIZONTAL 2
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    ; w is a 32-bit int: on x86-64 the upper half of its register is not
    ; guaranteed to be zero, so never use wq for addressing. The 32-bit mov
    ; zero-extends into w2q, which is then used instead.
    mov     w2d, wd
    xor     xq, xq
    lea     b_w2q, [bq+w2q]         ; b + width bytes = start of high band
    shr     w2d, 1                  ; w2 = width / 2 (elements)
    mova    m3, [pw_1]

.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [tmpq + 2*xq], m0
    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .lowpass_loop

    xor     xq, xq
    and     w2q, ~(mmsize/2 - 1)    ; whole vectors only
    cmp     w2q, mmsize/2
    jl      .end

.highpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [tmpq + 2*xq]
    paddw   m1, m0
    ; shift and interleave
%if %2 == 1
    paddw   m0, m3
    paddw   m1, m3
    psraw   m0, 1
    psraw   m1, 1
%endif
    mova    m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m0
    mova    [bq+4*xq+mmsize], m2
    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .highpass_loop
.end:
    REP_RET
%endmacro
INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
;
; With w2 = width >> 1 and the high band hb = b + width bytes:
;   low-pass : tmp[x] = b[x] - ((hb[x-1] + hb[x] + 2) >> 2), hb[-1] := hb[0]
;   high-pass: v      = hb[x] + ((9*(tmp[x] + tmp[x+1])
;                                 - (tmp[x-1] + tmp[x+2]) + 8) >> 4)
;              b[2x]   = (tmp[x] + 1) >> 1
;              b[2x+1] = (v + 1) >> 1
; tmp is extended by one element on the left and two on the right between
; the two loops. The high-pass loop only covers w2 rounded down to a
; multiple of mmsize/2; the C wrapper finishes the remaining columns.
; Uses palignr, hence the explicit _ssse3 suffix.
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    ; w is a 32-bit int: on x86-64 the upper half of its register is not
    ; guaranteed to be zero, so never use wq for addressing. The 32-bit mov
    ; zero-extends into w2q, which is then used instead.
    mov     w2d, wd
    xor     xd, xd
    lea     b_w2q, [bq+w2q]         ; b + width bytes = start of high band
    shr     w2d, 1                  ; w2 = width / 2 (elements)
    movu    m4, [b_w2q]
    mova    m7, [pw_2]
    pslldq  m4, 14                  ; hb[0] in the top word: acts as hb[-1]

.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    mova    m2, m1
    palignr m1, m4, 14              ; m1 = hb[x-1 ..]
    mova    m4, m2                  ; remember this block for the next round
    COMPOSE_53iL0 m0, m1, m2, m7
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor     xd, xd
    and     w2d, ~(mmsize/2 - 1)
    cmp     w2d, mmsize/2
    jl      .end

    mova    m7, [tmpq-mmsize]       ; previous block; only tmp[-1] is used
    mova    m0, [tmpq]
    mova    m5, [pw_1]
    mova    m3, [pw_8]
    mova    m4, [pw_1991]

.highpass_loop:
    mova    m6, m0                  ; m6 = tmp[x ..]
    palignr m0, m7, 14              ; m0 = tmp[x-1 ..]
    mova    m7, [tmpq + 2*xq + 16]  ; next block, tmp[x+8 ..]
    mova    m1, m7
    mova    m2, m7
    palignr m1, m6, 2               ; m1 = tmp[x+1 ..]
    palignr m2, m6, 4               ; m2 = tmp[x+2 ..]
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova    m0, m7                  ; next block becomes current
    mova    m7, m6                  ; current block becomes previous
    ; shift and interleave
    paddw   m6, m5
    paddw   m1, m5
    psraw   m6, 1
    psraw   m1, 1
    mova    m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m6
    mova    [bq+4*xq+mmsize], m2
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    REP_RET
; Instantiate the macro-generated functions. The MMX versions are only built
; for 32-bit targets; the SSE2 versions are always built.
; The second HAAR_HORIZONTAL argument selects the haar0i / haar1i variant.
%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif
;;INIT_XMM
INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1

View File

@@ -0,0 +1,229 @@
/*
* x86 optimized discrete wavelet transform
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dirac_dwt.h"
/*
 * COMPOSE_VERTICAL(ext, align)
 *
 * Declares the assembly routines with suffix `ext` and defines C wrappers
 * around them. `align` is the number of int16_t elements one assembly
 * iteration processes (a power of two).
 *
 * Vertical wrappers: the columns from (width rounded down to a multiple of
 * align) up to width are processed in C, the aligned part is handed to the
 * assembly. The assembly loops execute their body once before testing the
 * counter, so they must not be called with a width of 0; the wrappers skip
 * the call in that case.
 *
 * Horizontal wrappers: the assembly is always called first because it fills
 * tmp[]; the C loop then finishes the columns the assembly leaves out.
 */
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
\
static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
    /* the asm loop always runs once: do not call it for an empty span */ \
    if (width_align > 0) \
        ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
    /* the asm loop always runs once: do not call it for an empty span */ \
    if (width_align > 0) \
        ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                           uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    /* the asm loop always runs once: do not call it for an empty span */ \
    if (width_align > 0) \
        ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                          uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    /* the asm loop always runs once: do not call it for an empty span */ \
    if (width_align > 0) \
        ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
\
    for(i=width_align; i<width; i++) { \
        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
    } \
\
    /* the asm loop always runs once: do not call it for an empty span */ \
    if (width_align > 0) \
        ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
    int w2= w>>1;\
    int x= w2 - (w2&(align-1));\
    int16_t *b = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    /* fills tmp[] and interleaves the first x output pairs */ \
    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
    for (; x < w2; x++) {\
        b[2*x ] = tmp[x];\
        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
    }\
}\
static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
    int w2= w>>1;\
    int x= w2 - (w2&(align-1));\
    int16_t *b = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    /* fills tmp[] and interleaves the first x output pairs */ \
    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
    for (; x < w2; x++) {\
        b[2*x ] = (tmp[x] + 1)>>1;\
        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
    }\
}
#if HAVE_X86ASM
/* Wrapper sets: MMX works on 4 elements per step (32-bit builds only),
 * SSE2 on 8. */
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)

void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);

/**
 * Horizontal DD9/7 compose: the assembly routine is called first, then the
 * output pairs from (w/2 rounded down to a multiple of 8) up to w/2 are
 * produced in C from tmp[] and the high band of b[].
 */
static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
{
    int16_t *b   = (int16_t *)_b;
    int16_t *tmp = (int16_t *)_tmp;
    int half     = w >> 1;
    int x;

    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);

    x = half - (half & 7);
    while (x < half) {
        b[2*x    ] = (tmp[x] + 1)>>1;
        b[2*x + 1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+half], tmp[x+1], tmp[x+2]) + 1)>>1;
        x++;
    }
}
#endif
/**
 * Install the x86 implementations of the inverse wavelet compose functions
 * for the given wavelet type.
 *
 * Selection is layered: MMX (32-bit builds only), then SSE2, then SSSE3.
 * Each layer returns early when the CPU lacks the feature, leaving whatever
 * the previous layers installed. Wavelet types not listed in a layer are
 * left untouched by it.
 */
void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
    const int cpu = av_get_cpu_flags();

#if !ARCH_X86_64
    if (!(cpu & AV_CPU_FLAG_MMX)) {
        return;
    }

    switch (type) {
    case DWT_DIRAC_HAAR0:
        d->horizontal_compose  = horizontal_compose_haar0i_mmx;
        d->vertical_compose    = (void*)vertical_compose_haar_mmx;
        break;
    case DWT_DIRAC_HAAR1:
        d->horizontal_compose  = horizontal_compose_haar1i_mmx;
        d->vertical_compose    = (void*)vertical_compose_haar_mmx;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        break;
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
        break;
    }
#endif

    if (!(cpu & AV_CPU_FLAG_SSE2)) {
        return;
    }

    switch (type) {
    case DWT_DIRAC_HAAR0:
        d->horizontal_compose  = horizontal_compose_haar0i_sse2;
        d->vertical_compose    = (void*)vertical_compose_haar_sse2;
        break;
    case DWT_DIRAC_HAAR1:
        d->horizontal_compose  = horizontal_compose_haar1i_sse2;
        d->vertical_compose    = (void*)vertical_compose_haar_sse2;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        break;
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
        break;
    }

    /* SSSE3 only provides a horizontal routine for the DD9/7 wavelet. */
    if (!(cpu & AV_CPU_FLAG_SSSE3)) {
        return;
    }
    if (type == DWT_DIRAC_DD9_7) {
        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
    }
#endif // HAVE_X86ASM
}

348
externals/ffmpeg/libavcodec/x86/diracdsp.asm vendored Executable file
View File

@@ -0,0 +1,348 @@
;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; eight words of 7: filter tap multiplier in HPEL_FILTER
pw_7: times 8 dw 7
; four dwords of 0x200: bias added to the signed 32-bit samples in
; put_signed_rect_clamped_10 before packing
convert_to_unsigned_10bit: times 4 dd 0x200
; eight words of 0x3ff: upper clamp bound in put_signed_rect_clamped_10
clip_10bit: times 8 dw 0x3ff
; Constants defined elsewhere.
; NOTE(review): presumably vectors of words (pw_*) / bytes (pb_*) equal to
; the value in the name - confirm against the defining file.
cextern pw_3
cextern pw_16
cextern pw_32
cextern pb_80
SECTION .text
; UNPACK_ADD lo, hi, srcA, srcB, movA, movB
; Loads one vector of bytes from srcA and one from srcB, widens both to
; 16-bit words by interleaving with m7 (the callers set m7 to zero) and adds
; them:
;   %1 = low-half words of srcA + low-half words of srcB
;   %2 = high-half words of srcA + high-half words of srcB
; %5/%6 are the suffixes of the load instructions (a = aligned mova,
; u = unaligned movu). Clobbers m4 and m5.
%macro UNPACK_ADD 6
mov%5 %1, %3
mov%6 m5, %4
mova m4, %1
mova %2, m5
punpcklbw %1, m7
punpcklbw m5, m7
punpckhbw m4, m7
punpckhbw %2, m7
paddw %1, m5
paddw %2, m4
%endmacro
; HPEL_FILTER suffix
; Emits the vertical and horizontal 8-tap half-pel filters. For taps at
; positions -3..4 relative to the current sample, both compute
;   (21*(s[0] + s[1]) + 3*(s[-2] + s[3]) - 7*(s[-1] + s[2])
;    - (s[-3] + s[4]) + 16) >> 5
; on 16-bit words and pack the result to bytes with unsigned saturation.
; m7 is kept at zero for UNPACK_ADD; m4 and m5 are clobbered by it.
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_<suffix>(uint8_t *dst, uint8_t *src, int stride, int width);
; Taps are taken from rows src - 3*stride .. src + 4*stride. All loads and
; the store are aligned. The loop body runs before the exit test, so at
; least one vector is always processed.
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
%if ARCH_X86_64
    ; stride is a 32-bit int: sign-extend it before using strideq in address
    ; arithmetic (the upper register half is not guaranteed to be defined)
    movsxd  strideq, strided
%endif
    mov     src0q, srcq
    lea     stridex3q, [3*strideq]
    sub     src0q, stridex3q        ; src0 = src - 3*stride
    pxor    m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq], m0
    add     dstq, mmsize
    add     srcq, mmsize
    add     src0q, mmsize
    sub     widthd, mmsize
    jg      .loop
    RET

; dirac_hpel_filter_h_<suffix>(uint8_t *dst, uint8_t *src, int width);
; Walks backwards from offset (width-1) rounded down to a multiple of mmsize
; to offset 0. Source loads are unaligned, the store to dst is aligned.
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec     widthd
    pxor    m7, m7
    and     widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq + widthq], m0
    sub     widthd, mmsize
    jge     .loop
    RET
%endmacro
; PUT_RECT suffix
; Emits put_signed_rect_clamped_<suffix>: converts signed 16-bit samples to
; unsigned bytes. Each sample is packed to a signed byte with saturation
; (packsswb) and then offset by the pb_80 constant with a byte add.
; The width is rounded up to a multiple of mmsize and two rows are processed
; per outer iteration (height is decremented by 2), so the buffers must be
; large enough for that. All loads and stores are aligned.
; The strides are added to the pointers as byte offsets.
%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
mova m0, [pb_80]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
%if ARCH_X86_64
; sign-extend the int strides before they are used in 64-bit addressing
movsxd dst_strideq, dst_strided
movsxd src_strideq, src_strided
; height (6th argument) and the rounded width are kept in r7d / r8d
mov r7d, r5m
mov r8d, wd
%define wspill r8d
%define hd r7d
%else
; 32-bit: the rounded width is written back to its argument slot and the
; height is decremented in place in its argument slot
mov r4m, wd
%define wspill r4m
%define hd r5mp
%endif
.loopy:
lea src2q, [srcq+src_strideq]
lea dst2q, [dstq+dst_strideq]
.loopx:
sub wd, mmsize
mova m1, [srcq +2*wq]
mova m2, [src2q+2*wq]
packsswb m1, [srcq +2*wq+mmsize]
packsswb m2, [src2q+2*wq+mmsize]
paddb m1, m0
paddb m2, m0
mova [dstq +wq], m1
mova [dst2q+wq], m2
; flags are still those of "sub wd, mmsize"
jg .loopx
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
sub hd, 2
; mov does not alter the flags set by the sub above
mov wd, wspill
jg .loopy
RET
%endm
; ADD_RECT suffix
; Emits add_rect_clamped_<suffix>. Per sample:
;   dst[x] = clip_uint8(((src[x] + pw_32) >> 6) + idwt[x])
; computed on 16-bit words (arithmetic shift) and packed to bytes with
; unsigned saturation. The width is rounded up to a multiple of mmsize.
; Per row, src advances by 2*stride bytes, dst by stride bytes and idwt by
; 2*idwt_stride bytes.
; The inner (columns) and outer (rows) loops share the label .loop: the
; first jg tests the flags of "sub wd, mmsize", the second those of
; "sub hd, 1"; the instructions in between do not modify the flags.
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
mova m0, [pw_32]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
%if ARCH_X86_64
; sign-extend the int strides before they are used in 64-bit addressing
movsxd strideq, strided
movsxd idwt_strideq, idwt_strided
; keep the rounded width for reloading at the start of every row
mov r8d, wd
%define wspill r8d
%else
; 32-bit: the rounded width is written back to its argument slot
mov r5m, wd
%define wspill r5m
%endif
.loop:
sub wd, mmsize
movu m1, [srcq +2*wq] ; FIXME: ensure alignment
paddw m1, m0
psraw m1, 6
movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
paddw m2, m0
psraw m2, 6
paddw m1, [idwtq+2*wq]
paddw m2, [idwtq+2*wq+mmsize]
packuswb m1, m2
mova [dstq +wq], m1
jg .loop
lea srcq, [srcq + 2*strideq]
add dstq, strideq
lea idwtq, [idwtq+ 2*idwt_strideq]
sub hd, 1
mov wd, wspill
jg .loop
RET
%endm
; ADD_OBMC width, suffix
; Emits add_dirac_obmc<width>_<suffix>:
;   void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
; For each of yblen rows and each of <width> columns:
;   dst[x] += src[x] * obmc_weight[x]
; src and obmc_weight bytes are zero-extended to words (m4 = 0) and
; multiplied with pmullw, which keeps the low 16 bits of each product.
; Per row, src advances by stride bytes, dst by 2*stride bytes and
; obmc_weight by a fixed 32 bytes. src and obmc_weight are read with aligned
; loads, dst is accessed unaligned. The loop body runs before the exit test,
; so yblen must be at least 1.
; NOTE(review): cglobal declares 6 arguments although only 5 are named and
; used - harmless, but worth confirming against the C prototype.
%macro ADD_OBMC 2
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
%if ARCH_X86_64
    ; stride is a 32-bit int: sign-extend it before using strideq in address
    ; arithmetic (the upper register half is not guaranteed to be defined)
    movsxd  strideq, strided
%endif
    pxor    m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova    m0, [srcq+i]
    mova    m1, m0
    punpcklbw m0, m4
    punpckhbw m1, m4
    mova    m2, [obmcq+i]
    mova    m3, m2
    punpcklbw m2, m4
    punpckhbw m3, m4
    pmullw  m0, m2
    pmullw  m1, m3
    movu    m2, [dstq+2*i]
    movu    m3, [dstq+2*i+mmsize]
    paddw   m0, m2
    paddw   m1, m3
    movu    [dstq+2*i], m0
    movu    [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea     srcq, [srcq+strideq]
    lea     dstq, [dstq+2*strideq]
    add     obmcq, 32
    sub     yblend, 1
    jg      .loop
    RET
%endm
; Instantiate the macro-generated functions.
; MMX: everything except the 8-wide OBMC routine is limited to 32-bit
; targets; add_dirac_obmc8_mmx is built on all targets.
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx
HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx
; SSE2 versions are built on all targets.
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
INIT_XMM sse4
; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
; Dequantises 32-bit coefficients, four per iteration:
;   out = sign(c) * ((|c| * qf + qs) >> 2)      (logical shift)
; tot_v rows of tot_h coefficients are processed. src is read contiguously:
; when tot_h is not a multiple of 4 the inner loop overshoots, and src is
; stepped back by the overshoot after each row. dst starts a new row every
; stride bytes. Loads and stores are unaligned.
; Both loop bodies run before their exit tests, so tot_v and tot_h must be
; at least 1.
cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
; broadcast qf and qs to all four lanes
movd m2, qfd
movd m3, qsd
SPLATD m2
SPLATD m3
; qf and qs are now in m2/m3, so their registers (r3, r4) are reused:
; r4 = tot_h (zero-extended), r3 = start of the current dst row
mov r4d, tot_hd
mov r3, dstq
.loop_v:
mov tot_hq, r4
mov dstq, r3
.loop_h:
movu m0, [srcq]
pabsd m1, m0
pmulld m1, m2
paddd m1, m3
psrld m1, 2
; restore the sign of the original coefficient (zero stays zero)
psignd m1, m0
movu [dstq], m1
add srcq, mmsize
add dstq, mmsize
sub tot_hq, 4
jg .loop_h
; tot_hq is now <= 0: undo the overshoot of src (4 bytes per coefficient)
lea srcq, [srcq + 4*tot_hq]
add r3, strideq
dec tot_vd
jg .loop_v
RET
INIT_XMM sse4
; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
;
; Converts signed 32-bit samples to unsigned 16-bit samples: adds the
; convert_to_unsigned_10bit bias (0x200), packs to words with unsigned
; saturation and clamps to [0, clip_10bit] (0x3ff).
; Eight samples (two source vectors, one destination vector) are handled per
; inner iteration, so up to 7 samples past width may be read and written.
; src is addressed with a negative offset that counts up to zero; t1 keeps
; the initial offset and t2 the start of the current dst row. Both strides
; are byte offsets. Loads and stores are unaligned.
%if ARCH_X86_64
cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
    ; the strides are 32-bit ints: sign-extend them before they are added to
    ; 64-bit pointers (the upper register halves are not guaranteed to be
    ; defined)
    movsxd  dst_strideq, dst_strided
    movsxd  src_strideq, src_strided
%else
cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
    ; 32-bit: the height is decremented in place in its argument slot
    %define hd r5mp
%endif
    shl     wd, 2                   ; width in bytes of 32-bit samples
    add     srcq, wq                ; src points to the end of the first row
    neg     wq
    mov     t2q, dstq
    mov     t1q, wq

    pxor    m2, m2
    mova    m3, [clip_10bit]
    mova    m4, [convert_to_unsigned_10bit]

.loop_h:
    mov     dstq, t2q
    mov     wq, t1q

.loop_w:
    movu    m0, [srcq+wq+0*mmsize]
    movu    m1, [srcq+wq+1*mmsize]

    paddd   m0, m4
    paddd   m1, m4
    packusdw m0, m0, m1
    CLIPW   m0, m2, m3 ; packusdw saturates so it's fine

    movu    [dstq], m0

    add     dstq, 1*mmsize
    add     wq, 2*mmsize
    jl      .loop_w

    add     srcq, src_strideq
    add     t2q, dst_strideq
    sub     hd, 1
    jg      .loop_h

    RET

View File

@@ -0,0 +1,195 @@
/*
* Copyright (C) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavcodec/diracdsp.h"
#include "fpel.h"
/* Prototypes for the Dirac pixel-op wrappers.  DECL_DIRAC_PIXOP is not
 * defined in this file (presumably it comes from diracdsp.h and declares the
 * 8/16/32 wide variants for the given operation and extension). */
DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);
/* SSE2 pixel-op wrappers, defined further down in this file. */
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
/* Routines not defined in this file (external assembly); installed into the
 * DiracDSPContext by ff_diracdsp_init_x86(). */
void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
#if HAVE_X86ASM
/*
 * HPEL_FILTER(MMSIZE, EXT): declares the external vertical/horizontal
 * half-pel filter kernels for extension EXT and defines the static driver
 * dirac_hpel_filter_EXT(), which processes 'height' rows one at a time:
 *   - the vertical filter writes dstv, starting MMSIZE bytes to the left of
 *     the row and covering width + MMSIZE + 5 bytes;
 *   - the horizontal filter is then run on src (-> dsth) and on the freshly
 *     produced dstv row (-> dstc).
 * All four pointers advance by 'stride' after each row.
 */
#define HPEL_FILTER(MMSIZE, EXT) \
void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
\
static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height) \
{ \
while( height-- ) \
{ \
ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
\
dsth += stride; \
dstv += stride; \
dstc += stride; \
src += stride; \
} \
}
/*
 * PIXFUNC(PFX, IDX, EXT): installs the 16- and 32-pixel wide PFX (put/avg)
 * wrappers for extension EXT into c->PFX_dirac_pixels_tab[1..2][IDX].
 * The 8-pixel entry (tab[0]) is deliberately left commented out.
 * Expands to two statements and relies on a variable named 'c' being in
 * scope, so it must only be used inside a braced block.
 */
#define PIXFUNC(PFX, IDX, EXT) \
/*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
/*
 * DIRAC_PIXOP(OPNAME2, OPNAME, EXT): defines ff_<OPNAME2>_dirac_pixels{8,16,32}_<EXT>().
 * Each wrapper falls back to the C implementation when h is not a multiple
 * of 4 (h & 3); otherwise it forwards src[0] to the generic
 * <OPNAME>_pixels{8,16}_<EXT> routine.  The 32-wide variant is built from
 * two 16-wide calls on the left and right halves.
 * NOTE(review): the multiple-of-4 requirement presumably reflects how many
 * rows the SIMD kernels handle per iteration - confirm against fpel.
 */
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
else\
OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
else\
OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3) {\
ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
} else {\
OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}\
}
/* Instantiate the MMX put/avg wrappers and the MMXEXT avg wrappers. */
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)
/**
 * Copy a 16-pixel wide block of h rows from src[0] to dst.
 *
 * The SSE2 routine is only used when h is a multiple of 4; any other height
 * is handed to the C reference implementation (which receives the whole
 * src[] array).
 */
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if ((h & 3) == 0) {
        ff_put_pixels16_sse2(dst, src[0], stride, h);
        return;
    }

    ff_put_dirac_pixels16_c(dst, src, stride, h);
}
/**
 * Average a 16-pixel wide block of h rows from src[0] into dst.
 *
 * The SSE2 routine is only used when h is a multiple of 4; any other height
 * is handed to the C reference implementation (which receives the whole
 * src[] array).
 */
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if ((h & 3) == 0) {
        ff_avg_pixels16_sse2(dst, src[0], stride, h);
        return;
    }

    ff_avg_dirac_pixels16_c(dst, src, stride, h);
}
/**
 * Copy a 32-pixel wide block of h rows from src[0] to dst.
 *
 * Heights that are not a multiple of 4 go to the C reference implementation.
 * Otherwise the block is handled as two adjacent 16-pixel wide halves using
 * the SSE2 16-wide routine.
 */
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3) {
        ff_put_dirac_pixels32_c(dst, src, stride, h);
        return;
    }

    /* left half, then right half */
    ff_put_pixels16_sse2(dst,      src[0],      stride, h);
    ff_put_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
}
/**
 * Average a 32-pixel wide block of h rows from src[0] into dst.
 *
 * Heights that are not a multiple of 4 go to the C reference implementation.
 * Otherwise the block is handled as two adjacent 16-pixel wide halves using
 * the SSE2 16-wide routine.
 */
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3) {
        ff_avg_dirac_pixels32_c(dst, src, stride, h);
        return;
    }

    /* left half, then right half */
    ff_avg_pixels16_sse2(dst,      src[0],      stride, h);
    ff_avg_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
}
#else // HAVE_X86ASM
/* Build without external x86 assembly: HPEL_FILTER only declares
 * dirac_hpel_filter_EXT() (no definition is provided here), and PIXFUNC
 * becomes a no-op statement so ff_diracdsp_init_x86() still compiles. */
#define HPEL_FILTER(MMSIZE, EXT) \
void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height);
#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
#endif // HAVE_X86ASM
/* The MMX half-pel driver (8-byte vectors) is only generated for 32-bit
 * builds; the SSE2 one (16-byte vectors) is generated for every build. */
#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)
/**
 * Install the x86-optimized Dirac DSP routines into @p c.
 *
 * Instruction sets are tested from the oldest to the newest, so a pointer
 * written by an earlier tier is overwritten when a later tier is available.
 */
void ff_diracdsp_init_x86(DiracDSPContext* c)
{
    const int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
#if !ARCH_X86_64
        /* These MMX versions are only compiled into 32-bit builds. */
        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
        c->add_rect_clamped           = ff_add_rect_clamped_mmx;
        c->dirac_hpel_filter          = dirac_hpel_filter_mmx;
        c->add_dirac_obmc[2]          = ff_add_dirac_obmc32_mmx;
        c->add_dirac_obmc[1]          = ff_add_dirac_obmc16_mmx;
#endif
        c->add_dirac_obmc[0]          = ff_add_dirac_obmc8_mmx;

        PIXFUNC(put, 0, mmx);
        PIXFUNC(avg, 0, mmx);
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        PIXFUNC(avg, 0, mmxext);
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        /* pixel copy / average, 16 and 32 wide */
        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;

        /* overlapped block motion compensation */
        c->add_dirac_obmc[1]          = ff_add_dirac_obmc16_sse2;
        c->add_dirac_obmc[2]          = ff_add_dirac_obmc32_sse2;

        /* filtering and clamped output */
        c->dirac_hpel_filter          = dirac_hpel_filter_sse2;
        c->add_rect_clamped           = ff_add_rect_clamped_sse2;
        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
    }

    if (EXTERNAL_SSE4(cpu_flags)) {
        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
    }
}

49
externals/ffmpeg/libavcodec/x86/dnxhdenc.asm vendored Executable file
View File

@@ -0,0 +1,49 @@
;************************************************************************
;* VC3/DNxHD SIMD functions
;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
; ptrdiff_t line_size)
;
; Reads four rows of 8 bytes from 'pixels' (rows are line_size bytes apart),
; zero-extends every byte to a 16-bit word and writes an 8x8 block of words:
; rows 0..3 hold the four input rows and rows 4..7 hold them again in
; reverse order (3, 2, 1, 0), i.e. the block is mirrored vertically.
; The stores use mova, so 'block' is expected to be 16-byte aligned.
INIT_XMM sse2
cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
pxor m4, m4 ; zero, used as the high bytes when widening
movq m0, [pixelsq] ; row 0
add pixelsq, linesizeq
movq m1, [pixelsq] ; row 1
movq m2, [pixelsq+linesizeq] ; row 2
movq m3, [pixelsq+linesizeq*2] ; row 3
punpcklbw m0, m4 ; bytes -> words
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [blockq ], m0
mova [blockq+16 ], m1
mova [blockq+32 ], m2
mova [blockq+48 ], m3
; mirrored lower half
mova [blockq+64 ], m3
mova [blockq+80 ], m2
mova [blockq+96 ], m1
mova [blockq+112], m0
RET

View File

@@ -0,0 +1,37 @@
/*
* VC3/DNxHD SIMD functions
* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dnxhdenc.h"
/* Implemented in external x86 assembly. */
void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size);

/**
 * Install the x86-optimized DNxHD encoder helpers into @p ctx.
 *
 * The SSE2 get_pixels_8x4_sym routine is only selected when SSE2 is
 * available and the profile described by ctx->cid_table is 8-bit.
 */
av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
    if (!EXTERNAL_SSE2(av_get_cpu_flags()))
        return;

    if (ctx->cid_table->bit_depth != 8)
        return;

    ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
}

118
externals/ffmpeg/libavcodec/x86/exrdsp.asm vendored Executable file
View File

@@ -0,0 +1,118 @@
;******************************************************************************
;* X86 Optimized functions for Open Exr Decoder
;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
;*
;* reorder_pixels, predictor based on patch by John Loy
;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
;*
;* predictor AVX/AVX2 by Henrik Gramner
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
cextern pb_15
cextern pb_80
SECTION .text
;------------------------------------------------------------------------------
; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
;
; Byte-interleaves the first half of src (src[0 .. size/2)) with the second
; half (src[size/2 .. size)) into dst.  All three pointers are moved to the
; end of their region and a single negative offset counts up to zero.
; The first-half load and the stores use mova (aligned); the second-half load
; uses movu.  Each iteration consumes mmsize bytes of each half, and the loop
; body always runs at least once.
; NOTE(review): SBUTTERFLY comes from x86util.asm; it is presumably the
; punpckl/punpckh pair producing the low and high interleaved halves.
;------------------------------------------------------------------------------
%macro REORDER_PIXELS 0
cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
lea src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
add dstq, sizeq ; dst offset by size
shr sizeq, 1 ; half_size
add src1q, sizeq ; offset src by half_size
neg sizeq ; size = offset for dst, src1, src2
.loop:
mova m0, [src1q+sizeq] ; load first part
movu m1, [src2q+sizeq] ; load second part
SBUTTERFLY bw, 0, 1, 2 ; interleaved
mova [dstq+2*sizeq ], xm0 ; copy to dst
mova [dstq+2*sizeq+16], xm1
%if cpuflag(avx2)
; the low 128-bit lanes were stored above; gather the two high lanes into
; one register and store them as the following 32 bytes
vperm2i128 m0, m0, m1, q0301
mova [dstq+2*sizeq+32], m0
%endif
add sizeq, mmsize
jl .loop
RET
%endmacro
; SSE2 build is always assembled; the AVX2 build only when the assembler
; supports it.
INIT_XMM sse2
REORDER_PIXELS
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
REORDER_PIXELS
%endif
;------------------------------------------------------------------------------
; void ff_predictor(uint8_t *src, ptrdiff_t size);
;
; In-place running (prefix) sum over the bytes of src: each input byte is
; first XORed with the pb_80 constant, then accumulated modulo 256 onto the
; sum of all previous bytes.  Within a 16-byte lane the prefix sum is built
; with shift-by-1/2/4/8 + add steps; xm2 carries the running total from one
; 16-byte group to the next.
; NOTE(review): pb_80 and pb_15 are external constants; from their names they
; are assumed to be bytes of 0x80 and 15, in which case "pshufb xm2, xm1"
; broadcasts the last byte of the group just written - confirm.
; The loop always runs at least once and handles mmsize bytes per iteration;
; loads and stores on src are aligned (mova / memory operand of pxor on SSE).
;------------------------------------------------------------------------------
%macro PREDICTOR 0
cglobal predictor, 2,2,5, src, size
mova m0, [pb_80]
mova xm1, [pb_15]
mova xm2, xm0 ; initial carry
add srcq, sizeq ; point at the end, index with a negative offset
neg sizeq
.loop:
pxor m3, m0, [srcq + sizeq]
pslldq m4, m3, 1
paddb m3, m4
pslldq m4, m3, 2
paddb m3, m4
pslldq m4, m3, 4
paddb m3, m4
pslldq m4, m3, 8
%if mmsize == 32
; pslldq works per 128-bit lane, so m3 holds two independent 16-byte
; prefix sums; chain them through the carry in xm2
paddb m3, m4
paddb xm2, xm3 ; low lane + carry
vextracti128 xm4, m3, 1 ; high lane prefix sum
mova [srcq + sizeq], xm2
pshufb xm2, xm1 ; carry out of the low lane
paddb xm2, xm4
mova [srcq + sizeq + 16], xm2
%else
; the final shift-by-8 step is folded into the carry addition
paddb m2, m3
paddb m2, m4
mova [srcq + sizeq], m2
%endif
pshufb xm2, xm1 ; carry for the next iteration
add sizeq, mmsize
jl .loop
RET
%endmacro
; SSSE3 and AVX builds use 16-byte vectors; the AVX2 build uses 32-byte ones.
INIT_XMM ssse3
PREDICTOR
INIT_XMM avx
PREDICTOR
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
PREDICTOR
%endif

52
externals/ffmpeg/libavcodec/x86/exrdsp_init.c vendored Executable file
View File

@@ -0,0 +1,52 @@
/*
* OpenEXR (.exr) image decoder
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/exrdsp.h"
/* Routines implemented in external x86 assembly (exrdsp.asm); selected at
 * run time by ff_exrdsp_init_x86(). */
void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);
void ff_predictor_avx(uint8_t *src, ptrdiff_t size);
void ff_predictor_avx2(uint8_t *src, ptrdiff_t size);
/**
 * Install the x86-optimized OpenEXR DSP routines into @p dsp.
 *
 * The checks run from the oldest to the newest instruction set, so the last
 * matching tier decides which implementation ends up in each pointer.
 */
av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
{
    const int cpuflags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpuflags))
        dsp->reorder_pixels = ff_reorder_pixels_sse2;

    if (EXTERNAL_SSSE3(cpuflags))
        dsp->predictor = ff_predictor_ssse3;

    if (EXTERNAL_AVX(cpuflags))
        dsp->predictor = ff_predictor_avx;

    if (EXTERNAL_AVX2_FAST(cpuflags)) {
        dsp->predictor      = ff_predictor_avx2;
        dsp->reorder_pixels = ff_reorder_pixels_avx2;
    }
}

594
externals/ffmpeg/libavcodec/x86/fdct.c vendored Executable file
View File

@@ -0,0 +1,594 @@
/*
* SIMD-optimized forward DCT
* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
*
* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
*
* Intel Application Note AP-922 - fast, precise implementation of DCT
* http://developer.intel.com/vtune/cbts/appnotes.htm
*
* Also of inspiration:
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "fdct.h"
#if HAVE_MMX_INLINE
//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries! Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////
// Extra fractional bits carried between the column and the row pass.
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
// Left shift applied to the inputs of the column pass.
#define SHIFT_FRW_COL BITS_FRW_ACC
// Right shift applied to the 32-bit row-pass sums (3 + 17 - 3 = 17).
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
// Rounding term added before that shift: half of 1 << SHIFT_FRW_ROW.
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
// Repeats a value 8 times, to fill one 8 x int16 vector.
#define X8(x) x,x,x,x,x,x,x,x
//concatenated table, for forward DCT transformation
// Three vectors of 8 identical words, addressed by the column pass at byte
// offsets 0, 16 and 32.  Numerically the values are tangents in 0.16 fixed
// point (scale 1 << 16); the third one does not fit in int16 and is stored
// minus 1.0 (43790 - 65536).
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
X8(13036), // tan(1*pi/16) * (1<<16) + 0.5
X8(27146), // tan(2*pi/16) * (1<<16) + 0.5
X8(-21746) // tan(3*pi/16) * (1<<16) + 0.5 - (1<<16)
};
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
X8(23170) // cos(pi/4) * (1<<15) + 0.5
};
// Word-wise 1; OR-ed into some column-pass results (sets their lowest bit).
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
// Row-pass rounding constant for the MMX/MMXEXT code (2 dwords).
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
// Same rounding constant for the SSE2 code (4 dwords); wrapped in a struct
// so that the 16-byte alignment attribute can be attached to the member.
static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
// Row-pass coefficients for the MMX/MMXEXT code: 8 rows x 32 words.
// ff_fdct_mmx()/ff_fdct_mmxext() start at row 0 and advance the table
// pointer by 32 words after each row; the row routines read 64 bytes
// (byte offsets 0..63) per row.  Rows 4..7 repeat the coefficient sets of
// rows 0, 3, 2 and 1 respectively.
DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
/* row 0 */
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
/* row 1 */
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
/* row 2 */
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
/* row 3 */
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
/* row 4 (same as row 0) */
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
/* row 5 (same as row 3) */
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
/* row 6 (same as row 2) */
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
/* row 7 (same as row 1) */
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
};
// Row-pass coefficients for the SSE2 code: 8 sets x 32 words = 256 words,
// 16-byte aligned (the struct wrapper carries the alignment attribute).
// Every set is the TABLE_SSE2 pattern expanded with a different C1..C7
// scaling; the sets use the same values as the rows of tab_frw_01234567,
// in the same order 0,1,2,3,0,3,2,1, but arranged for 128-bit pmaddwd.
// Note: fdct_row_sse2() in this file only addresses byte offsets 0..255 of
// this table, i.e. the first four sets (each shared by a pair of rows).
// Note: TABLE_SSE2 and the last C1..C7 definitions stay defined after this
// initializer.
static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
C4, C4, C5, C7, C2, C6, C3, -C7, \
-C4, C4, C7, C3, C6, -C2, C7, -C5, \
C4, -C4, C5, -C1, C2, -C6, C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
/* set 0 */
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
/* set 1 */
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
/* set 2 */
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
/* set 3 */
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
/* set 4 (same values as set 0) */
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
/* set 5 (same values as set 3) */
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
/* set 6 (same values as set 2) */
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
/* set 7 (same values as set 1) */
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};
// Expands its argument and turns it into a string literal, so numeric macros
// can be pasted into the asm templates as immediates.
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
/*
 * FDCT_COL(cpu, mm, mov): defines fdct_col_<cpu>(in, out, offset), the
 * column pass of the 8x8 forward DCT, as one inline-asm block.
 *   cpu - suffix of the generated function name
 *   mm  - register name prefix ("mm" or "xmm")
 *   mov - full-register move instruction (movq / movdqa)
 *
 * Asm operands:
 *   %0 = in  + offset   rows are read at byte offsets 0, 16, ..., 112
 *   %1 = fdct_tg_all_16 (three vectors at byte offsets 0, 16, 32)
 *   %2 = fdct_one_corr
 *   %3 = out + offset   rows are written at the same byte offsets
 *   %4 = ocos_4_16
 * One call handles as many adjacent columns as fit in a register: 4 words
 * with MMX (hence the two calls with offset 0 and 4 in ff_fdct_mmx) and all
 * 8 words with SSE2.  The movdqa variant needs 16-byte aligned in/out.
 *
 * NOTE(review): the asm statement stores through %3 and uses registers
 * 0..7 but declares neither outputs nor clobbers; correctness relies on the
 * statement being volatile and on how the callers use the buffers.
 */
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
__asm__ volatile (\
#mov" 16(%0), %%"#mm"0 \n\t" \
#mov" 96(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"0, %%"#mm"2 \n\t" \
#mov" 32(%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" 80(%0), %%"#mm"4 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
#mov" (%0), %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
"paddsw 112(%0), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
#mov" %%"#mm"0, %%"#mm"6 \n\t" \
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
#mov" 16(%1), %%"#mm"1 \n\t" \
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
#mov" 48(%0), %%"#mm"7 \n\t" \
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
"paddsw 64(%0), %%"#mm"7 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
#mov" %%"#mm"5, %%"#mm"4 \n\t" \
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
"por (%2), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
"pmulhw 16(%1), %%"#mm"5 \n\t" \
#mov" %%"#mm"4, %%"#mm"7 \n\t" \
"psubsw 80(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" %%"#mm"1, 32(%3) \n\t" \
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" 48(%0), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
"psubsw 64(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"2, %%"#mm"6 \n\t" \
#mov" %%"#mm"4, 64(%3) \n\t" \
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
"pmulhw (%4), %%"#mm"2 \n\t" \
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
"pmulhw (%4), %%"#mm"6 \n\t" \
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
"por (%2), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
"por (%2), %%"#mm"2 \n\t" \
#mov" %%"#mm"1, %%"#mm"4 \n\t" \
#mov" (%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
"psubsw 112(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" (%1), %%"#mm"0 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
#mov" 32(%1), %%"#mm"6 \n\t" \
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" %%"#mm"7, (%3) \n\t" \
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
#mov" %%"#mm"5, 96(%3) \n\t" \
#mov" %%"#mm"3, %%"#mm"7 \n\t" \
#mov" 32(%1), %%"#mm"5 \n\t" \
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"pmulhw (%1), %%"#mm"3 \n\t" \
"por (%2), %%"#mm"0 \n\t" \
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" %%"#mm"0, 16(%3) \n\t" \
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
#mov" %%"#mm"7, 48(%3) \n\t" \
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
#mov" %%"#mm"5, 80(%3) \n\t" \
#mov" %%"#mm"3, 112(%3) \n\t" \
: \
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
"r" (out + offset), "r" (ocos_4_16)); \
}
/* fdct_col_mmx: 4 columns per call; fdct_col_sse2: 8 columns per call. */
FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)
/*
 * Row pass of the 8x8 forward DCT for SSE2: transforms all eight rows of
 * 'in' and writes them to 'out' in a single asm statement.
 *
 * Asm operands:
 *   %0 = in, %1 = tab_frw_01234567_sse2, %2 = fdct_r_row_sse2 (rounding),
 *   %3 = SHIFT_FRW_ROW as an immediate, %4 = out.
 * Rows are processed in the pairs (0,4), (1,7), (2,6), (3,5); both rows of
 * a pair use the same 64-byte coefficient set.  The stores use movdqa, so
 * 'out' must be 16-byte aligned.
 * The xmm registers are listed through XMM_CLOBBERS_ONLY (macro from asm.h).
 * NOTE(review): the stores through %4 are not declared as outputs or as a
 * "memory" clobber.
 */
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
__asm__ volatile(
/* First row of a pair: load the two 4-word halves of row at byte offset i
 * and all four coefficient vectors of the set at byte offset t. */
#define FDCT_ROW_SSE2_H1(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t" \
"movdqa " #t "(%1), %%xmm4 \n\t" \
"movdqa " #t "+16(%1), %%xmm5 \n\t"
/* Second row of a pair: xmm4/xmm5 are only read by FDCT_ROW_SSE2 and are
 * still valid, so just reload xmm3/xmm7, which it overwrites. */
#define FDCT_ROW_SSE2_H2(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t"
/* Transform the loaded row: form sums and differences of the low half and
 * the word-reversed high half, multiply-accumulate against the
 * coefficients, add the rounding constant (xmm6), shift right by
 * SHIFT_FRW_ROW, pack to words with signed saturation and store at byte
 * offset i of the output. */
#define FDCT_ROW_SSE2(i) \
"movq %%xmm2, %%xmm1 \n\t" \
"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"psubsw %%xmm0, %%xmm2 \n\t" \
"punpckldq %%xmm2, %%xmm1 \n\t" \
"pshufd $78, %%xmm1, %%xmm2 \n\t" \
"pmaddwd %%xmm2, %%xmm3 \n\t" \
"pmaddwd %%xmm1, %%xmm7 \n\t" \
"pmaddwd %%xmm5, %%xmm2 \n\t" \
"pmaddwd %%xmm4, %%xmm1 \n\t" \
"paddd %%xmm7, %%xmm3 \n\t" \
"paddd %%xmm2, %%xmm1 \n\t" \
"paddd %%xmm6, %%xmm3 \n\t" \
"paddd %%xmm6, %%xmm1 \n\t" \
"psrad %3, %%xmm3 \n\t" \
"psrad %3, %%xmm1 \n\t" \
"packssdw %%xmm3, %%xmm1 \n\t" \
"movdqa %%xmm1, " #i "(%4) \n\t"
/* xmm6 = rounding constant, kept for all eight rows */
"movdqa (%2), %%xmm6 \n\t"
/* rows 0 and 4 */
FDCT_ROW_SSE2_H1(0,0)
FDCT_ROW_SSE2(0)
FDCT_ROW_SSE2_H2(64,0)
FDCT_ROW_SSE2(64)
/* rows 1 and 7 */
FDCT_ROW_SSE2_H1(16,64)
FDCT_ROW_SSE2(16)
FDCT_ROW_SSE2_H2(112,64)
FDCT_ROW_SSE2(112)
/* rows 2 and 6 */
FDCT_ROW_SSE2_H1(32,128)
FDCT_ROW_SSE2(32)
FDCT_ROW_SSE2_H2(96,128)
FDCT_ROW_SSE2(96)
/* rows 3 and 5 */
FDCT_ROW_SSE2_H1(48,192)
FDCT_ROW_SSE2(48)
FDCT_ROW_SSE2_H2(80,192)
FDCT_ROW_SSE2(80)
:
: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
/*
 * Row pass of the forward DCT for ONE row of 8 words, MMXEXT version.
 *
 * Asm operands: %0 = in (row), %1 = table (64 bytes of coefficients for
 * this row), %2 = fdct_r_row (rounding), %3 = out (row).
 * pshufw $0x1B reverses the four words x4..x7, so that sums/differences
 * x[i] +/- x[7-i] can be formed with one paddsw/psubsw each.  The eight
 * pmaddwd products are summed pairwise, rounded, shifted right by
 * SHIFT_FRW_ROW and packed back to 8 words.
 * NOTE(review): the asm declares no outputs and no clobbers although it
 * writes through %3 and uses mm0..mm7.
 */
static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
const int16_t *table)
{
__asm__ volatile (
"pshufw $0x1B, 8(%0), %%mm5 \n\t"
"movq (%0), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"psubsw %%mm5, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm1, %%mm0 \n\t"
"punpckhdq %%mm1, %%mm2 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, (%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
/*
 * Row pass of the forward DCT for ONE row of 8 words, plain MMX version.
 *
 * Same operands and arithmetic as fdct_row_mmxext(); the only difference is
 * the start: plain MMX has no pshufw, so the word-reversed upper half
 * (x7, x6, x5, x4) is assembled in mm1 with movd/punpcklwd/psrlq, and the
 * differences are kept in mm5 instead of mm1.
 * NOTE(review): the asm declares no outputs and no clobbers although it
 * writes through %3 and uses mm0..mm7.
 */
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
__asm__ volatile(
"movd 12(%0), %%mm1 \n\t"
"punpcklwd 8(%0), %%mm1 \n\t"
"movq %%mm1, %%mm2 \n\t"
"psrlq $0x20, %%mm1 \n\t"
"movq 0(%0), %%mm0 \n\t"
"punpcklwd %%mm2, %%mm1 \n\t"
"movq %%mm0, %%mm5 \n\t"
"paddsw %%mm1, %%mm0 \n\t"
"psubsw %%mm1, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm5, %%mm0 \n\t"
"punpckhdq %%mm5, %%mm2 \n\t"
"movq 0(%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, 0(%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
/**
 * In-place 8x8 forward DCT, MMX version.
 *
 * The column pass writes into an aligned temporary block; the row pass then
 * transforms each temporary row back into @p block.
 *
 * @param block 64 coefficients in row-major order, overwritten with the result
 */
void ff_fdct_mmx(int16_t *block)
{
    /* 16 x int64_t = 128 bytes = one 8x8 block of int16_t */
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t *const tmp = (int16_t *)align_tmp;
    int row;

    /* Column pass: an MMX register covers four columns, so two calls. */
    fdct_col_mmx(block, tmp, 0);
    fdct_col_mmx(block, tmp, 4);

    /* Row pass: 8 words of data and 32 words of coefficients per row. */
    for (row = 0; row < 8; row++)
        fdct_row_mmx(tmp + 8 * row, block + 8 * row,
                     tab_frw_01234567 + 32 * row);
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
/**
 * In-place 8x8 forward DCT, MMXEXT version.
 *
 * Uses the plain MMX column pass and the MMXEXT row pass.
 *
 * @param block 64 coefficients in row-major order, overwritten with the result
 */
void ff_fdct_mmxext(int16_t *block)
{
    /* 16 x int64_t = 128 bytes = one 8x8 block of int16_t */
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t *const tmp = (int16_t *)align_tmp;
    int row;

    /* Column pass: an MMX register covers four columns, so two calls. */
    fdct_col_mmx(block, tmp, 0);
    fdct_col_mmx(block, tmp, 4);

    /* Row pass: 8 words of data and 32 words of coefficients per row. */
    for (row = 0; row < 8; row++)
        fdct_row_mmxext(tmp + 8 * row, block + 8 * row,
                        tab_frw_01234567 + 32 * row);
}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_INLINE
/**
 * In-place 8x8 forward DCT, SSE2 version.
 *
 * One column-pass call covers all eight columns and one row-pass call covers
 * all eight rows; a 16-byte aligned temporary holds the intermediate block.
 *
 * @param block 64 coefficients in row-major order, overwritten with the result
 */
void ff_fdct_sse2(int16_t *block)
{
    /* 16 x int64_t = 128 bytes = one 8x8 block of int16_t */
    DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
    int16_t *const tmp = (int16_t *)align_tmp;

    fdct_col_sse2(block, tmp, 0);   /* block -> tmp */
    fdct_row_sse2(tmp, block);      /* tmp -> block */
}
#endif /* HAVE_SSE2_INLINE */

28
externals/ffmpeg/libavcodec/x86/fdct.h vendored Executable file
View File

@@ -0,0 +1,28 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FDCT_H
#define AVCODEC_X86_FDCT_H
#include <stdint.h>
/* In-place forward DCT entry points, one per instruction set; each takes a
 * pointer to a block of int16_t coefficients that is overwritten. */
void ff_fdct_mmx(int16_t *block);
void ff_fdct_mmxext(int16_t *block);
void ff_fdct_sse2(int16_t *block);
#endif /* AVCODEC_X86_FDCT_H */

View File

@@ -0,0 +1,44 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fdctdsp.h"
#include "fdct.h"
/**
 * Select the x86 forward DCT for @p c.
 *
 * Nothing is changed for high bit depth content, or when the user asked for
 * a DCT algorithm other than FF_DCT_AUTO / FF_DCT_MMX.  Otherwise the most
 * capable inline-asm implementation supported by the CPU is installed
 * (later checks override earlier ones).
 */
av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

    if (high_bit_depth)
        return;

    if (dct_algo != FF_DCT_AUTO && dct_algo != FF_DCT_MMX)
        return;

    if (INLINE_MMX(cpu_flags))
        c->fdct = ff_fdct_mmx;

    if (INLINE_MMXEXT(cpu_flags))
        c->fdct = ff_fdct_mmxext;

    if (INLINE_SSE2(cpu_flags))
        c->fdct = ff_fdct_sse2;
}

1085
externals/ffmpeg/libavcodec/x86/fft.asm vendored Executable file

File diff suppressed because it is too large Load Diff

38
externals/ffmpeg/libavcodec/x86/fft.h vendored Executable file
View File

@@ -0,0 +1,38 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H
#include "libavcodec/fft.h"
/*
 * FFT permutation and calculation routines, one per instruction-set level.
 * They take the FFTContext and the complex buffer z.
 * NOTE(review): presumably they operate on z in place -- confirm against
 * the assembly implementation.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
/*
 * Inverse MDCT routines ("calc" and "half" variants) for each
 * instruction-set level; input is read-only, results go to output.
 * Note that AVX only provides the "half" variant here.
 */
void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
#endif /* AVCODEC_X86_FFT_H */

61
externals/ffmpeg/libavcodec/x86/fft_init.c vendored Executable file
View File

@@ -0,0 +1,61 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "fft.h"
/**
 * Hook the x86 SIMD FFT/IMDCT routines into an FFTContext.
 *
 * Contexts with nbits > 16 are left untouched. Otherwise the candidate
 * instruction sets are examined in increasing order of preference and each
 * supported one overwrites the pointers set by the previous one.
 *
 * @param s FFT context to update
 */
av_cold void ff_fft_init_x86(FFTContext *s)
{
    const int flags = av_get_cpu_flags();

    if (s->nbits > 16)
        return;

#if ARCH_X86_32
    /* 3DNow! variants are only built for 32-bit x86. */
    if (EXTERNAL_AMD3DNOW(flags)) {
        s->fft_calc   = ff_fft_calc_3dnow;
        s->imdct_half = ff_imdct_half_3dnow;
        s->imdct_calc = ff_imdct_calc_3dnow;
    }
    if (EXTERNAL_AMD3DNOWEXT(flags)) {
        s->fft_calc   = ff_fft_calc_3dnowext;
        s->imdct_half = ff_imdct_half_3dnowext;
        s->imdct_calc = ff_imdct_calc_3dnowext;
    }
#endif /* ARCH_X86_32 */

    /* SSE also replaces the permutation routine and its layout tag. */
    if (EXTERNAL_SSE(flags)) {
        s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
        s->fft_permute     = ff_fft_permute_sse;
        s->fft_calc        = ff_fft_calc_sse;
        s->imdct_half      = ff_imdct_half_sse;
        s->imdct_calc      = ff_imdct_calc_sse;
    }

    /* AVX is only selected for transforms of at least 2^5 points; it keeps
     * the SSE imdct_calc and fft_permute set above. */
    if (EXTERNAL_AVX_FAST(flags) && s->nbits >= 5) {
        s->fft_permutation = FF_FFT_PERM_AVX;
        s->fft_calc        = ff_fft_calc_avx;
        s->imdct_half      = ff_imdct_half_avx;
    }
}

View File

@@ -0,0 +1,101 @@
;******************************************************************************
;* FLAC DSP functions
;*
;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;------------------------------------------------------------------------------
; void ff_flac_enc_lpc_16_sse4(int32_t *res, const int32_t *smp, int len,
;                              int order, const int32_t *coefs, int shift);
;
; LPC residual computation using 32-bit products (pmulld):
;     res[i] = smp[i] - ((sum over j of coefs[j] * smp[i-j-1]) >> shift)
; Three vector registers' worth of samples are produced per outer iteration.
; The sixth argument is not named in cglobal; it is read through r5m and used
; as the shift count.
; NOTE(review): the outer loop has no tail handling -- the last iteration
; always reads and writes three full vectors, so the buffers presumably need
; padding beyond len; confirm with the caller.
;------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64
cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
DECLARE_REG_TMP 5, 6
%define length r2d
; order is a 32-bit argument; sign-extend it for use in 64-bit addressing
movsxd orderq, orderd
%else
cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
; x86-32: the sample counter is kept in its stack slot (r2mp) so that r2 can
; be used as temporary t0
DECLARE_REG_TMP 2, 5
%define length r2mp
%endif
; Here we assume that the maximum order value is 32. This means that we only
; need to copy a maximum of 32 samples. Therefore we let the preprocessor
; unroll this loop and copy all 32.
; (Entries from index `order` onward are rewritten by the main loop below,
; which starts storing at res[order].)
%assign iter 0
%rep 32/(mmsize/4)
movu m0, [smpq+iter]
movu [resq+iter], m0
%assign iter iter+mmsize
%endrep
; Skip the first `order` samples in res/smp and point coefsq one past the last
; coefficient, so a negative index counting up to zero walks coefs[0..order-1].
lea resq, [resq+orderq*4]
lea smpq, [smpq+orderq*4]
lea coefsq, [coefsq+orderq*4]
sub length, orderd
; m3 = shift count; fetched before t0 is written (on x86-64 t0 is r5)
movd m3, r5m
neg orderq
%define posj t0q
%define negj t1q
.looplen:
; m0/m4/m6 = prediction accumulators for three consecutive vectors of samples
pxor m0, m0
pxor m4, m4
pxor m6, m6
; posj runs from -order up to 0 (coefficient index), negj from 0 downwards
; (offset of the history sample relative to the current position)
mov posj, orderq
xor negj, negj
.looporder:
movd m2, [coefsq+posj*4] ; c = coefs[j]
SPLATD m2
movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
movu m5, [smpq+negj*4-4+mmsize]
movu m7, [smpq+negj*4-4+mmsize*2]
pmulld m1, m2
pmulld m5, m2
pmulld m7, m2
paddd m0, m1 ; p += c * s
paddd m4, m5
paddd m6, m7
dec negj
inc posj
; flags come from `inc posj`: loop until the coefficient index reaches zero
jnz .looporder
psrad m0, m3 ; p >>= shift
psrad m4, m3
psrad m6, m3
movu m1, [smpq]
movu m5, [smpq+mmsize]
movu m7, [smpq+mmsize*2]
psubd m1, m0 ; smp[i] - p
psubd m5, m4
psubd m7, m6
movu [resq], m1 ; res[i] = smp[i] - (p >> shift)
movu [resq+mmsize], m5
movu [resq+mmsize*2], m7
; advance by the three vectors just processed
add resq, 3*mmsize
add smpq, 3*mmsize
sub length, (3*mmsize)/4
jg .looplen
RET

313
externals/ffmpeg/libavcodec/x86/flacdsp.asm vendored Executable file
View File

@@ -0,0 +1,313 @@
;******************************************************************************
;* FLAC DSP SIMD optimizations
;*
;* Copyright (C) 2014 Loren Merritt
;* Copyright (C) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; PMACSDQL acc, a, b, x, y
; Signed dword multiply-accumulate into 64-bit lanes: %1 += %2 * %3.
; With XOP a single pmacsdql is emitted; otherwise pmuldq + paddq is used,
; which overwrites %2 with the product. Arguments %4 and %5 are accepted but
; never referenced in the macro body.
%macro PMACSDQL 5
%if cpuflag(xop)
pmacsdql %1, %2, %3, %1
%else
pmuldq %2, %3
paddq %1, %2
%endif
%endmacro
;------------------------------------------------------------------------------
; void ff_flac_lpc_32_<opt>(int32_t *decoded, const int coeffs[32],
;                           int pred_order, int qlevel, int len);
; %1 = instruction-set name handed to INIT_XMM (instantiated for xop and sse4)
;
; Adds the LPC prediction, computed with 64-bit accumulation, to the samples
; in `decoded`, in place. Two output samples are handled per .loop_sample
; iteration: m2 accumulates the prediction of the first and m3 that of the
; second, with every loaded coefficient/sample shared between both sums.
;------------------------------------------------------------------------------
%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
; nothing to do unless len > pred_order
sub lend, pred_orderd
jle .ret
; Bias both pointers by pred_order so that a negative index counting up to
; zero walks the history window and the coefficient array.
lea decodedq, [decodedq+pred_orderq*4-8]
lea coeffsq, [coeffsq+pred_orderq*4]
neg pred_orderq
; m4 = qlevel, used as the shift count for psrlq
movd m4, qlevelm
ALIGN 16
.loop_sample:
; prime m0/m1 with the oldest history sample and the first coefficient
movd m0, [decodedq+pred_orderq*4+8]
add decodedq, 8
movd m1, [coeffsq+pred_orderq*4]
pxor m2, m2
pxor m3, m3
; j = 1 - pred_order; the inner loop is skipped entirely when it is already 0
lea jq, [pred_orderq+1]
test jq, jq
jz .end_order
.loop_order:
; m2 += sample * coeff (first output sample)
PMACSDQL m2, m0, m1, m2, m0
movd m0, [decodedq+jq*4]
; m3 += coeff * next sample (second output sample)
PMACSDQL m3, m1, m0, m3, m1
movd m1, [coeffsq+jq*4]
inc jq
jl .loop_order
.end_order:
; last term of the first sum, then scale and add to the first output sample
PMACSDQL m2, m0, m1, m2, m0
psrlq m2, m4
movd m0, [decodedq]
paddd m0, m2
movd [decodedq], m0
; flags set here are also consumed by `jg .loop_sample` below; the SIMD
; instructions in between do not modify them
sub lend, 2
jl .ret
; the second sum's last term uses the sample that was just reconstructed (m0)
PMACSDQL m3, m1, m0, m3, m1
psrlq m3, m4
movd m1, [decodedq+4]
paddd m1, m3
movd [decodedq+4], m1
jg .loop_sample
.ret:
REP_RET
%endmacro
; XOP version is only assembled when the toolchain supports it
%if HAVE_XOP_EXTERNAL
LPC_32 xop
%endif
LPC_32 sse4
;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
; int len, int shift);
;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_16 name, regA, regB [, op]
; %1 = function name suffix (ls, rs, ms or indep2)
; %2 = index of the register stored first in each output pair
; %3 = index of the register stored second in each output pair
; %4 = add/sub, expanded into paddd/psubd (unused for indep2)
;
; Reads two planar int32 channels (in[0], in[1]) and writes interleaved,
; left-shifted int16 pairs to out[0]. Each loop iteration consumes 16 bytes of
; every input channel and produces 16 bytes of output; all accesses use mova.
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
mov lend, lenm
%endif
; m3 = shift (fifth argument)
movd m3, r4m
; len samples -> byte count of one int32 input channel
shl lend, 2
; dereference the pointer arrays: in1 = in[1], in0 = in[0], out = out[0]
mov in1q, [in0q + gprsize]
mov in0q, [in0q]
mov outq, [outq]
; point everything at the end and index with a negative offset counting up
add in1q, lenq
add in0q, lenq
add outq, lenq
neg lenq
align 16
.loop:
mova m0, [in0q + lenq]
mova m1, [in1q + lenq]
%ifidn %1, ms
; mid/side: m0 = in0 - (in1 >> 1)
psrad m2, m1, 1
psubd m0, m2
%endif
%ifnidn %1, indep2
; m2 = m0 +/- m1, the derived channel
p%4d m2, m0, m1
%endif
; saturate both channels to int16, interleave them word by word, apply shift
packssdw m%2, m%2
packssdw m%3, m%3
punpcklwd m%2, m%3
psllw m%2, m3
mova [outq + lenq], m%2
add lenq, 16
jl .loop
REP_RET
%endmacro
INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add
;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
; int len, int shift);
;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_32 name, regA, regB, tmp, op
; %1 = function name suffix (ls, rs or ms)
; %2 = index of the register stored first in each output pair
; %3 = index of the register stored second in each output pair
; %4 = scratch register index for SBUTTERFLY
; %5 = add/sub, expanded into paddd/psubd
;
; Same operation as the 16-bit variant but keeping 32-bit samples: each
; iteration reads mmsize bytes from both input channels and writes 2*mmsize
; bytes of interleaved, left-shifted output to out[0].
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
mov lend, lenm
%endif
; m3 = shift (fifth argument)
movd m3, r4m
mov in1q, [in0q + gprsize]
mov in0q, [in0q]
mov outq, [outq]
; keep in1 as an offset relative to in0 so only in0q has to be advanced
sub in1q, in0q
align 16
.loop:
mova m0, [in0q]
mova m1, [in0q + in1q]
%ifidn %1, ms
; mid/side: m0 = in0 - (in1 >> 1)
psrad m2, m1, 1
psubd m0, m2
%endif
; m2 = m0 +/- m1, the derived channel
p%5d m2, m0, m1
pslld m%2, m3
pslld m%3, m3
; interleave the two channels dword by dword
SBUTTERFLY dq, %2, %3, %4
mova [outq ], m%2
mova [outq + mmsize], m%3
add in0q, mmsize
add outq, mmsize*2
; mmsize/4 samples per channel were consumed
sub lend, mmsize/4
jg .loop
REP_RET
%endmacro
INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add
;-----------------------------------------------------------------------------------------
;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
; int len, int shift);
;-----------------------------------------------------------------------------------------
;%1 = bps
;%2 = channels
;%3 = last xmm reg used
;%4 = word/dword (shift instruction)
; Interleaves %2 independent planar int32 channels into out[0], as int16
; (%1 == 16, with signed saturation) or int32 (%1 == 32), left-shifted by the
; `shift` argument. Every iteration consumes mmsize bytes of each input
; channel and stores REPCOUNT full registers.
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
%if ARCH_X86_32
%if %2 == 6
; x86-32, 6 channels: registers are renamed so that all six channel pointers
; get one, and the sample counter is used directly from its stack slot.
; NOTE(review): presumably because there are not enough general-purpose
; registers for the counter as well -- confirm against x86inc.
DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
%define lend dword r3m
%else
mov lend, lenm
%endif
%endif
; m%3 (the last register) = shift (fifth argument)
movd m%3, r4m
; load the channel pointers in[1] .. in[channels-1]
%assign %%i 1
%rep %2-1
mov in %+ %%i %+ q, [in0q+%%i*gprsize]
%assign %%i %%i+1
%endrep
mov in0q, [in0q]
mov outq, [outq]
; turn every other channel pointer into an offset relative to in0, so the
; loop only has to advance in0q
%assign %%i 1
%rep %2-1
sub in %+ %%i %+ q, in0q
%assign %%i %%i+1
%endrep
align 16
.loop:
; load the first REPCOUNT channels into m0 .. m(REPCOUNT-1)
mova m0, [in0q]
%assign %%i 1
%rep REPCOUNT-1
mova m %+ %%i, [in0q + in %+ %%i %+ q]
%assign %%i %%i+1
%endrep
%if %1 == 32
; 32-bit output: all channels are in registers, transpose them into
; sample-major order
%if %2 == 8
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
; 6-channel interleave built from dword butterflies followed by qword
; unpacks/shuffles; the final SWAP only renames registers.
; NOTE(review): presumably so that m0..m5 end up in store order -- confirm.
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
punpcklqdq m6, m0, m2
punpckhqdq m2, m4
shufps m4, m0, 0xe4
punpcklqdq m0, m1, m3
punpckhqdq m3, m5
shufps m5, m1, 0xe4
SWAP 0,6,1,4,5,3
%elif %2 == 4
TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
SBUTTERFLY dq, 0, 1, 2
%endif
%else ; %1 == 16
; 16-bit output: the second half of the channels is read straight from memory
; by packssdw, which also narrows both halves to int16 with saturation
%if %2 == 8
packssdw m0, [in0q + in4q]
packssdw m1, [in0q + in5q]
packssdw m2, [in0q + in6q]
packssdw m3, [in0q + in7q]
TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
packssdw m0, [in0q + in3q]
packssdw m1, [in0q + in4q]
packssdw m2, [in0q + in5q]
pshufd m3, m0, q1032
punpcklwd m0, m1
punpckhwd m1, m2
punpcklwd m2, m3
shufps m3, m0, m2, q2020
shufps m0, m1, q2031
shufps m2, m1, q3131
shufps m1, m2, m3, q3120
shufps m3, m0, q0220
shufps m0, m2, q3113
SWAP 2, 0, 3
%else ; %2 == 4
packssdw m0, [in0q + in2q]
packssdw m1, [in0q + in3q]
SBUTTERFLY wd, 0, 1, 2
SBUTTERFLY dq, 0, 1, 2
%endif
%endif
; apply the left shift (psllw or pslld, chosen by %4) to every output register
%assign %%i 0
%rep REPCOUNT
psll%4 m %+ %%i, m%3
%assign %%i %%i+1
%endrep
; store the REPCOUNT output registers back to back
%assign %%i 0
%rep REPCOUNT
mova [outq + %%i*mmsize], m %+ %%i
%assign %%i %%i+1
%endrep
add in0q, mmsize
add outq, mmsize*REPCOUNT
; mmsize/4 samples per channel were consumed
sub lend, mmsize/4
jg .loop
REP_RET
%endmacro
; Instantiations. The 8-channel versions are only assembled for x86-64.
INIT_XMM sse2
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif
; AVX versions exist only for the variants listed below
INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

115
externals/ffmpeg/libavcodec/x86/flacdsp_init.c vendored Executable file
View File

@@ -0,0 +1,115 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/flacdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
/* 32-bit LPC routines (decoder side), SSE4 and XOP flavours. */
void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
/* 16-bit LPC routine used by the encoder; installed only when CONFIG_GPL is
 * set (see ff_flacdsp_init_x86 below). */
void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
/*
 * Declares the full family of decorrelation routines (ls, rs, ms and
 * indep2/4/6/8) for one sample size `fmt` (16 or 32) and one instruction set
 * `opt`. These are prototypes only; ff_flacdsp_init_x86() references just a
 * subset of them.
 */
#define DECORRELATE_FUNCS(fmt, opt) \
void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift)
DECORRELATE_FUNCS(16, sse2);
DECORRELATE_FUNCS(16, avx);
DECORRELATE_FUNCS(32, sse2);
DECORRELATE_FUNCS(32, avx);
/**
 * Select x86 SIMD routines for a FLACDSPContext.
 *
 * Decoder: decorrelate[0] (independent channels) depends on the sample
 * format and channel count; decorrelate[1..3] (ls/rs/ms) depend on the
 * sample format only. lpc32 is taken from SSE4 or, preferably, XOP.
 * Encoder: lpc16_encode is taken from SSE4 when CONFIG_GPL is enabled.
 *
 * The 8-channel routines are only referenced on x86-64; the constant is kept
 * as the first operand of each condition so the reference is compiled out
 * elsewhere.
 *
 * @param c        context to update
 * @param fmt      output sample format (S16 and S32 are handled)
 * @param channels number of channels
 * @param bps      not used by this function
 */
av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                                 int bps)
{
#if HAVE_X86ASM
    const int flags = av_get_cpu_flags();

#if CONFIG_FLAC_DECODER
    if (EXTERNAL_SSE2(flags)) {
        if (fmt == AV_SAMPLE_FMT_S16) {
            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;

            if (channels == 2)
                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
            else if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
        } else if (fmt == AV_SAMPLE_FMT_S32) {
            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;

            if (channels == 2)
                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
            else if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
        }
    }

    /* 32-bit LPC: XOP, when usable, overrides the SSE4 choice. */
    if (EXTERNAL_SSE4(flags)) {
        c->lpc32 = ff_flac_lpc_32_sse4;
    }
    if (EXTERNAL_XOP(flags)) {
        c->lpc32 = ff_flac_lpc_32_xop;
    }

    /* AVX only upgrades some of the independent-channel routines. */
    if (EXTERNAL_AVX(flags)) {
        if (fmt == AV_SAMPLE_FMT_S16) {
            if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
        } else if (fmt == AV_SAMPLE_FMT_S32) {
            if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
        }
    }
#endif

#if CONFIG_FLAC_ENCODER
    /* The encoder LPC routine is only installed in GPL builds. */
    if (CONFIG_GPL && EXTERNAL_SSE4(flags)) {
        c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
    }
#endif
#endif /* HAVE_X86ASM */
}

124
externals/ffmpeg/libavcodec/x86/fmtconvert.asm vendored Executable file
View File

@@ -0,0 +1,124 @@
;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
; int len);
;------------------------------------------------------------------------------
; INT32_TO_FLOAT_FMUL_SCALAR nregs
; %1 = number of vector registers requested from cglobal (the non-SSE2 path
;      needs two extra temporaries, m3 and m4)
;
; dst[i] = (float)src[i] * mul. Every iteration converts 32 bytes of input
; (two full vectors); dst is written with aligned stores (mova).
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
; on UNIX64 the float argument is not among the integer arguments, so only
; dst, src and len are declared
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
; on WIN64 the third argument arrives in vector register 2; rename it to m0
SWAP 0, 2
%elif ARCH_X86_32
; on x86-32 the multiplier is fetched from its stack slot
movss m0, mulm
%endif
; broadcast the multiplier to all lanes of m0
SPLATD m0
; len -> byte count; point src/dst at the end and index with a negative
; offset counting up to zero
shl lend, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtdq2ps m1, [srcq+lenq ]
cvtdq2ps m2, [srcq+lenq+16]
%else
; plain SSE: convert two ints at a time, then merge the halves with movlhps
cvtpi2ps m1, [srcq+lenq ]
cvtpi2ps m3, [srcq+lenq+ 8]
cvtpi2ps m2, [srcq+lenq+16]
cvtpi2ps m4, [srcq+lenq+24]
movlhps m1, m3
movlhps m2, m4
%endif
mulps m1, m0
mulps m2, m0
mova [dstq+lenq ], m1
mova [dstq+lenq+16], m2
add lenq, 32
jl .loop
%if notcpuflag(sse2)
;; cvtpi2ps switches to MMX even if the source is a memory location
;; possibly an error in documentation since every tested CPU disagrees with
;; that. Use emms anyway since the vast majority of machines will use the
;; SSE2 variant
emms
%endif
RET
%endmacro
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
; const float *mul, int len);
;------------------------------------------------------------------------------
; Converts int32 samples to float in groups of eight, scaling each group by
; the next entry of mul[]: one multiplier is loaded and broadcast per loop
; iteration, and mulq advances by one float per 32 bytes of input.
; The context argument `c` is declared but not referenced in the body.
%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
; len -> byte count; point src/dst at the end and index with a negative
; offset counting up to zero
shl lend, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
; m0 = multiplier for this group of eight, broadcast to all lanes
movss m0, [mulq]
SPLATD m0
%if cpuflag(sse2)
cvtdq2ps m1, [srcq+lenq ]
cvtdq2ps m2, [srcq+lenq+16]
%else
; plain SSE: convert two ints at a time, then merge the halves with movlhps
cvtpi2ps m1, [srcq+lenq ]
cvtpi2ps m3, [srcq+lenq+ 8]
cvtpi2ps m2, [srcq+lenq+16]
cvtpi2ps m4, [srcq+lenq+24]
movlhps m1, m3
movlhps m2, m4
%endif
mulps m1, m0
mulps m2, m0
mova [dstq+lenq ], m1
mova [dstq+lenq+16], m2
add mulq, 4
add lenq, 32
jl .loop
%if notcpuflag(sse2)
;; cvtpi2ps switches to MMX even if the source is a memory location
;; possibly an error in documentation since every tested CPU disagrees with
;; that. Use emms anyway since the vast majority of machines will use the
;; SSE2 variant
emms
%endif
RET
%endmacro
INIT_XMM sse
INT32_TO_FLOAT_FMUL_ARRAY8
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_ARRAY8

View File

@@ -0,0 +1,55 @@
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"
#if HAVE_X86ASM
/* int32 -> float conversion with a single scale factor, SSE and SSE2. */
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
/* int32 -> float conversion with an array of scale factors, SSE and SSE2. */
void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
const float *mul, int len);
void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
const float *mul, int len);
#endif /* HAVE_X86ASM */
/**
 * Install the x86 SIMD int32-to-float converters into a FmtConvertContext.
 *
 * SSE is checked first and SSE2 second, so SSE2 takes precedence when both
 * are available. Without external x86 assembly the function does nothing.
 *
 * @param c     context whose function pointers may be replaced
 * @param avctx not used by this function
 */
av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_X86ASM
    const int flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(flags)) {
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
    }
    if (EXTERNAL_SSE2(flags)) {
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
    }
#endif /* HAVE_X86ASM */
}

106
externals/ffmpeg/libavcodec/x86/fpel.asm vendored Executable file
View File

@@ -0,0 +1,106 @@
;******************************************************************************
;* SIMD-optimized fullpel functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; PAVGB_MMX src, reg, tmp, mask
; Byte-wise average for CPUs without pavgb, computed as
;     (a | b) - (((a ^ b) & mask) >> 1)
; %1 = first operand (loaded with LOAD, so it may be a memory reference)
; %2 = second operand register; after the final SWAP the result carries
;      this name
; %3 = scratch register
; %4 = register holding the mask that drops each byte's low bit before the
;      64-bit shift, so no bit leaks into the neighbouring byte
%macro PAVGB_MMX 4
LOAD %3, %1
por %3, %2
pxor %2, %1
pand %2, %4
psrlq %2, 1
psubb %3, %2
SWAP %2, %3
%endmacro
; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
; ptrdiff_t line_size, int h)
; OP_PIXELS op, width
; %1 = put (plain copy) or avg (average with what is already in block)
; %2 = block width in pixels
;
; Four rows are handled per loop iteration. The row counter is decremented by
; 4 and the loop exits only when it reaches exactly zero, so h is expected to
; be a positive multiple of 4. Register use: r0 = block, r1 = pixels,
; r2 = line_size, r3 = h, r4 = 3 * line_size.
%macro OP_PIXELS 2
%if %2 == mmsize/2
; the block is half a register wide: use half-register moves, one pass per row
%define LOAD movh
%define SAVE movh
%define LEN mmsize
%else
; unaligned loads from the source, aligned stores to the destination
%define LOAD movu
%define SAVE mova
%define LEN %2
%endif
cglobal %1_pixels%2, 4,5,4
lea r4, [r2*3]
%ifidn %1, avg
%if notcpuflag(mmxext)
; m6 = 0xFE in every byte (all ones, then doubled per byte): mask for PAVGB_MMX
pcmpeqd m6, m6
paddb m6, m6
%endif
%endif
.loop:
%assign %%i 0
%rep LEN/mmsize
; load four consecutive source rows
LOAD m0, [r1 + %%i]
LOAD m1, [r1+r2 + %%i]
LOAD m2, [r1+r2*2 + %%i]
LOAD m3, [r1+r4 + %%i]
%ifidn %1, avg
%if notcpuflag(mmxext)
PAVGB_MMX [r0 + %%i], m0, m4, m6
PAVGB_MMX [r0+r2 + %%i], m1, m5, m6
PAVGB_MMX [r0+r2*2 + %%i], m2, m4, m6
PAVGB_MMX [r0+r4 + %%i], m3, m5, m6
%else
pavgb m0, [r0 + %%i]
pavgb m1, [r0+r2 + %%i]
pavgb m2, [r0+r2*2 + %%i]
pavgb m3, [r0+r4 + %%i]
%endif
%endif
SAVE [r0 + %%i], m0
SAVE [r0+r2 + %%i], m1
SAVE [r0+r2*2 + %%i], m2
SAVE [r0+r4 + %%i], m3
%assign %%i %%i+mmsize
%endrep
; the flags from this sub are consumed by jne below; lea leaves them intact
sub r3d, 4
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
INIT_MMX mmx
OP_PIXELS put, 4
OP_PIXELS avg, 4
OP_PIXELS put, 8
OP_PIXELS avg, 8
OP_PIXELS put, 16
OP_PIXELS avg, 16
; with mmxext only the avg variants differ (pavgb replaces PAVGB_MMX)
INIT_MMX mmxext
OP_PIXELS avg, 4
OP_PIXELS avg, 8
OP_PIXELS avg, 16
INIT_XMM sse2
OP_PIXELS put, 16
OP_PIXELS avg, 16

49
externals/ffmpeg/libavcodec/x86/fpel.h vendored Executable file
View File

@@ -0,0 +1,49 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FPEL_H
#define AVCODEC_X86_FPEL_H
#include <stddef.h>
#include <stdint.h>
void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#endif /* AVCODEC_X86_FPEL_H */

54
externals/ffmpeg/libavcodec/x86/g722dsp.asm vendored Executable file
View File

@@ -0,0 +1,54 @@
;******************************************************************************
;* SIMD optimized DSP functions for G722 coding
;*
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Filter coefficient tables, laid out to match the word interleaving done in
; g722_apply_qmf below. Tables 3 and 4 have a zero in every odd slot because
; their data registers hold each sample twice (punpck*wd of a register with
; itself), so the duplicate must not contribute to the pmaddwd sum.
pw_qmf_coeffs: dw 3, -210, -11, -805, -11, 951, 53, 3876
pw_qmf_coeffs2: dw 12, 3876, -156, 951, 32, -805, 362, -210
pw_qmf_coeffs3: dw 362, 0 , 32, 0, -156, 0, 12, 0
pw_qmf_coeffs4: dw 53, 0, -11, 0, -11, 0, 3, 0
SECTION .text
;------------------------------------------------------------------------------
; void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
;
; Reads three vectors of 16-bit samples from prev (unaligned loads),
; multiplies them by the coefficient tables above with pmaddwd, reduces the
; products to two 32-bit sums and stores those two dwords to out.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal g722_apply_qmf, 2, 2, 5, prev, out
movu m0, [prevq+mmsize*0]
movu m1, [prevq+mmsize*1]
movu m2, [prevq+mmsize*2]
; interleave vectors 0 and 1 word by word; duplicate the words of vector 2
punpcklwd m3, m0, m1
punpckhwd m0, m1
punpcklwd m4, m2, m2
punpckhwd m2, m2
; multiply by the coefficients and add adjacent products into dwords
pmaddwd m3, [pw_qmf_coeffs ]
pmaddwd m0, [pw_qmf_coeffs2]
pmaddwd m4, [pw_qmf_coeffs3]
pmaddwd m2, [pw_qmf_coeffs4]
; sum the four partial-product registers lane by lane
paddd m0, m3
paddd m2, m4
paddd m0, m2
; fold the upper two dwords onto the lower two
pshufd m2, m0, q0032
paddd m0, m2
; swap the two low dwords, then store them as the two results
pshufd m0, m0, q0001
movq [outq], m0
RET

View File

@@ -0,0 +1,35 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/g722dsp.h"
/* SSE2 QMF routine; the implementation lives outside this file. */
void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);

/**
 * Install the x86 SIMD routine into a G722DSPContext.
 *
 * apply_qmf is replaced by the SSE2 version when the CPU and the build
 * support it; otherwise the context is left unchanged.
 *
 * @param dsp context to update
 */
av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
{
    const int flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(flags)) {
        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
    }
}

View File

@@ -0,0 +1,189 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength
SECTION .text
; H263_LOOP_FILTER a, b, c, d, strength
; Filters one edge across four 8-byte lines of pixels.
;   %1..%4 = addresses of the four lines (a, b, c, d), each read with mova
;   %5     = 32-bit operand whose low byte is broadcast and used as the
;            clamping threshold (callers pass 2 * filter strength)
; Results (nothing is stored by the macro itself):
;   m3 = filtered line b (%2)      m4 = filtered line c (%3)
;   m5 = filtered line a (%1)      m6 = filtered line d (%4)
; Clobbers m0-m7. Reads the external constant pb_FC.
%macro H263_LOOP_FILTER 5
pxor m7, m7
; [m0,m1] = a - d as 16-bit words (low / high halves)
mova m0, [%1]
mova m1, [%1]
mova m2, [%4]
mova m3, [%4]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
; [m4,m5] = 4 * (c - b) + (a - d)
mova m2, [%2]
mova m3, [%2]
mova m4, [%3]
mova m5, [%3]
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
psubw m4, m2
psubw m5, m3
psllw m4, 2
psllw m5, 2
paddw m4, m0
paddw m5, m1
; split into sign mask ([m6,m7]) and magnitude ([m4,m5]), then magnitude >>= 3
pxor m6, m6
pcmpgtw m6, m4
pcmpgtw m7, m5
pxor m4, m6
pxor m5, m7
psubw m4, m6
psubw m5, m7
psrlw m4, 3
psrlw m5, 3
; m4 = |d| as bytes, m6 = per-byte sign mask of d
packuswb m4, m5
packsswb m6, m7
pxor m7, m7
; broadcast the low byte of %5 to all eight bytes of m2
movd m2, %5
punpcklbw m2, m2
punpcklbw m2, m2
punpcklbw m2, m2
; m2 = min(max(%5 - |d|, 0), |d|): magnitude of the correction for b and c
psubusb m2, m4
mova m3, m2
psubusb m3, m4
psubb m2, m3
; apply the correction with the sign of d: b moves one way, c the other.
; XOR with the sign mask before and after turns the saturating add/sub
; into the opposite operation for the bytes where d is negative.
mova m3, [%2]
mova m4, [%3]
pxor m3, m6
pxor m4, m6
paddusb m3, m2
psubusb m4, m2
pxor m3, m6
pxor m4, m6
; second correction for the outer lines: magnitude is
; min(|a - d| saturated to a signed byte, 2 * m2) >> 2, with the sign of (a - d)
paddusb m2, m2
packsswb m0, m1
pcmpgtb m7, m0
pxor m0, m7
psubb m0, m7
mova m1, m0
psubusb m0, m2
psubb m1, m0
; mask with pb_FC before the word shift so no bits leak between adjacent bytes
pand m1, [pb_FC]
psrlw m1, 2
pxor m1, m7
psubb m1, m7
; m5 = a - correction, m6 = d + correction
mova m5, [%1]
mova m6, [%4]
psubb m5, m1
paddb m6, m1
%endmacro
INIT_MMX mmx
; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
; Filters across a horizontal edge: the four lines src-2*stride, src-stride,
; src and src+stride are read and rewritten, 8 bytes each.
; r0 = src, r1 = stride, r2 = qscale; r3/r4 are scratch.
cglobal h263_v_loop_filter, 3,5
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
; r2 = 2 * h263_loop_filter_strength[qscale]
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
; r3 = src - stride, r4 = src - 2*stride
mov r3, r0
sub r3, r1
mov r4, r3
sub r4, r1
H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
; write the four filtered lines back in place
mova [r3], m3
mova [r0], m4
mova [r4], m5
mova [r0+r1], m6
RET
; TRANSPOSE4X4 src, dst
; Reads 4 bytes from each of four rows (%1, %1+r1, %1+r1*2, %1+r3) and writes
; the four resulting columns, 4 bytes each, to %2+0, %2+8, %2+16 and %2+24.
; Expects r1 = row stride and r3 = 3 * row stride (set up by the caller).
; Clobbers m0-m3.
%macro TRANSPOSE4X4 2
movd m0, [%1]
movd m1, [%1+r1]
movd m2, [%1+r1*2]
movd m3, [%1+r3]
; interleave bytes of rows 0/1 and rows 2/3, then words, to form the columns
punpcklbw m0, m1
punpcklbw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
; m0 = columns 0 and 1, m1 = columns 2 and 3 (one column per dword)
movd [%2+ 0], m0
punpckhdq m0, m0
movd [%2+ 8], m0
movd [%2+16], m1
punpckhdq m1, m1
movd [%2+24], m1
%endmacro
; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
; Filters across a vertical edge: for 8 rows, the 4 pixels src[-2..1] are
; transposed into four 8-byte columns in a 32-byte stack buffer, filtered with
; H263_LOOP_FILTER, transposed back and stored.
; r0 = src (then src-2), r1 = stride, r2 = qscale (then 2*strength),
; r3 = 3*stride, r4 = address of row 4.
INIT_MMX mmx
cglobal h263_h_loop_filter, 3,5,0,32
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
; r2 = 2 * h263_loop_filter_strength[qscale]
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
sub r0, 2
lea r3, [r1*3]
; rows 0-3 fill bytes 0-3 of each column buffer, rows 4-7 fill bytes 4-7
TRANSPOSE4X4 r0, rsp
lea r4, [r0+r1*4]
TRANSPOSE4X4 r4, rsp+4
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
; filtered columns: m5 = col 0, m3 = col 1, m4 = col 2, m6 = col 3.
; Interleave them back into rows of 4 bytes.
mova m1, m5
mova m0, m4
punpcklbw m5, m3
punpcklbw m4, m6
punpckhbw m1, m3
punpckhbw m0, m6
mova m3, m5
mova m6, m1
; m5 = rows 0,1   m3 = rows 2,3   m1 = rows 4,5   m6 = rows 6,7
punpcklwd m5, m4
punpcklwd m1, m0
punpckhwd m3, m4
punpckhwd m6, m0
movd [r0], m5
punpckhdq m5, m5
movd [r0+r1*1], m5
movd [r0+r1*2], m3
punpckhdq m3, m3
movd [r0+r3], m3
movd [r4], m1
punpckhdq m1, m1
movd [r4+r1*1], m1
movd [r4+r1*2], m6
punpckhdq m6, m6
movd [r4+r3], m6
RET

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h263dsp.h"
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);

/**
 * Install the x86 implementations of the H.263 loop filters.
 *
 * Both the horizontal and vertical filter pointers are replaced by their MMX
 * versions when the runtime CPU flags report external MMX support; otherwise
 * the context is left untouched.
 *
 * @param c context whose function pointers are (possibly) overridden
 */
av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
{
    const int flags = av_get_cpu_flags();

    if (!EXTERNAL_MMX(flags))
        return;

    c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
    c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
}

208
externals/ffmpeg/libavcodec/x86/h264_cabac.c vendored Executable file
View File

@@ -0,0 +1,208 @@
/*
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* H.264 / AVC / MPEG-4 part10 codec.
* non-SIMD x86-specific optimizations for H.264
* @author Michael Niedermayer <michaelni@gmx.at>
*/
#include <stddef.h>
#include "libavcodec/cabac.h"
#include "cabac.h"
#if HAVE_INLINE_ASM
/* Inline-asm constraint letter used for some operands below:
 * a register ("r") on x86-64, a memory operand ("m") otherwise. */
#if ARCH_X86_64
#define REG64 "r"
#else
#define REG64 "m"
#endif
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
#if HAVE_7REGS && !BROKEN_COMPILER
#define decode_significance decode_significance_x86
/**
 * Inline-asm scan of the significance / last-coefficient flags.
 *
 * Walks the context bytes from significant_coeff_ctx_base up to (but not
 * including) base + max_coeff - 1. For every position a bit is decoded with
 * BRANCHLESS_GET_CABAC (defined outside this file; presumably one CABAC
 * decision using the context byte at the given address - confirm in cabac.h).
 * When the bit is set, a second bit is decoded with the context at
 * (position + last_off), the position (offset from the context base) is
 * written to *index, and the scan stops if that second bit is set.
 * If the scan runs to the end, the final position is stored as well.
 *
 * Asm operand map:
 *   %0 coeff_count (scratch, then result)   %1 current context pointer
 *   %2 index (write cursor, in memory)      %3 c->low
 *   %4 bit                                  %5 c->range
 *   %6 c            %7 minusstart           %8 end
 *   %9 minusindex   %10 last_off
 *   %11/%12 offsets of bytestream / bytestream_end inside CABACContext
 *   %13 whatever TABLES_ARG expands to
 *
 * @return number of positions written to index[]
 */
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
int *index, x86_reg last_off){
void *end= significant_coeff_ctx_base + max_coeff - 1;
/* the addresses are kept negated so that a plain add inside the asm turns a
 * pointer into an offset from its start */
int minusstart= -(intptr_t)significant_coeff_ctx_base;
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
/* 3: top of the scan loop - decode the flag for the current position */
"3: \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"test $1, %4 \n\t"
" jz 4f \n\t"
/* flag set: decode a second bit using the context at +last_off */
"add %10, %1 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"sub %10, %1 \n\t"
/* *index = current context pointer - context base */
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"FF_REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
/* second bit set: this was the last entry, leave the loop */
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"FF_OPSIZE" $4, %2 \n\t"
/* 4: advance to the next position */
"4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
" jb 3b \n\t"
/* ran off the end: store the final position */
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"FF_REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
/* 5: count = (address of last written entry + 4 - index start) / 4 */
"5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
: "%"FF_REG_c, "memory"
);
return coeff_count;
}
#define decode_significance_8x8 decode_significance_8x8_x86
/**
 * Inline-asm scan of the significance / last-coefficient flags for 8x8 blocks.
 *
 * Positions 0..62 are visited. For each position a bit is decoded with
 * BRANCHLESS_GET_CABAC (defined outside this file; presumably one CABAC
 * decision on the given context byte - confirm in cabac.h) using the context
 * significant_coeff_ctx_base + sig_off[position]. When the bit is set, a second
 * bit is decoded using last_coeff_ctx_base plus a byte looked up in
 * ff_h264_cabac_tables at H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET + position;
 * the position is written to *index and the scan stops if that second bit is
 * set. If the scan runs to the end, the final position is stored as well.
 *
 * Asm operand map:
 *   %0 coeff_count (scratch, then result)   %1 last (current position)
 *   %2 index (write cursor)                 %3 c->low
 *   %4 bit            %5 c->range           %6 state (scratch)
 *   %7 c              %8 minusindex         %9 significant_coeff_ctx_base
 *   %10 sig_off       %11 last_coeff_ctx_base
 *   %12/%13 offsets of bytestream / bytestream_end inside CABACContext
 *   %14 H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET
 *   %15 whatever TABLES_ARG expands to
 *
 * @return number of positions written to index[]
 */
static int decode_significance_8x8_x86(CABACContext *c,
uint8_t *significant_coeff_ctx_base,
int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
/* kept negated (+4) so that adding the address of the last written entry
 * yields the number of bytes written */
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
x86_reg last=0;
x86_reg state;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
"mov %1, %6 \n\t"
/* 3: top of the scan loop - context = ctx_base + sig_off[position] */
"3: \n\t"
"mov %10, %0 \n\t"
"movzb (%0, %6), %6 \n\t"
"add %9, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
"mov %1, %6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"
/* flag set: look up the second context from the tables, indexed by position */
#ifdef BROKEN_RELOCATIONS
"movzb %c14(%15, %q6), %6\n\t"
#else
"movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif
"add %11, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
/* *index = position */
"mov %2, %0 \n\t"
"mov %1, %6 \n\t"
"mov %k6, (%0) \n\t"
/* second bit set: this was the last entry, leave the loop */
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"FF_OPSIZE" $4, %2 \n\t"
/* 4: advance to the next position, stop after position 62 */
"4: \n\t"
"add $1, %6 \n\t"
"mov %6, %1 \n\t"
"cmp $63, %6 \n\t"
" jb 3b \n\t"
/* ran off the end: store the final position */
"mov %2, %0 \n\t"
"mov %k6, (%0) \n\t"
/* 5: count = (address of last written entry + 4 - index start) / 4 */
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
REG64(sig_off), REG64(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
: "%"FF_REG_c, "memory"
);
return coeff_count;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */

View File

@@ -0,0 +1,663 @@
;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;* 2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; RV40 rounding constants. Each table has 16 entries of 4 identical words
; (8 bytes per entry) and is addressed as [table + rnd_bias*8], where the
; chroma MC code computes rnd_bias = ((my & 6) * 4 + mx) >> 1.
; rnd_rv40_2d_tbl is added before the >> 6 of the two-dimensional filter.
rnd_rv40_2d_tbl: times 4 dw 0
times 4 dw 16
times 4 dw 32
times 4 dw 16
times 4 dw 32
times 4 dw 28
times 4 dw 32
times 4 dw 28
times 4 dw 0
times 4 dw 32
times 4 dw 16
times 4 dw 32
times 4 dw 32
times 4 dw 28
times 4 dw 32
times 4 dw 28
; rnd_rv40_1d_tbl is added before the >> 3 of the one-dimensional filter.
rnd_rv40_1d_tbl: times 4 dw 0
times 4 dw 2
times 4 dw 4
times 4 dw 2
times 4 dw 4
times 4 dw 3
times 4 dw 4
times 4 dw 3
times 4 dw 0
times 4 dw 4
times 4 dw 2
times 4 dw 4
times 4 dw 4
times 4 dw 3
times 4 dw 4
times 4 dw 3
cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64
SECTION .text
; mv0_pixels_mc8
; Unfiltered 8-byte-wide copy (or average, depending on CHROMAMC_AVG) used
; when mx == 0 and my == 0. Handles four rows per iteration as two 2-row steps.
; Expects r0 = dst, r1 = src, r2 = stride, r3d = row count.
; The loop only terminates when the row count is a multiple of 4.
; Clobbers r4 (= 2*stride), mm0, mm1; advances r0 and r1.
%macro mv0_pixels_mc8 0
lea r4, [r2*2 ]
.next4rows:
; rows 0 and 1
movq mm0, [r1 ]
movq mm1, [r1+r2]
add r1, r4
CHROMAMC_AVG mm0, [r0 ]
CHROMAMC_AVG mm1, [r0+r2]
movq [r0 ], mm0
movq [r0+r2], mm1
add r0, r4
; rows 2 and 3
movq mm0, [r1 ]
movq mm1, [r1+r2]
add r1, r4
CHROMAMC_AVG mm0, [r0 ]
CHROMAMC_AVG mm1, [r0+r2]
movq [r0 ], mm0
movq [r0+r2], mm1
add r0, r4
sub r3d, 4
jne .next4rows
%endmacro
; chroma_mc8_mmx_func op, codec [, suffix]
;   %1 = put or avg, %2 = h264, vc1 or rv40, %3 = optional name suffix
; Emits the function %1_%2_chroma_mc8%3: 8-pixel-wide bilinear chroma motion
; compensation on 8-bit pixels using MMX registers.
; Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; Three code paths:
;   mx == 0 and my == 0 : plain copy/average (mv0_pixels_mc8)
;   exactly one non-zero: 1-D filter between src[i] and src[i+dxy], >> 3
;   both non-zero       : 2-D filter with weights A,B,C,D, >> 6
; For rv40 the rounding constant is looked up in rnd_rv40_*_tbl via rnd_bias;
; for h264/vc1 it is a fixed constant selected by rnd_1d_%2 / rnd_2d_%2.
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
; PIC builds address the rounding table through r8, so more GPRs are requested
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
; uint8_t *src /* align 1 */,
; ptrdiff_t stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
mv0_pixels_mc8
REP_RET
.at_least_one_non_zero:
%ifidn %2, rv40
; rnd_bias = ((my & 6) * 4 + mx) >> 1 selects the rounding table entry
%if ARCH_X86_64
mov r7, r5
and r7, 6 ; &~1 for mx/my=[0,7]
lea r7, [r7*4+r4]
sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
; r0 is reused for rnd_bias here; dst is reloaded into r5 from r0m further down
mov r0, r5
and r0, 6 ; &~1 for mx/my=[0,7]
lea r0, [r0*4+r4]
sar r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif
; r6 = dxy: byte distance to the second filter tap (1 = next pixel, stride = next row).
; The mov instructions sit between test and jump because mov leaves the flags untouched.
test r5d, r5d
mov r6, 1
je .my_is_zero
test r4d, r4d
mov r6, r2 ; dxy = x ? 1 : stride
jne .both_non_zero
.my_is_zero:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
%ifidn %2, rv40
%ifdef PIC
lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
%endif
%endif
movd m5, r4d
movq m4, [pw_8]
movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
punpcklwd m5, m5
punpckldq m5, m5 ; mm5 = B = x
pxor m7, m7
psubw m4, m5 ; mm4 = A = 8-x
.next1drow:
movq m0, [r1 ] ; mm0 = src[0..7]
movq m2, [r1+r6] ; mm2 = src[dxy..dxy+7] (next pixel or next row)
movq m1, m0
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
pmullw m1, m4
pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
pmullw m3, m5
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 3
psrlw m1, 3
packuswb m0, m1
CHROMAMC_AVG m0, [dest_reg]
movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
add dest_reg, r2
add r1, r2
dec r3d
jne .next1drow
REP_RET
.both_non_zero: ; general case, bilinear
movd m4, r4d ; x
movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
%endif
%endif
; not enough MMX registers for all four weights: A and D live in an aligned
; 16-byte stack slot, r6 remembers the original stack pointer
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, 16 ; AA and DD
punpcklwd m4, m4
punpcklwd m6, m6
punpckldq m4, m4 ; mm4 = x words
punpckldq m6, m6 ; mm6 = y words
movq m5, m4
pmullw m4, m6 ; mm4 = x * y
psllw m5, 3
psllw m6, 3
movq m7, m5
paddw m7, m6
movq [rsp+8], m4 ; DD = x * y
psubw m5, m4 ; mm5 = B = 8x - xy
psubw m6, m4 ; mm6 = C = 8y - xy
paddw m4, [pw_64]
psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
pxor m7, m7
movq [rsp ], m4
movq m0, [r1 ] ; mm0 = src[0..7]
movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow:
; on entry m0/m1 hold the current row (src[0..7] / src[1..8]); r1 moves to the next row
add r1, r2
movq m2, m0
movq m3, m1
punpckhbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
pmullw m0, [rsp]
pmullw m2, [rsp]
pmullw m1, m5
pmullw m3, m5
paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]
movq m0, [r1]
movq m1, m0
punpcklbw m0, m7
punpckhbw m1, m7
pmullw m0, m6
pmullw m1, m6
paddw m2, m0
paddw m3, m1 ; [mm2,mm3] += C * src[0..7]
movq m1, [r1+1]
movq m0, m1
movq m4, m1
punpcklbw m0, m7
punpckhbw m4, m7
pmullw m0, [rsp+8]
pmullw m4, [rsp+8]
paddw m2, m0
paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
; reload the new row so m0/m1 are ready for the next iteration
movq m0, [r1]
paddw m2, [rnd_2d_%2+rnd_bias*8]
paddw m3, [rnd_2d_%2+rnd_bias*8]
psrlw m2, 6
psrlw m3, 6
packuswb m2, m3
CHROMAMC_AVG m2, [dest_reg]
movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
add dest_reg, r2
dec r3d
jne .next2drow
mov rsp, r6 ; restore stack pointer
RET
%endmacro
; chroma_mc4_mmx_func op, codec
;   %1 = put or avg, %2 = h264 or rv40
; Emits %1_%2_chroma_mc4: 4-pixel-wide bilinear chroma MC on 8-bit pixels.
; Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; The filter is separable: each source row is first filtered horizontally
; ((8-x)*src[i] + x*src[i+1]); two consecutive filtered rows are then blended
; with weights (8-y) and y, rounded and shifted right by 6.
; The loop produces two rows per iteration and only terminates for even h.
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
pxor m7, m7
movd m2, r4d ; x
movd m3, r5d ; y
movq m4, [pw_8]
movq m5, [pw_8]
; broadcast x and y to all four words; m4 = 8-x, m5 = 8-y
punpcklwd m2, m2
punpcklwd m3, m3
punpcklwd m2, m2
punpcklwd m3, m3
psubw m4, m2
psubw m5, m3
%ifidn %2, rv40
%ifdef PIC
lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
; rnd_bias = ((my & 6) * 4 + mx) >> 1; my (r5) is no longer needed as a GPR
and r5, 6 ; &~1 for mx/my=[0,7]
lea r5, [r5*4+r4]
sar r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif
; m6 = horizontally filtered first row
movd m0, [r1 ]
movd m6, [r1+1]
add r1, r2
punpcklbw m0, m7
punpcklbw m6, m7
pmullw m0, m4
pmullw m6, m2
paddw m6, m0
.next2rows:
; first output row: previous filtered row in m6, new filtered row kept in m0
movd m0, [r1 ]
movd m1, [r1+1]
add r1, r2
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
pmullw m1, m2
paddw m1, m0
movq m0, m1
pmullw m6, m5
pmullw m1, m3
paddw m6, [rnd_2d_%2+rnd_bias*8]
paddw m1, m6
psrlw m1, 6
packuswb m1, m1
CHROMAMC_AVG4 m1, m6, [r0]
movd [r0], m1
add r0, r2
; second output row: roles swapped, previous row in m0, new row kept in m6
movd m6, [r1 ]
movd m1, [r1+1]
add r1, r2
punpcklbw m6, m7
punpcklbw m1, m7
pmullw m6, m4
pmullw m1, m2
paddw m1, m6
movq m6, m1
pmullw m0, m5
pmullw m1, m3
paddw m0, [rnd_2d_%2+rnd_bias*8]
paddw m1, m0
psrlw m1, 6
packuswb m1, m1
CHROMAMC_AVG4 m1, m0, [r0]
movd [r0], m1
add r0, r2
sub r3d, 2
jnz .next2rows
REP_RET
%endmacro
; chroma_mc2_mmx_func op, codec
;   %1 = put or avg, %2 = codec name (selects rnd_2d_%2)
; Emits %1_%2_chroma_mc2: 2-pixel-wide bilinear chroma MC on 8-bit pixels.
; Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; The four bilinear weights are built with integer arithmetic as two packed
; word pairs {A,B} and {C,D}, so one pmaddwd per source row does the whole
; horizontal filter for both output pixels. Uses pshufw.
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
; r4d = x<<16 | (8-x)
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
add r4d, 8
imul r5d, r4d ; x*y<<16 | y*(8-x)
shl r4d, 3
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
movd m5, r4d
movd m6, r5d
punpckldq m5, m5 ; mm5 = {A,B,A,B}
punpckldq m6, m6 ; mm6 = {C,D,C,D}
pxor m7, m7
movd m2, [r1]
punpcklbw m2, m7
pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2]
.nextrow:
; m2 holds the current row as words [0,1,1,2]; r1 moves to the next row
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
movd m0, [r1]
punpcklbw m0, m7
pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
movq m2, m0
pmaddwd m0, m6
paddw m1, [rnd_2d_%2]
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
psrlw m1, 6
; narrow the two dword results to words, then to bytes
packssdw m1, m7
packuswb m1, m7
CHROMAMC_AVG4 m1, m3, [r0]
; only two bytes are written per row
movd r5d, m1
mov [r0], r5w
add r0, r2
sub r3d, 1
jnz .nextrow
REP_RET
%endmacro
; Rounding constants selected by the rnd_1d_%2 / rnd_2d_%2 token pasting in
; the chroma MC macros (the rv40 variants are defined inside those macros).
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1 pw_3
%define rnd_2d_vc1 pw_28
; NOTHING: no-op stand-in for CHROMAMC_AVG / CHROMAMC_AVG4 in the "put" variants.
%macro NOTHING 2-3
%endmacro
; DIRECT_AVG dst, src: average %1 with %2 via PAVGB (defined outside this
; file; presumably resolves to the byte-average instruction of the active
; instruction set - confirm in x86util.asm).
%macro DIRECT_AVG 2
PAVGB %1, %2
%endmacro
; COPY_AVG dst, tmp, mem: load 4 bytes from %3 into %2, then average into %1.
%macro COPY_AVG 3
movd %2, %3
PAVGB %1, %2
%endmacro
; --- Instantiations of the MMX-register chroma MC functions ---
; "put" variants: the averaging hooks expand to nothing.
INIT_MMX mmx
%define CHROMAMC_AVG NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1, _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40
; chroma_mc2 uses pshufw, so it is only built from mmxext upwards
INIT_MMX mmxext
chroma_mc2_mmx_func put, h264
; "avg" variants: the result is averaged with the existing destination pixels.
%define CHROMAMC_AVG DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264
; the same "avg" variants (except mc2) built again for 3DNow!
INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
; chroma_mc8_ssse3_func op, codec [, suffix]
;   %1 = put or avg, %2 = h264 or vc1 (selects rnd_1d_%2 / rnd_2d_%2), %3 = suffix
; Emits %1_%2_chroma_mc8%3: 8-pixel-wide bilinear chroma MC on 8-bit pixels
; using XMM registers and pmaddubsw.
; Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
; Each pair of filter weights is packed as two bytes (second tap in the high
; byte, first tap in the low byte) and multiplied against byte-interleaved
; source pixels. All three filter loops produce two rows per iteration.
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
mv0_pixels_mc8
REP_RET
.at_least_one_non_zero:
test r5d, r5d
je .my_is_zero
test r4d, r4d
je .mx_is_zero
; general case, bilinear
mov r6d, r4d
shl r4d, 8
sub r4, r6
mov r6, 8
add r4, 8 ; x*255+8 = x<<8 | (8-x)
sub r6d, r5d
imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
; m7 = weights for the upper row, m6 = weights for the lower row, m5 = rounding
movd m7, r6d
movd m6, r4d
movdqa m5, [rnd_2d_%2]
movq m0, [r1 ]
movq m1, [r1+1]
pshuflw m7, m7, 0
pshuflw m6, m6, 0
; m0 = first row with src[i] and src[i+1] interleaved bytewise
punpcklbw m0, m1
movlhps m7, m7
movlhps m6, m6
.next2rows:
; m0 = interleaved row n (carried over); load rows n+1 and n+2
movq m1, [r1+r2*1 ]
movq m2, [r1+r2*1+1]
movq m3, [r1+r2*2 ]
movq m4, [r1+r2*2+1]
lea r1, [r1+r2*2]
punpcklbw m1, m2
movdqa m2, m1
punpcklbw m3, m4
movdqa m4, m3
; output row n   = row n   * upper weights + row n+1 * lower weights
; output row n+1 = row n+1 * upper weights + row n+2 * lower weights
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
; keep the interleaved row n+2 for the next iteration
movdqa m0, m4
psrlw m3, 6
%ifidn %1, avg
movq m2, [r0 ]
movhps m2, [r0+r2]
%endif
packuswb m1, m3
CHROMAMC_AVG m1, m2
movq [r0 ], m1
movhps [r0+r2], m1
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET
.my_is_zero:
; horizontal-only filter
mov r5d, r4d
shl r4d, 8
add r4, 8
sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
movd m7, r4d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7
.next2xrows:
movq m0, [r1 ]
movq m1, [r1 +1]
movq m2, [r1+r2 ]
movq m3, [r1+r2+1]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
jg .next2xrows
REP_RET
.mx_is_zero:
; vertical-only filter
mov r4d, r5d
shl r5d, 8
add r5, 8
sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
movd m7, r5d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7
.next2yrows:
; interleave each row with the row below it
movq m0, [r1 ]
movq m1, [r1+r2 ]
movdqa m2, m1
movq m3, [r1+r2*2]
lea r1, [r1+r2*2]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2yrows
REP_RET
%endmacro
; chroma_mc4_ssse3_func op, codec
;   %1 = put or avg, %2 = codec name (used in the function name only; the
;   rounding constant is always pw_32)
; Emits %1_%2_chroma_mc4: 4-pixel-wide bilinear chroma MC on 8-bit pixels
; using MMX registers with pmaddubsw. Two rows are produced per iteration.
; Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my.
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
mov r6, r4
shl r4d, 8
sub r4d, r6d
mov r6, 8
add r4d, 8 ; x*255+8
sub r6d, r5d
imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
; m7 = weights for the upper row, m6 = weights for the lower row, m5 = rounding
movd m7, r6d
movd m6, r4d
movq m5, [pw_32]
movd m0, [r1 ]
pshufw m7, m7, 0
; m0 = first row with src[i] and src[i+1] interleaved bytewise
punpcklbw m0, [r1+1]
pshufw m6, m6, 0
.next2rows:
; m0 = interleaved row n (carried over); load rows n+1 and n+2
movd m1, [r1+r2*1 ]
movd m3, [r1+r2*2 ]
punpcklbw m1, [r1+r2*1+1]
punpcklbw m3, [r1+r2*2+1]
lea r1, [r1+r2*2]
movq m2, m1
movq m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
; keep the interleaved row n+2 for the next iteration
movq m0, m4
psrlw m3, 6
packuswb m1, m1
packuswb m3, m3
CHROMAMC_AVG m1, [r0 ]
CHROMAMC_AVG m3, [r0+r2]
movd [r0 ], m1
movd [r0+r2], m3
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET
%endmacro
; --- Instantiations of the SSSE3 chroma MC functions ---
; "put" variants: the averaging hook expands to nothing.
; mc8 is built with XMM registers, mc4 with MMX registers.
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264
; "avg" variants: the result is averaged with the existing destination pixels.
%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264

View File

@@ -0,0 +1,269 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64
SECTION .text
; MV0_PIXELS_MC8
; Unfiltered copy (or average, depending on CHROMAMC_AVG) of one full SIMD
; register per row, four rows per iteration; used when mx == 0 and my == 0.
; Expects r0 = dst, r1 = src, r2 = stride, r3d = row count.
; Source rows are read unaligned (movu), destination rows written aligned (mova).
; The loop only terminates when the row count is a multiple of 4.
; Clobbers r4 (= 3*stride), r5 (= 4*stride), m0, m1; advances r0 and r1.
%macro MV0_PIXELS_MC8 0
lea r4, [r2*3 ]
lea r5, [r2*4 ]
.next4rows:
movu m0, [r1 ]
movu m1, [r1+r2 ]
CHROMAMC_AVG m0, [r0 ]
CHROMAMC_AVG m1, [r0+r2 ]
mova [r0 ], m0
mova [r0+r2 ], m1
movu m0, [r1+r2*2]
movu m1, [r1+r4 ]
CHROMAMC_AVG m0, [r0+r2*2]
CHROMAMC_AVG m1, [r0+r4 ]
mova [r0+r2*2], m0
mova [r0+r4 ], m1
add r1, r5
add r0, r5
sub r3d, 4
jne .next4rows
%endmacro
;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, ptrdiff_t stride,
; int h, int mx, int my)
;-----------------------------------------------------------------------------
; CHROMA_MC8 op   (%1 = put or avg)
; Emits %1_h264_chroma_mc8_10: 8-pixel-wide bilinear chroma MC where every
; pixel is a 16-bit word, so a whole row fits one XMM register and the
; horizontal neighbour is 2 bytes away.
; Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h, r4 = mx, r5 = my,
; r6 = byte distance to the second tap in the 1-D case.
%macro CHROMA_MC8 1
cglobal %1_h264_chroma_mc8_10, 6,7,8
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
MV0_PIXELS_MC8
REP_RET
.at_least_one_non_zero:
; r6 = 2 (next pixel, in bytes) when my == 0, otherwise stride (next row)
mov r6d, 2
test r5d, r5d
je .x_interpolation
mov r6, r2 ; dxy = x ? 1 : stride
test r4d, r4d
jne .xy_interpolation
.x_interpolation:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
movd m5, r4d
mova m4, [pw_8]
mova m6, [pw_4] ; mm6 = rnd >> 3
SPLATW m5, m5 ; mm5 = B = x
psubw m4, m5 ; mm4 = A = 8-x
.next1drow:
movu m0, [r1 ] ; mm0 = src[0..7]
movu m2, [r1+r6] ; mm2 = src[1..8]
pmullw m0, m4 ; mm0 = A * src[0..7]
pmullw m2, m5 ; mm2 = B * src[1..8]
paddw m0, m6
paddw m0, m2
psrlw m0, 3
CHROMAMC_AVG m0, [r0]
mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
add r0, r2
add r1, r2
dec r3d
jne .next1drow
REP_RET
.xy_interpolation: ; general case, bilinear
movd m4, r4m ; x
movd m6, r5m ; y
SPLATW m4, m4 ; mm4 = x words
SPLATW m6, m6 ; mm6 = y words
psllw m5, m4, 3 ; mm5 = 8x
pmullw m4, m6 ; mm4 = x * y
psllw m6, 3 ; mm6 = 8y
paddw m1, m5, m6 ; mm1 = 8x+8y
mova m7, m4 ; DD = x * y
psubw m5, m4 ; mm5 = B = 8x - xy
psubw m6, m4 ; mm6 = C = 8y - xy
paddw m4, [pw_64]
psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64
movu m0, [r1 ] ; mm0 = src[0..7]
movu m1, [r1+2] ; mm1 = src[1..8]
.next2drow:
; on entry m0/m1 hold the current row; they are reloaded with the next row below
add r1, r2
pmullw m2, m0, m4
pmullw m1, m5
paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8]
movu m0, [r1]
movu m1, [r1+2]
pmullw m3, m0, m6
paddw m2, m3 ; mm2 += C * src[0..7+stride]
pmullw m3, m1, m7
paddw m2, m3 ; mm2 += D * src[1..8+stride]
paddw m2, [pw_32]
psrlw m2, 6
CHROMAMC_AVG m2, [r0]
mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6
add r0, r2
dec r3d
jne .next2drow
REP_RET
%endmacro
;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, ptrdiff_t stride,
; int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
; MC4_OP cur, prev
; Produces one output row of the 4-pixel-wide 16-bit chroma filter.
;   %1 = register that receives the horizontally filtered new source row
;        ((8-x)*src[i] + x*src[i+1]); kept for the next call
;   %2 = register holding the horizontally filtered previous row (destroyed)
; Expects m2 = x words, m3 = y words, m4 = 8-x, m5 = 8-y,
; r0 = dst, r1 = src, r2 = stride. Advances r0 and r1 by one row; clobbers m1.
%macro MC4_OP 2
movq %1, [r1 ]
movq m1, [r1+2]
add r1, r2
pmullw %1, m4
pmullw m1, m2
paddw m1, %1
mova %1, m1
; vertical blend: (prev * (8-y) + 32 + cur * y) >> 6
pmullw %2, m5
pmullw m1, m3
paddw %2, [pw_32]
paddw m1, %2
psrlw m1, 6
CHROMAMC_AVG m1, %2, [r0]
movq [r0], m1
add r0, r2
%endmacro
; CHROMA_MC4 op   (%1 = put or avg)
; Emits %1_h264_chroma_mc4_10: 4-pixel-wide bilinear chroma MC on 16-bit
; pixels. Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h;
; mx and my are read through r4m / r5m.
; The loop produces two rows per iteration and only terminates for even h.
%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
movd m2, r4m ; x
movd m3, r5m ; y
mova m4, [pw_8]
mova m5, m4
SPLATW m2, m2
SPLATW m3, m3
; m4 = 8-x, m5 = 8-y
psubw m4, m2
psubw m5, m3
; m6 = horizontally filtered first row
movq m0, [r1 ]
movq m6, [r1+2]
add r1, r2
pmullw m0, m4
pmullw m6, m2
paddw m6, m0
.next2rows:
; m0 and m6 alternate between "new row" and "previous row"
MC4_OP m0, m6
MC4_OP m6, m0
sub r3d, 2
jnz .next2rows
REP_RET
%endmacro
;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, ptrdiff_t stride,
; int h, int mx, int my)
;-----------------------------------------------------------------------------
; CHROMA_MC2 op   (%1 = put or avg)
; Emits %1_h264_chroma_mc2_10: 2-pixel-wide bilinear chroma MC on 16-bit
; pixels. The four weights are built with integer arithmetic as two packed
; word pairs {A,B} and {C,D}; one pmaddwd per source row then filters both
; output pixels. Source words are read directly through pshufw.
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
; r4d = x<<16 | (8-x)
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
add r4d, 8
imul r5d, r4d ; x*y<<16 | y*(8-x)
shl r4d, 3
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
movd m5, r4d
movd m6, r5d
punpckldq m5, m5 ; mm5 = {A,B,A,B}
punpckldq m6, m6 ; mm6 = {C,D,C,D}
pxor m7, m7
pshufw m2, [r1], 0x94 ; mm2 = src[0,1,1,2]
.nextrow:
; m2 holds the current row as words [0,1,1,2]; r1 moves to the next row
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
movq m2, m0
pmaddwd m0, m6
paddw m1, [pw_32]
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
psrlw m1, 6
; narrow the two dword results to two 16-bit pixels
packssdw m1, m7
CHROMAMC_AVG m1, m3, [r0]
movd [r0], m1
add r0, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
; NOTHING: no-op stand-in for CHROMAMC_AVG in the "put" variants.
%macro NOTHING 2-3
%endmacro
; AVG dst, src            : word-wise average of %1 with %2
; AVG dst, tmp, mem       : load 8 bytes from %3 into %2 first, then average
%macro AVG 2-3
%if %0==3
movq %2, %3
%endif
pavgw %1, %2
%endmacro
; --- Instantiations of the 16-bit-pixel chroma MC functions ---
; "put" variants: the averaging hook expands to nothing.
; mc8 is built for SSE2 (and AVX when enabled), mc4/mc2 for MMXEXT.
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put
; "avg" variants: the result is averaged with the existing destination pixels.
%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg

1420
externals/ffmpeg/libavcodec/x86/h264_deblock.asm vendored Executable file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1199
externals/ffmpeg/libavcodec/x86/h264_idct.asm vendored Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,657 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
cextern pw_1023
%define pw_pixel_max pw_1023
cextern pd_32
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; STORE_DIFFx2 res0, res1, tmp, min, dst, stride
; Adds two rows of four residuals to the destination and stores them back.
;   arg 1, arg 2 : registers holding dword residuals for row 0 and row 1
;                  (arg 1 receives the packed result and is clobbered)
;   arg 3        : scratch register used to load the two destination rows
;   arg 4        : lower clip bound (callers pass a zeroed register)
;   arg 5        : destination pointer
;   arg 6        : stride in bytes
; Residuals are shifted right by 6, packed to words, added with signed
; saturation and clipped against [arg 4, pw_pixel_max].
%macro STORE_DIFFx2 6
psrad %1, 6
psrad %2, 6
packssdw %1, %2 ; low half = row 0, high half = row 1
movq %3, [%5] ; row 0 of dst into the low half
movhps %3, [%5+%6] ; row 1 of dst into the high half
paddsw %1, %3
CLIPW %1, %4, [pw_pixel_max]
movq [%5], %1
movhps [%5+%6], %1
%endmacro
; STORE_DIFF16 res_lo, res_hi, min, max, dst
; Adds one full register of residuals to a single destination row.
;   arg 1, arg 2 : registers holding the dword residuals of the row's two
;                  halves (arg 1 receives the packed result and is clobbered)
;   arg 3, arg 4 : lower / upper clip bounds
;   arg 5        : destination address expression (accessed with mova, so it
;                  presumably must be register-size aligned -- confirm)
%macro STORE_DIFF16 5
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddsw %1, [%5]
CLIPW %1, %3, %4
mova [%5], %1
%endmacro
; IDCT4_ADD_10 dst, in, stride
; Inverse-transforms one 4x4 block of 32-bit coefficients read from `in`
; (four 16-byte rows), clears the coefficient block, and adds the result to
; the 4x4 pixel block at `dst`. `dst` is advanced by two rows as a side effect.
; Clobbers m0-m5.
;dst, in, stride
%macro IDCT4_ADD_10 3
mova m0, [%2+ 0]
mova m1, [%2+16]
mova m2, [%2+32]
mova m3, [%2+48]
IDCT4_1D d,0,1,2,3,4,5 ; first 1-D pass
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32] ; rounding bias for the final >>6 done in STORE_DIFFx2
IDCT4_1D d,0,1,2,3,4,5 ; second 1-D pass
pxor m5, m5
; zero the coefficient block; m5 also serves as the lower clip bound below
mova [%2+ 0], m5
mova [%2+16], m5
mova [%2+32], m5
mova [%2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro
; IDCT_ADD_10: emits h264_idct_add_10 for the currently selected instruction
; set. r0 = dst, r1 = block, r2 = stride (sign-extended from 32 bits where
; needed by movsxdifnidn).
%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
movsxdifnidn r2, r2d
IDCT4_ADD_10 r0, r1, r2
RET
%endmacro
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; ADD4x4IDCT: emits the internal helper add4x4_idct<SUFFIX>, reached with
; `call` from the add16 / add16intra / add8 functions (plain `ret`, no cglobal
; prologue). Register contract as used by the callers in this file:
;   r0 = dst base, r5 = byte offset of the block inside dst (r5 is turned
;   into the absolute pointer and advanced), r2 = coefficient block,
;   r3 = stride. Clobbers m0-m5.
; The body mirrors IDCT4_ADD_10 with these fixed registers.
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
add r5, r0 ; r5 = dst + block offset
mova m0, [r2+ 0]
mova m1, [r2+16]
mova m2, [r2+32]
mova m3, [r2+48]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32] ; rounding bias for the >>6 in STORE_DIFFx2
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
; clear the coefficients; m5 doubles as the lower clip bound
mova [r2+ 0], m5
mova [r2+16], m5
mova [r2+32], m5
mova [r2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, r5, r3
lea r5, [r5+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, r5, r3
ret
%endmacro
; Emit one helper per instruction set, each aligned to 16 bytes.
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif
; ADD16_OP index, nnzc_offset
; Handles one 4x4 block of h264_idct_add16_10: if the byte at
; nnzc[nnzc_offset] (r4) is non-zero, loads block_offset[index] (r1) into r5d
; and calls add4x4_idct. The coefficient pointer r2 is advanced by 64 bytes
; after every block except the last one (index 15).
%macro ADD16_OP 2
cmp byte [r4+%2], 0
jz .skipblock%1
mov r5d, [r1+%1*4]
call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
add r2, 64
%endif
%endmacro
; IDCT_ADD16_10: emits h264_idct_add16_10.
; r0 = dst, r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc; r5 is
; scratch for the per-block offset. The second argument of each ADD16_OP is
; the index into nnzc tested for that block.
%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
movsxdifnidn r3, r3d
ADD16_OP 0, 4+1*8
ADD16_OP 1, 5+1*8
ADD16_OP 2, 4+2*8
ADD16_OP 3, 5+2*8
ADD16_OP 4, 6+1*8
ADD16_OP 5, 7+1*8
ADD16_OP 6, 6+2*8
ADD16_OP 7, 7+2*8
ADD16_OP 8, 4+3*8
ADD16_OP 9, 5+3*8
ADD16_OP 10, 4+4*8
ADD16_OP 11, 5+4*8
ADD16_OP 12, 6+3*8
ADD16_OP 13, 7+3*8
ADD16_OP 14, 6+4*8
ADD16_OP 15, 7+4*8
REP_RET
%endmacro
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; IDCT_DC_ADD_OP_10 dst, stride, stride3
; Adds the word vector in m0 to four consecutive destination rows and clips
; the result to [0, m6].
;   arg 1 : destination pointer
;   arg 2 : stride in bytes
;   arg 3 : register holding 3*stride (callers compute it with lea)
; Expects m0 = value to add (callers broadcast the DC term into it) and
; m6 = upper clip bound. Clobbers m1-m5 (m5 is zeroed as the lower bound).
%macro IDCT_DC_ADD_OP_10 3
pxor m5, m5
%if avx_enabled
; three-operand forms fold the load and the add into one instruction
paddw m1, m0, [%1+0 ]
paddw m2, m0, [%1+%2 ]
paddw m3, m0, [%1+%2*2]
paddw m4, m0, [%1+%3 ]
%else
mova m1, [%1+0 ]
mova m2, [%1+%2 ]
mova m3, [%1+%2*2]
mova m4, [%1+%3 ]
paddw m1, m0
paddw m2, m0
paddw m3, m0
paddw m4, m0
%endif
CLIPW m1, m5, m6
CLIPW m2, m5, m6
CLIPW m3, m5, m6
CLIPW m4, m5, m6
mova [%1+0 ], m1
mova [%1+%2 ], m2
mova [%1+%2*2], m3
mova [%1+%3 ], m4
%endmacro
; h264_idct_dc_add_10 (MMXEXT): DC-only add for a 4x4 block.
; r0 = dst, r1 = block, r2 = stride. Reads the 32-bit DC coefficient, clears
; it in the block, computes (dc + 32) >> 6 and adds that value to four rows of
; four pixels. r1 is reused for 3*stride once the coefficient has been read.
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
movsxdifnidn r2, r2d
movd m0, [r1]
mov dword [r1], 0 ; consume the coefficient
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3] ; r1 = 3*stride (block pointer no longer needed)
pshufw m0, m0, 0 ; broadcast the low word to all four lanes
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; IDCT8_DC_ADD: emits h264_idct8_dc_add_10, the DC-only add for an 8x8 block.
; r0 = dst, r1 = block, r2 = stride. Same arithmetic as the 4x4 version, but
; the DC value is broadcast across a full XMM register and
; IDCT_DC_ADD_OP_10 is applied twice (rows 0-3, then rows 4-7).
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
movsxdifnidn r2, r2d
movd m0, [r1]
mov dword[r1], 0 ; consume the coefficient
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3] ; r1 = 3*stride (block pointer no longer needed)
SPLATW m0, m0, 0
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
lea r0, [r0+r2*4] ; move down four rows
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
%endmacro
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; AC index
; Out-of-line slow path for the block pair (index, index+1), entered from
; ADD16_OP_INTRA via the .ac<index> label. Runs the full 4x4 IDCT on both
; blocks, advancing the coefficient pointer r2 by 64 bytes after each, and
; then jumps back to .skipadd<index> in the main sequence.
%macro AC 1
.ac%1:
mov r5d, [r1+(%1+0)*4]
call add4x4_idct %+ SUFFIX
mov r5d, [r1+(%1+1)*4]
add r2, 64
call add4x4_idct %+ SUFFIX
add r2, 64
jmp .skipadd%1
%endmacro
; last_block is consulted by ADD16_OP_INTRA at expansion time; it is
; reassigned before the add8 / add8_422 functions further down.
%assign last_block 16
; ADD16_OP_INTRA index, nnzc_offset
; Handles the block pair (index, index+1):
;   * if either of the two nnzc bytes at nnzc_offset is non-zero, branch to
;     the .ac<index> slow path (emitted by the AC macro);
;   * otherwise, if either block's first coefficient dword is non-zero, call
;     idct_dc_add for the pair;
;   * otherwise do nothing.
; On the inline path r2 is advanced by 128 bytes (two blocks) unless this is
; the last pair (index >= last_block-2). The slow path does its own pointer
; advance and re-enters at .skipadd<index>.
%macro ADD16_OP_INTRA 2
cmp word [r4+%2], 0 ; tests two adjacent nnzc bytes at once
jnz .ac%1
mov r5d, [r2+ 0]
or r5d, [r2+64] ; first dword of this block and of the next one
jz .skipblock%1
mov r5d, [r1+(%1+0)*4]
call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
add r2, 128
%endif
.skipadd%1:
%endmacro
; IDCT_ADD16INTRA_10: emits the internal helper idct_dc_add<SUFFIX> followed
; by the public function h264_idct_add16intra_10.
;
; idct_dc_add: DC-only add for two 4x4 blocks whose coefficients sit at r2
; and r2+64. r0 = dst base, r5 = block offset, r3 = stride; clobbers r6 and
; m0-m6. The two DC values end up broadcast into the low and high halves of
; m0 respectively, so one IDCT_DC_ADD_OP_10 covers both blocks.
;
; h264_idct_add16intra_10: r0 = dst, r1 = block_offset, r2 = block,
; r3 = stride, r4 = nnzc. The AC slow paths are placed after the return so
; the common path falls straight through.
%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
add r5, r0 ; r5 = dst + block offset
movq m0, [r2+ 0]
movhps m0, [r2+64]
mov dword [r2+ 0], 0 ; consume both DC coefficients
mov dword [r2+64], 0
paddd m0, [pd_32]
psrad m0, 6
pshufhw m0, m0, 0 ; broadcast second block's DC across the high half
pshuflw m0, m0, 0 ; broadcast first block's DC across the low half
lea r6, [r3*3]
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r5, r3, r6
ret
cglobal h264_idct_add16intra_10,5,7,8
movsxdifnidn r3, r3d
ADD16_OP_INTRA 0, 4+1*8
ADD16_OP_INTRA 2, 4+2*8
ADD16_OP_INTRA 4, 6+1*8
ADD16_OP_INTRA 6, 6+2*8
ADD16_OP_INTRA 8, 4+3*8
ADD16_OP_INTRA 10, 4+4*8
ADD16_OP_INTRA 12, 6+3*8
ADD16_OP_INTRA 14, 6+4*8
REP_RET
; out-of-line slow paths, one per block pair
AC 8
AC 10
AC 12
AC 14
AC 0
AC 2
AC 4
AC 6
%endmacro
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif
; The last pair handled by h264_idct_add8_10 is index 34, so ADD16_OP_INTRA
; must not advance r2 after it.
%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; IDCT_ADD8: emits h264_idct_add8_10. dst is an array of plane pointers: the
; first pointer is used for block pairs 16/18 and the second one
; (dst[1], at +gprsize) for pairs 32/34. On x86-64 the array address is kept
; in r7; on x86-32 it is reloaded from the stack argument instead.
; The coefficient pointer first skips 1024 bytes (16 blocks of 64 bytes --
; presumably the luma coefficients, confirm against caller).
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
movsxdifnidn r3, r3d
%if ARCH_X86_64
mov r7, r0 ; keep the dst pointer array; r0 is overwritten below
%endif
add r2, 1024
mov r0, [r0] ; r0 = dst[0]
ADD16_OP_INTRA 16, 4+ 6*8
ADD16_OP_INTRA 18, 4+ 7*8
add r2, 1024-128*2 ; jump to the coefficients of block 32
%if ARCH_X86_64
mov r0, [r7+gprsize] ; r0 = dst[1]
%else
mov r0, r0m
mov r0, [r0+gprsize] ; r0 = dst[1]
%endif
ADD16_OP_INTRA 32, 4+11*8
ADD16_OP_INTRA 34, 4+12*8
REP_RET
; out-of-line slow paths, one per block pair
AC 16
AC 18
AC 32
AC 34
%endmacro ; IDCT_ADD8
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; The last pair handled by h264_idct_add8_422_10 is index 42, so
; ADD16_OP_INTRA must not advance r2 after it.
%assign last_block 44
; IDCT_ADD8_422: emits h264_idct_add8_422_10. Same structure as IDCT_ADD8 but
; with four block pairs per plane (16/18/24/26 for dst[0] and 32/34/40/42 for
; dst[1]). On x86-64 the dst pointer array is kept in r7; on x86-32 it is
; reloaded from the stack argument.
%macro IDCT_ADD8_422 0
cglobal h264_idct_add8_422_10, 5, 8, 7
movsxdifnidn r3, r3d
%if ARCH_X86_64
mov r7, r0 ; keep the dst pointer array; r0 is overwritten below
%endif
add r2, 1024
mov r0, [r0] ; r0 = dst[0]
ADD16_OP_INTRA 16, 4+ 6*8
ADD16_OP_INTRA 18, 4+ 7*8
ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
add r2, 1024-128*4 ; jump to the coefficients of block 32
%if ARCH_X86_64
mov r0, [r7+gprsize] ; r0 = dst[1]
%else
mov r0, r0m
mov r0, [r0+gprsize] ; r0 = dst[1]
%endif
ADD16_OP_INTRA 32, 4+11*8
ADD16_OP_INTRA 34, 4+12*8
ADD16_OP_INTRA 40, 4+13*8 ; i+4
ADD16_OP_INTRA 42, 4+14*8 ; i+4
REP_RET
; out-of-line slow paths, one per block pair
AC 16
AC 18
AC 24 ; i+4
AC 26 ; i+4
AC 32
AC 34
AC 40 ; i+4
AC 42 ; i+4
%endmacro
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT_ADD8_422
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8_422
%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; IDCT8_1D row0, row4
; One 1-D pass of the 8-point inverse transform on dword lanes.
;   arg 1, arg 2 : operands holding the two inputs that are loaded late into
;                  m2 and m5 (IDCT8_1D_FULL passes rows 0 and 4)
; The other six inputs must already be in m1, m2, m3, m5, m6 and m7 (see
; IDCT8_1D_FULL). All of m0-m7 are overwritten; the closing SWAP renames the
; registers so that the results read m0..m7 in natural order.
; The instruction order interleaves independent dependency chains and relies
; on x86inc register renaming (SWAP), so statements must not be reordered.
%macro IDCT8_1D 2
SWAP 0, 1
; odd part: combine inputs 1, 3, 5 and 7
psrad m4, m5, 1
psrad m1, m0, 1
paddd m4, m5
paddd m1, m0
paddd m4, m7
paddd m1, m5
psubd m4, m0
paddd m1, m3
psubd m0, m3
psubd m5, m3
paddd m0, m7
psubd m5, m7
psrad m3, 1
psrad m7, 1
psubd m0, m3
psubd m5, m7
SWAP 1, 7
psrad m1, m7, 2
psrad m3, m4, 2
paddd m3, m0
psrad m0, 2
paddd m1, m5
psrad m5, 2
psubd m0, m4
psubd m7, m5
SWAP 5, 6
; even part: inputs 2 and 6, then the two late-loaded operands
psrad m4, m2, 1
psrad m6, m5, 1
psubd m4, m5
paddd m6, m2
mova m2, %1
mova m5, %2
SUMSUB_BA d, 5, 2
SUMSUB_BA d, 6, 5
SUMSUB_BA d, 4, 2
; final butterflies combining the even and odd parts
SUMSUB_BA d, 7, 6
SUMSUB_BA d, 0, 4
SUMSUB_BA d, 3, 2
SUMSUB_BA d, 1, 5
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
; IDCT8_1D_FULL base
; Loads rows 1, 2, 3, 5, 6 and 7 from `base` (rows are 32 bytes apart) into
; the like-numbered registers and runs IDCT8_1D, which fetches rows 0 and 4
; itself. Each register covers only one register width of a row, so callers
; invoke this once per half of the block.
%macro IDCT8_1D_FULL 1
mova m7, [%1+112*2]
mova m6, [%1+ 96*2]
mova m5, [%1+ 80*2]
mova m3, [%1+ 48*2]
mova m2, [%1+ 32*2]
mova m1, [%1+ 16*2]
IDCT8_1D [%1], [%1+ 64*2]
%endmacro
; IDCT8_ADD_SSE_START block, dstblock
; First 1-D pass over one half of the 8x8 block, followed by two 4x4 dword
; transposes.
;   x86-64: only m0 and m4 are written to dstblock; the remaining transposed
;           rows stay in registers for the caller (m8 is the transpose
;           scratch register).
;   x86-32: no spare registers, so m7 is parked in the source block around
;           the first transpose and all eight rows are written to dstblock.
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
IDCT8_1D_FULL %1
%if ARCH_X86_64
TRANSPOSE4x4D 0,1,2,3,8
mova [%2 ], m0
TRANSPOSE4x4D 4,5,6,7,8
mova [%2+8*2], m4
%else
mova [%1], m7 ; spill m7 so it can serve as transpose scratch
TRANSPOSE4x4D 0,1,2,3,7
mova m7, [%1]
mova [%2 ], m0
mova [%2+16*2], m1
mova [%2+32*2], m2
mova [%2+48*2], m3
TRANSPOSE4x4D 4,5,6,7,3
mova [%2+ 8*2], m4
mova [%2+24*2], m5
mova [%2+40*2], m6
mova [%2+56*2], m7
%endif
%endmacro
; IDCT8_ADD_SSE_END dst, block, stride
; Second 1-D pass over one half of the intermediate block, then adds the
; eight resulting rows of four pixels to dst via STORE_DIFFx2. m6 and m7 are
; needed as scratch / zero by STORE_DIFFx2, so the rows they hold are spilled
; to the block first and reloaded into m0 / m1 for the last store.
; dst is advanced by six rows as a side effect; m7 is zero on exit.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
IDCT8_1D_FULL %2
mova [%2 ], m6 ; spill rows 6 and 7
mova [%2+16*2], m7
pxor m7, m7
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m6, m7, %1, %3
mova m0, [%2 ] ; reload rows 6 and 7
mova m1, [%2+16*2]
lea %1, [%1+%3*2]
STORE_DIFFx2 m4, m5, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro
; IDCT8_ADD: emits the public h264_idct8_add_10 and the internal body
; h264_idct8_add1_10<SUFFIX> (also called by h264_idct8_add4_10).
;   r0 = dst, r1 = block (32-bit coefficients), r2 = stride.
; When UNIX64 is 0 the public entry pads the stack, calls the internal body
; and returns through RET. Otherwise execution falls through into the body,
; which returns with a plain `ret`.
; The body reserves 256 bytes of stack (plus alignment padding) for the
; intermediate block, adds the rounding bias to the DC coefficient, runs both
; 1-D passes, clears the coefficient block and adds the result to dst.
%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
movsxdifnidn r2, r2d
%if UNIX64 == 0
%assign pad 16-gprsize-(stack_offset&15)
sub rsp, pad
call h264_idct8_add1_10 %+ SUFFIX
add rsp, pad
RET
%endif
ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
sub rsp, pad
add dword [r1], 32 ; rounding bias for the final >>6, applied to the DC term
%if ARCH_X86_64
; x86-64: 16 vector registers let both halves stay live between the passes.
IDCT8_ADD_SSE_START r1, rsp
; move the first half's transposed rows out of the way (m9-m11, m13-m15)
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_ADD_SSE_START r1+16, rsp+128
; rename registers so each second-pass invocation sees its inputs in m1-m7
PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
IDCT8_1D [rsp], [rsp+128]
; park the first set of results in m8-m15
SWAP 0, 8
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 4, 12
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_1D [rsp+16], [rsp+144]
; row 0 is stored by hand because m0 is about to be reused as the zero register
psrad m8, 6
psrad m0, 6
packssdw m8, m0
paddsw m8, [r0]
pxor m0, m0
; clear the 256-byte coefficient block
mova [r1+ 0], m0
mova [r1+ 16], m0
mova [r1+ 32], m0
mova [r1+ 48], m0
mova [r1+ 64], m0
mova [r1+ 80], m0
mova [r1+ 96], m0
mova [r1+112], m0
mova [r1+128], m0
mova [r1+144], m0
mova [r1+160], m0
mova [r1+176], m0
mova [r1+192], m0
mova [r1+208], m0
mova [r1+224], m0
mova [r1+240], m0
CLIPW m8, m0, [pw_pixel_max]
mova [r0], m8
mova m8, [pw_pixel_max] ; m8 now holds the upper clip bound for rows 1-7
STORE_DIFF16 m9, m1, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m10, m2, m0, m8, r0
STORE_DIFF16 m11, m3, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m12, m4, m0, m8, r0
STORE_DIFF16 m13, m5, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m14, m6, m0, m8, r0
STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
; x86-32: only eight vector registers, so the intermediate goes through the
; stack and each 4-pixel-wide half of dst is written separately.
IDCT8_ADD_SSE_START r1, rsp
IDCT8_ADD_SSE_START r1+16, rsp+128
lea r3, [r0+8] ; right half of dst (4 pixels * 2 bytes)
IDCT8_ADD_SSE_END r0, rsp, r2
IDCT8_ADD_SSE_END r3, rsp+16, r2
; m7 was zeroed by IDCT8_ADD_SSE_END; use it to clear the coefficient block
mova [r1+ 0], m7
mova [r1+ 16], m7
mova [r1+ 32], m7
mova [r1+ 48], m7
mova [r1+ 64], m7
mova [r1+ 80], m7
mova [r1+ 96], m7
mova [r1+112], m7
mova [r1+128], m7
mova [r1+144], m7
mova [r1+160], m7
mova [r1+176], m7
mova [r1+192], m7
mova [r1+208], m7
mova [r1+224], m7
mova [r1+240], m7
%endif ; ARCH_X86_64
add rsp, pad
ret
%endmacro
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; IDCT8_ADD4_OP index, nnzc_offset
; Handles one 8x8 block of h264_idct8_add4_10: if nnzc[nnzc_offset] (r4) is
; non-zero, sets r0 = dst base (r5) + block_offset[index] (r6) and calls the
; shared 8x8 body. The coefficient pointer r1 is advanced by 256 bytes after
; every block except the last one (index 12).
%macro IDCT8_ADD4_OP 2
cmp byte [r4+%2], 0
jz .skipblock%1
mov r0d, [r6+%1*4]
add r0, r5
call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
add r1, 256
%endif
%endmacro
; IDCT8_ADD4: emits h264_idct8_add4_10. cglobal loads no arguments
; automatically; they are moved by hand into the registers expected by
; h264_idct8_add1_10 and IDCT8_ADD4_OP:
;   r5 = dst base, r6 = block_offset, r1 = block, r2 = stride, r4 = nnzc.
; The order of the moves matters because source and destination registers
; overlap. The stack is padded before the calls into the shared body.
; NOTE(review): the stride is copied with a 32-bit move after r3 was
; sign-extended, which zero-extends r2 on x86-64 -- confirm that negative
; strides never reach this function.
%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
movsxdifnidn r3, r3d
%assign pad 16-gprsize-(stack_offset&15)
SUB rsp, pad
mov r5, r0mp ; dst
mov r6, r1mp ; block_offset
mov r1, r2mp ; block
mov r2d, r3m ; stride
movifnidn r4, r4mp ; nnzc
IDCT8_ADD4_OP 0, 4+1*8
IDCT8_ADD4_OP 4, 6+1*8
IDCT8_ADD4_OP 8, 4+3*8
IDCT8_ADD4_OP 12, 6+3*8
ADD rsp, pad
RET
%endmacro ; IDCT8_ADD4
; One instance per instruction set; the AVX build is optional.
INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,410 @@
/*
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
/* Declares the prototype of one external 4x4 intra predictor. The symbol is
 * named ff_pred4x4_<TYPE>_<DEPTH>_<OPT>; the definitions are not in this file
 * (presumably in the x86 assembly sources -- confirm). */
#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
const uint8_t *topright, \
ptrdiff_t stride);
/* 10-bit 4x4 predictors */
PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)
/* Declares the prototype of one external 8x8 intra predictor, named
 * ff_pred8x8_<TYPE>_<DEPTH>_<OPT>. */
#define PRED8x8(TYPE, DEPTH, OPT) \
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
/* 10-bit 8x8 predictors */
PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2)
PRED8x8(horizontal, 10, sse2)
/* Declares the prototype of one external 8x8l intra predictor, named
 * ff_pred8x8l_<TYPE>_<DEPTH>_<OPT>. These take two extra flags telling the
 * predictor whether the top-left and top-right neighbours are available. */
#define PRED8x8L(TYPE, DEPTH, OPT)\
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
int has_topleft, \
int has_topright, \
ptrdiff_t stride);
/* 10-bit 8x8l predictors */
PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)
/* Declares the prototype of one external 16x16 intra predictor, named
 * ff_pred16x16_<TYPE>_<DEPTH>_<OPT>. */
#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
/* 10-bit 16x16 predictors */
PRED16x16(dc, 10, mmxext)
PRED16x16(dc, 10, sse2)
PRED16x16(top_dc, 10, mmxext)
PRED16x16(top_dc, 10, sse2)
PRED16x16(128_dc, 10, mmxext)
PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, mmxext)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, mmxext)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, mmxext)
PRED16x16(horizontal, 10, sse2)
/* 8-bit versions */
/* 8-bit 16x16 predictors, including the codec-specific plane / tm variants */
PRED16x16(vertical, 8, mmx)
PRED16x16(vertical, 8, sse)
PRED16x16(horizontal, 8, mmx)
PRED16x16(horizontal, 8, mmxext)
PRED16x16(horizontal, 8, ssse3)
PRED16x16(dc, 8, mmxext)
PRED16x16(dc, 8, sse2)
PRED16x16(dc, 8, ssse3)
PRED16x16(plane_h264, 8, mmx)
PRED16x16(plane_h264, 8, mmxext)
PRED16x16(plane_h264, 8, sse2)
PRED16x16(plane_h264, 8, ssse3)
PRED16x16(plane_rv40, 8, mmx)
PRED16x16(plane_rv40, 8, mmxext)
PRED16x16(plane_rv40, 8, sse2)
PRED16x16(plane_rv40, 8, ssse3)
PRED16x16(plane_svq3, 8, mmx)
PRED16x16(plane_svq3, 8, mmxext)
PRED16x16(plane_svq3, 8, sse2)
PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
PRED16x16(tm_vp8, 8, avx2)
/* 8-bit 8x8 predictors */
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, mmx)
PRED8x8(horizontal, 8, mmx)
PRED8x8(horizontal, 8, mmxext)
PRED8x8(horizontal, 8, ssse3)
PRED8x8(plane, 8, mmx)
PRED8x8(plane, 8, mmxext)
PRED8x8(plane, 8, sse2)
PRED8x8(plane, 8, ssse3)
PRED8x8(tm_vp8, 8, mmx)
PRED8x8(tm_vp8, 8, mmxext)
PRED8x8(tm_vp8, 8, sse2)
PRED8x8(tm_vp8, 8, ssse3)
/* 8-bit 8x8l predictors */
PRED8x8L(top_dc, 8, mmxext)
PRED8x8L(top_dc, 8, ssse3)
PRED8x8L(dc, 8, mmxext)
PRED8x8L(dc, 8, ssse3)
PRED8x8L(horizontal, 8, mmxext)
PRED8x8L(horizontal, 8, ssse3)
PRED8x8L(vertical, 8, mmxext)
PRED8x8L(vertical, 8, ssse3)
PRED8x8L(down_left, 8, mmxext)
PRED8x8L(down_left, 8, sse2)
PRED8x8L(down_left, 8, ssse3)
PRED8x8L(down_right, 8, mmxext)
PRED8x8L(down_right, 8, sse2)
PRED8x8L(down_right, 8, ssse3)
PRED8x8L(vertical_right, 8, mmxext)
PRED8x8L(vertical_right, 8, sse2)
PRED8x8L(vertical_right, 8, ssse3)
PRED8x8L(vertical_left, 8, sse2)
PRED8x8L(vertical_left, 8, ssse3)
PRED8x8L(horizontal_up, 8, mmxext)
PRED8x8L(horizontal_up, 8, ssse3)
PRED8x8L(horizontal_down, 8, mmxext)
PRED8x8L(horizontal_down, 8, sse2)
PRED8x8L(horizontal_down, 8, ssse3)
/* 8-bit 4x4 predictors */
PRED4x4(dc, 8, mmxext)
PRED4x4(down_left, 8, mmxext)
PRED4x4(down_right, 8, mmxext)
PRED4x4(vertical_left, 8, mmxext)
PRED4x4(vertical_right, 8, mmxext)
PRED4x4(horizontal_up, 8, mmxext)
PRED4x4(horizontal_down, 8, mmxext)
PRED4x4(tm_vp8, 8, mmx)
PRED4x4(tm_vp8, 8, mmxext)
PRED4x4(tm_vp8, 8, ssse3)
PRED4x4(vertical_vp8, 8, mmxext)
/**
 * Install the x86-optimised intra prediction functions into @p h.
 *
 * Entries are overwritten only when the running CPU reports the matching
 * instruction set. The blocks are ordered from oldest to newest instruction
 * set, so a later block overrides the pointer set by an earlier one.
 *
 * @param h                 prediction context whose function tables are filled
 * @param codec_id          codec being decoded; selects codec-specific
 *                          variants (VP7/VP8, SVQ3, RV40, H.264)
 * @param bit_depth         sample bit depth; only 8 and 10 are handled here,
 *                          any other value leaves @p h untouched
 * @param chroma_format_idc chroma format; the 8x8 chroma predictors are only
 *                          installed when this is <= 1
 */
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
if (bit_depth == 8) {
/* ---- 8-bit: MMX baseline ---- */
if (EXTERNAL_MMX(cpu_flags)) {
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
if (chroma_format_idc <= 1) {
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
/* VP7/VP8 place their TM predictor in the plane slots */
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
if (codec_id == AV_CODEC_ID_SVQ3) {
/* the MMX SVQ3 plane predictor is only installed when CMOV is reported */
if (cpu_flags & AV_CPU_FLAG_CMOV)
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
}
}
}
/* ---- 8-bit: MMXEXT ---- */
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
if (chroma_format_idc <= 1)
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
/* the remaining 4x4 / chroma predictors are only installed for some codecs */
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 ||
codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
}
if (codec_id != AV_CODEC_ID_RV40) {
h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
if (chroma_format_idc <= 1) {
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
}
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
} else {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
}
}
}
/* ---- 8-bit: SSE ---- */
if (EXTERNAL_SSE(cpu_flags)) {
h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
}
/* ---- 8-bit: SSE2 ---- */
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
}
}
}
/* ---- 8-bit: SSSE3 ---- */
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
if (chroma_format_idc <= 1)
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
}
}
}
/* ---- 8-bit: AVX2 ---- */
if(EXTERNAL_AVX2(cpu_flags)){
/* NOTE(review): unlike the other TM assignments this one checks VP8 only,
 * not VP7 -- confirm that this is intentional. */
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
}
}
} else if (bit_depth == 10) {
/* ---- 10-bit: MMXEXT ---- */
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
if (chroma_format_idc <= 1)
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
}
/* ---- 10-bit: SSE2 ---- */
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
if (chroma_format_idc <= 1) {
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
}
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
}
/* ---- 10-bit: SSSE3 ---- */
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
}
/* ---- 10-bit: AVX ---- */
if (EXTERNAL_AVX(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
}
}
}

634
externals/ffmpeg/libavcodec/x86/h264_qpel.c vendored Executable file
View File

@@ -0,0 +1,634 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dec.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#if HAVE_X86ASM
/* Prototypes of the external two-source ("l2") put/avg helpers for 4-, 8- and
 * 16-pixel-wide blocks. src1 has its own stride parameter; src2 has none. */
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
/* Name aliases: the sse2 l2 names resolve to the mmxext implementations, and
 * the mmxext put_pixels names resolve to the mmx ones, so code that pastes an
 * instruction-set suffix onto these names still finds a symbol. */
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
#define ff_put_pixels4_mmxext ff_put_pixels4_mmx
/* Declares the prototypes of the external quarter-pel lowpass kernels for one
 * operation prefix (OPNAME is pasted into every symbol name): horizontal,
 * horizontal-with-second-source (l2), vertical, the two stages of the hv
 * filter (hv1 writes an int16_t temporary, hv2 reads it), and the
 * pixels*_l2_shift5 helpers that take an int16_t source and a uint8_t one. */
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);
/* Emit the prototype set for both operations. */
DEF_QPEL(avg)
DEF_QPEL(put)
/**
 * First stage of the 8x8 / 16x16 "hv" filter, MMXEXT flavour.
 *
 * Invokes the hv1 "op" primitive once per group of four columns, starting two
 * rows above and two pixels to the left of @p src, and stepping both the
 * source and the 16-bit scratch buffer by four elements per call.
 *
 * @param tmp       16-bit intermediate buffer written by the primitive
 * @param src       top-left pixel of the block being interpolated
 * @param tmpStride not used by this wrapper
 * @param srcStride stride of @p src
 * @param size      block size, forwarded to the primitive; (size + 8) / 4
 *                  column groups are processed
 */
static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_mmxext(int16_t *tmp, const uint8_t *src, int tmpStride, int srcStride, int size)
{
    int groups_left;

    src -= 2 * srcStride + 2;
    for (groups_left = (size + 8) >> 2; groups_left--; tmp += 4, src += 4)
        ff_put_h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);
}
/*
 * QPEL_H264(OPNAME, OP, MMX)
 *
 * Emits the static inline C glue that builds the larger 8-bit luma filters
 * out of the MMXEXT primitives declared by DEF_QPEL. OPNAME is "put_" or
 * "avg_" (with trailing underscore), MMX is the suffix pasted onto the
 * generated names; OP is accepted but not referenced in the body.
 *
 * Generated helpers (all prefixed ff_<OPNAME>):
 *   h264_qpel4_hv_lowpass_<MMX>      three 4-column v passes into tmp, then one h pass
 *   h264_qpel8or16_v_lowpass_<MMX>   two 4-column v passes (columns 0-3 and 4-7)
 *   h264_qpel8or16_hv2_lowpass_<MMX> (size >> 4) + 1 passes of 8 columns each
 *   h264_qpel8/16_v_lowpass_<MMX>    fixed-height wrappers; 16 = two 8-wide halves
 *   h264_qpel16_h_lowpass[_l2]_<MMX> four 8x8 quadrants
 *   h264_qpel8or16/8/16_hv_lowpass_<MMX>  hv1 stage followed by hv2 stage
 *   pixels16_l2_shift5_<MMX>         two 8-wide halves
 *
 * The last line of the definition deliberately carries no line continuation:
 * a trailing backslash there would splice whatever line follows the macro
 * (here a preprocessor conditional) into the macro body.
 */
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
        tmp += 4;\
        src += 4;\
    }\
    tmp -= 3*4; /* rewind to the start of the scratch buffer */\
    ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
    src += 4;\
    dst += 4;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        /* 4th argument is the primitive's "unused" parameter */\
        ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
        tmp += 8;\
        dst += 8;\
    }while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    /* NOTE(review): no separate source stride is passed, so src is advanced */\
    /* with dstStride; this assumes src and dst share one stride - confirm.  */\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    /* the first stage is always the "put" variant; only stage two applies OPNAME */\
    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}
/*
 * QPEL_H264_H16_XMM(OPNAME, OP, MMX): provides the 16-wide horizontal "l2"
 * filter for an XMM instruction set.
 *
 * x86-64: the macro expands to nothing. The 16-wide SSSE3 routines are only
 *         declared here (they are presumably implemented in assembly), so
 *         the prototypes sit at file scope, outside the macro.
 * x86-32: the macro emits a C wrapper that tiles the 8-wide primitive over
 *         the four quadrants of the 16x16 block.
 */
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    /* src is stepped with dstStride: no separate source stride is passed */\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
/*
 * QPEL_H264_H_XMM(OPNAME, OP, MMX): horizontal helpers for an XMM instruction
 * set. Expands QPEL_H264_H16_XMM (16-wide "l2" filter, see its definition)
 * and then emits the plain 16-wide horizontal filter as four calls to the
 * 8-wide primitive, one per quadrant. OP is not referenced.
 *
 * The final line has no line continuation so that the definition ends here
 * and does not absorb the line that follows it.
 */
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}
/*
 * QPEL_H264_V_XMM(OPNAME, OP, MMX): vertical helpers for an XMM instruction
 * set. The 8-wide filter is one call to the combined 8or16 routine with
 * h = 8; the 16-wide filter is two such calls (left and right 8 columns)
 * with h = 16. OP is not referenced.
 */
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
/**
 * First stage of the 8x8 / 16x16 "hv" filter, SSE2 flavour.
 *
 * Same scheme as the MMXEXT wrapper but with eight columns per call: the
 * SSE2 hv1 primitive is invoked (size + 8) / 8 times, starting two rows above
 * and two pixels to the left of @p src.
 *
 * @param tmp       16-bit intermediate buffer written by the primitive
 * @param src       top-left pixel of the block being interpolated
 * @param tmpStride not used by this wrapper
 * @param srcStride stride of @p src
 * @param size      block size, forwarded to the primitive
 */
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
                                                                 const uint8_t *src,
                                                                 int tmpStride,
                                                                 int srcStride,
                                                                 int size)
{
    int groups_left;

    src -= 2 * srcStride + 2;
    for (groups_left = (size + 8) >> 3; groups_left--; tmp += 8, src += 8)
        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
}
/*
 * QPEL_H264_HV_XMM(OPNAME, OP, MMX): two-stage "hv" helpers for an XMM
 * instruction set. Stage one is always put_h264_qpel8or16_hv1_lowpass_sse2
 * (independent of OPNAME and MMX); stage two is the <OPNAME> hv2 routine of
 * the requested instruction set. The 8 and 16 variants only fix the size
 * argument. OP is not referenced.
 *
 * The final line has no line continuation so that the definition ends here
 * and does not absorb the line that follows it.
 */
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}
#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
/*
 * H264_MC(OPNAME, SIZE, MMX, ALIGN): emit all sixteen mcXY entry points for
 * one operation / block size / instruction set by expanding the four group
 * macros: _C (mc00), _V (mc01..mc03), _H (mc10..mc30) and _HV (the rest).
 *
 * The final line has no line continuation; with one, the first line of the
 * function definition that follows would be absorbed into this macro.
 */
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)
/**
 * mc00 "put" for 16x16 blocks using SSE2: forwards to ff_put_pixels16_sse2
 * with a height of 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride stride passed through unchanged to the pixel routine
 */
static void put_h264_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    ff_put_pixels16_sse2(dst, src, stride, rows);
}
/**
 * mc00 "avg" for 16x16 blocks using SSE2: forwards to ff_avg_pixels16_sse2
 * with a height of 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride stride passed through unchanged to the pixel routine
 */
static void avg_h264_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    ff_avg_pixels16_sse2(dst, src, stride, rows);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
/*
 * H264_MC_C(OPNAME, SIZE, MMX, ALIGN): emit the mc00 entry point, which
 * forwards to ff_<OPNAME>pixels<SIZE>_<MMX> with height SIZE. ALIGN is not
 * referenced.
 *
 * The final line has no line continuation so that the definition ends here
 * and does not absorb the line that follows it.
 */
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}
/*
 * H264_MC_H(OPNAME, SIZE, MMX, ALIGN): emit the three horizontal-only entry
 * points.
 *   mc20  horizontal low-pass result on its own
 *   mc10  horizontal low-pass combined (via the _l2 routine) with src
 *   mc30  horizontal low-pass combined (via the _l2 routine) with src + 1
 * ALIGN is not referenced.
 *
 * The final line has no line continuation so that the definition ends here
 * and does not absorb the line that follows it.
 */
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}
/*
 * H264_MC_V(OPNAME, SIZE, MMX, ALIGN): emit the three vertical-only entry
 * points.
 *   mc02  vertical low-pass written straight to dst
 *   mc01  vertical low-pass into a SIZE x SIZE temporary (stride SIZE), then
 *         combined with src by the pixels _l2 routine
 *   mc03  as mc01, but combined with src + stride (the row below)
 * ALIGN is the alignment requested for the temporary.
 *
 * The final line has no line continuation so that the definition ends here
 * and does not absorb the line that follows it.
 */
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}
/*
 * H264_MC_HV(OPNAME, SIZE, MMX, ALIGN): emit the nine entry points that need
 * both a horizontal and a vertical component.
 *
 *   mc11/mc31/mc13/mc33  vertical low-pass of src (or src + 1) into a
 *                        SIZE x SIZE temporary, then the horizontal _l2
 *                        filter of src (or src + stride) combined with it
 *   mc22                 the two-stage hv filter written to dst; temp is the
 *                        16-bit scratch buffer
 *   mc21/mc23            hv filter into halfHV, then the horizontal _l2
 *                        filter of src (or src + stride) combined with it
 *   mc12/mc32            hv filter into halfHV, then the _l2_shift5 routine
 *                        combines the 16-bit intermediate (halfV + 2 or
 *                        halfV + 3) with halfHV
 *
 * For the last four, one allocation holds halfHV (SIZE*SIZE bytes) followed
 * by the 16-bit halfV scratch area. ALIGN is the alignment requested for the
 * temporaries.
 *
 * The alignment assertions convert the pointer through uintptr_t rather than
 * int, so the value is not truncated where pointers are wider than int.
 *
 * The final line has no line continuation so that the definition ends here
 * and does not absorb the line that follows it.
 */
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}
/*
 * H264_MC_4816(MMX): expand H264_MC (all sixteen mcXY entry points) for put
 * and avg at block sizes 4, 8 and 16, requesting 8-byte aligned temporaries.
 *
 * The final line has no line continuation so that the definition ends here
 * and does not absorb the line that follows it.
 */
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)
/*
 * H264_MC_816(QPEL, XMM): expand one entry-point group macro (QPEL is one of
 * H264_MC_C / _V / _H / _HV) for put and avg at block sizes 8 and 16,
 * requesting 16-byte aligned temporaries.
 *
 * The final line has no line continuation; with one, the first helper
 * instantiation that follows would become part of this macro instead of
 * being expanded at file scope.
 */
#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)
/* Instantiate the inline helpers. The second argument (PUT_OP /
 * AVG_MMXEXT_OP) is passed for the OP parameter, which none of the helper
 * macros visible in this file reference. */
QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
/* Instantiate the mcXY entry points: the full set (sizes 4/8/16) for mmxext,
 * and only the groups that have dedicated helpers for sse2 (V, HV) and
 * ssse3 (H, HV), sizes 8/16. */
H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
//10bit
/* LUMA_MC_OP declares the prototype of one high-bit-depth entry point:
 * ff_<OP>_h264_qpel<NUM>_<TYPE>_<DEPTH>_<OPT>(dst, src, stride).
 * Only the declaration is produced here. */
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* LUMA_MC_ALL: put and avg prototypes for block sizes 4, 8 and 16. */
#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
/* LUMA_MC_816: put and avg prototypes for block sizes 8 and 16 only. */
#define LUMA_MC_816(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
/* 10-bit mmxext prototypes: all sixteen positions, sizes 4/8/16. */
LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)
/* 10-bit sse2 prototypes, sizes 8/16. The horizontal-only positions
 * (mc10/mc20/mc30) additionally have sse2_cache64 and ssse3_cache64
 * variants. */
LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)
/*
 * QPEL16_OPMC(OP, MC, MMX): define the 16x16 10-bit entry point
 * ff_<OP>_h264_qpel16_<MC>_10_<MMX> in C by calling the matching 8x8 entry
 * point on each of the four quadrants. The right-hand quadrants are reached
 * with a byte offset of 16 on the uint8_t pointers (presumably 8 samples of
 * two bytes each); the lower quadrants with 8 * stride.
 */
#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride){\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
src += 8*stride;\
dst += 8*stride;\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}
/* QPEL16_OP: the put and avg 16x16 wrappers for one mcXY position. */
#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)
/* QPEL16: the put/avg 16x16 wrappers for all sixteen mcXY positions. */
#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)
/* The C-built mmxext 16x16 wrappers are only generated for 32-bit builds. */
#if ARCH_X86_32 // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif
#endif /* HAVE_X86ASM */
/*
 * SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX): fill all sixteen slots of
 * c->PFX_pixels_tab[IDX] with the functions PREFIX PFX SIZE _mcXY_ CPU.
 * Slot n holds mcXY with n = X + 4 * Y (slot 1 = mc10, slot 4 = mc01, ...).
 * PREFIX is either empty (static functions of this file) or ff_ (externally
 * visible functions). Expects a variable named c in scope.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
/*
 * H264_QPEL_FUNCS(x, y, CPU): install the 8-bit mc<x><y> functions of one
 * instruction set for the 16x16 (table 0) and 8x8 (table 1) sizes, put and
 * avg. Only slot x + 4 * y is written; the 4x4 table is not touched.
 */
#define H264_QPEL_FUNCS(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
} while (0)
/*
 * H264_QPEL_FUNCS_10(x, y, CPU): same as H264_QPEL_FUNCS, but for the
 * ff_-prefixed 10-bit functions (names carry a _10_ infix).
 */
#define H264_QPEL_FUNCS_10(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
} while (0)
/**
 * Install the x86-optimised H.264 quarter-pel functions into @p c.
 *
 * The CPU tiers are tested from oldest to newest and each tier overwrites
 * the table slots written by the previous ones, so the order of the blocks
 * below is significant. Only bit depths 8 (!high_bit_depth) and 10 are
 * handled; for any other depth the tables are left untouched. Without
 * HAVE_X86ASM the function does nothing.
 *
 * @param c         context whose put/avg_h264_qpel_pixels_tab are filled
 * @param bit_depth sample bit depth of the stream
 */
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_X86ASM
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
/* MMXEXT: complete tables for 8-bit; for 10-bit the 16x16 and 8x8 tables are
 * only filled on 32-bit builds, the 4x4 table always. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
#if ARCH_X86_32
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
}
}
/* SSE2, 8-bit: every position with a vertical component (y != 0).
 * SSE2, 10-bit: whole 16x16 and 8x8 tables, then the horizontal-only
 * positions are replaced by the sse2_cache64 variants. */
if (EXTERNAL_SSE2(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
H264_QPEL_FUNCS(1, 1, sse2);
H264_QPEL_FUNCS(1, 2, sse2);
H264_QPEL_FUNCS(1, 3, sse2);
H264_QPEL_FUNCS(2, 1, sse2);
H264_QPEL_FUNCS(2, 2, sse2);
H264_QPEL_FUNCS(2, 3, sse2);
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
if (bit_depth == 10) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
}
}
/* The 8-bit SSE2 mc00 copy is only installed when the EXTERNAL_SSE2_FAST
 * check passes. */
if (EXTERNAL_SSE2_FAST(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 0, sse2);
}
}
/* SSSE3, 8-bit: every position with a horizontal component (x != 0).
 * SSSE3, 10-bit: horizontal-only positions use the ssse3_cache64 variants. */
if (EXTERNAL_SSSE3(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(1, 2, ssse3);
H264_QPEL_FUNCS(1, 3, ssse3);
H264_QPEL_FUNCS(2, 0, ssse3);
H264_QPEL_FUNCS(2, 1, ssse3);
H264_QPEL_FUNCS(2, 2, ssse3);
H264_QPEL_FUNCS(2, 3, ssse3);
H264_QPEL_FUNCS(3, 0, ssse3);
H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
}
}
/* AVX-capable CPUs: switch the 10-bit horizontal-only positions back to the
 * plain sse2 variants (rationale in the comment below). */
if (EXTERNAL_AVX(cpu_flags)) {
/* AVX implies 64 byte cache lines without the need to avoid unaligned
* memory accesses that cross the boundary between two cache lines.
* TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
* having to treat SSE2 functions with such properties as AVX. */
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, sse2);
H264_QPEL_FUNCS_10(2, 0, sse2);
H264_QPEL_FUNCS_10(3, 0, sse2);
}
}
#endif
}

View File

@@ -0,0 +1,884 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; x86util.asm supplies the macro layer used throughout this file (cglobal,
; INIT_MMX/INIT_XMM, mova/movu, SWAP, PALIGNR, CLIPW, ...).
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
; Constants defined in another object and imported here.
cextern pd_65535
cextern pw_1023
; 1023 is the largest value a 10-bit sample can take; pw_pixel_max is the
; upper clipping bound used by the filters below.
%define pw_pixel_max pw_1023
cextern pw_16
cextern pw_1
cextern pb_0
; Local constants. None of these is referenced in the part of the file that
; precedes the hv filters; presumably they bias/unbias intermediates there
; (multiples of the 10-bit maximum 1023) - TODO confirm against their users.
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
depad: times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad: times 8 dw 16*1022/32 ; needs to be mod 16
; Word pairs (1,-5), (20,20), (-5,1): together the six taps 1,-5,20,20,-5,1.
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
SECTION .text
; AVG_MOV dst, src
;   "avg" store: src = pavgw(src, dst), then dst = src.
;   Used as OP_MOV for the avg_ functions (put_ functions use plain mova).
;   Note that the source register (%2) is overwritten with the average.
%macro AVG_MOV 2
pavgw %2, %1
mova %1, %2
%endmacro
; ADDW acc, mem, tmp
;   acc += mem (packed words).
;   With 8-byte (MMX) registers the memory operand is added directly; with
;   wider registers it is first loaded into tmp with an unaligned move
;   (presumably because the address need not be 16-byte aligned).
;   Clobbers tmp when mmsize != 8.
%macro ADDW 3
%if mmsize == 8
paddw %1, %2
%else
movu %3, %2
paddw %1, %3
%endif
%endmacro
; FILT_H a, b, c, bias
;   Six-tap filter core on already paired-up inputs: a = sum of the outer
;   taps, b = sum of the second taps, c = sum of the centre taps.
;   Leaves (a + bias - 5*b + 20*c)/16 in %1, evaluated with two arithmetic
;   right shifts by 2 so that intermediates stay within 16 bits.
;   Only %1 is modified; %2, %3 and %4 are read-only.
%macro FILT_H 4
paddw %1, %4
psubw %1, %2 ; a-b
psraw %1, 2 ; (a-b)/4
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2 ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
; PRELOAD_V
;   Primes the vertical filter. With r1 = src and r2 = stride on entry it
;   loads five consecutive rows into m0..m4:
;     m0 = src-2*stride, m1 = src-stride, m2 = src,
;     m3 = src+stride,   m4 = src+2*stride
;   On exit r1 = src + 3*stride (the next row to be read) and
;   r3 = 3*stride.
%macro PRELOAD_V 0
lea r3, [r2*3]
sub r1, r3
movu m0, [r1+r2]
movu m1, [r1+r2*2]
add r1, r3
movu m2, [r1]
movu m3, [r1+r2]
movu m4, [r1+r2*2]
add r1, r3
%endmacro
; FILT_V r0, r1, r2, r3, r4, r5, t0, t1
;   One output row of the vertical six-tap filter.
;   %1..%5 hold the five previous rows (oldest first); the sixth row is
;   loaded from [r1] into %6. The row pairs (%1+%6), (%2+%5), (%3+%4) are
;   fed to FILT_H with a bias of 16, the result is shifted right once more
;   and clipped with CLIPW against pb_0 / pw_pixel_max.
;   Result in %1; %6 keeps the new row; %7 and %8 are scratch.
%macro FILT_V 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H %1, %7, %8, [pw_16]
psraw %1, 1
CLIPW %1, [pb_0], [pw_pixel_max]
%endmacro
; MC name
;   Instantiates the function-generating macro %1 four times:
;     put, size 4 (MMX registers, mmxext)    put, size 8 (XMM registers, sse2)
;     avg, size 4 (MMX registers, mmxext)    avg, size 8 (XMM registers, sse2)
;   OP_MOV selects the store: plain mova for put, AVG_MOV for avg.
%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8
%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro
; MCAxA_OP op, mc, N, 2N, args, gprs, xmmregs
;   Emits the 2Nx2N function %1_h264_qpel%4_%2_10 by running the NxN stub
;   (stub_%1_h264_qpel%3_%2_10) on the four quadrants.
;   Quadrant offsets: %3*2 bytes to the right, r2*%3 (N rows) down.
;   x86-32: dst/src (r0, r1) are reloaded from their stack argument slots
;           (r0m, r1m) before each quadrant.
;   x86-64: dst/src are kept in two extra registers (r%6 and r(%6+1)), which
;           is why %6 + 2 GPRs are requested.
;   UNIX64: the last quadrant is not called; execution falls through into
;           the code that follows this macro's expansion (cglobal_mc places
;           the NxN function there).
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
add r0, %3*2
add r1, %3*2
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%3]
lea r1, [r1+r2*%3]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%3+%3*2]
lea r1, [r1+r2*%3+%3*2]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
mov r%6, r0
%assign p1 %6+1
mov r %+ p1, r1
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+%3*2]
lea r1, [r %+ p1+%3*2]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+r2*%3]
lea r1, [r %+ p1+r2*%3]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+r2*%3+%3*2]
lea r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%endif
%endif
%endmacro
; cglobal_mc op, mc, N, args, gprs, xmmregs
;   %1 = put/avg, %2 = mc position name, %3 = block size N,
;   %4..%6 = the argument / register counts handed on to cglobal.
;   Emits, in this order:
;     1. the 2Nx2N wrapper (MCAxA_OP) - on x86-32 always, on x86-64 only when
;        assembling for sse2 or later;
;     2. the public NxN function, which (except on UNIX64) merely calls the
;        stub and returns;
;     3. the label stub_<op>_h264_qpel<N>_<mc>_10<SUFFIX>.
;   The code written after the macro invocation is the body of that stub and
;   therefore has to end in a plain "ret".
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif
cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%endif
stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; COPY4
;   Transfers four rows of one register width each from r1 (src) to r0 (dst)
;   through OP_MOV (copy for put, average for avg).
;   Expects r2 = stride and r3 = 3*stride; clobbers m0.
%macro COPY4 0
movu m0, [r1 ]
OP_MOV [r0 ], m0
movu m0, [r1+r2 ]
OP_MOV [r0+r2 ], m0
movu m0, [r1+r2*2]
OP_MOV [r0+r2*2], m0
movu m0, [r1+r3 ]
OP_MOV [r0+r3 ], m0
%endmacro
; MC00 op
;   Full-pel position: plain copy (put) or average with dst (avg).
;   mmxext: the 4x4 function comes from cglobal_mc (which also produces the
;           8x8 wrapper where applicable); its body is a stub, hence the
;           plain "ret".
;   sse2:   8x8 is two COPY4 blocks of four rows; 16x16 is a loop of eight
;           iterations, each handling two rows of 32 bytes.
%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
lea r3, [r2*3]
COPY4
ret
INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
lea r3, [r2*3]
COPY4
lea r0, [r0+r2*4]
lea r1, [r1+r2*4]
COPY4
RET
cglobal %1_h264_qpel16_mc00_10, 3,4
; r3d counts pairs of rows
mov r3d, 8
.loop:
movu m0, [r1 ]
movu m1, [r1 +16]
OP_MOV [r0 ], m0
OP_MOV [r0 +16], m1
movu m0, [r1+r2 ]
movu m1, [r1+r2+16]
OP_MOV [r0+r2 ], m0
OP_MOV [r0+r2+16], m1
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
dec r3d
jg .loop
REP_RET
%endmacro
; Instantiate MC00 once per store operation.
%define OP_MOV mova
MC00 put
%define OP_MOV AVG_MOV
MC00 avg
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC_CACHE name
;   Like MC, but the size-8 function is generated three times per store
;   operation: sse2+cache64, ssse3+cache64 and plain sse2. The size-4
;   function is generated once (mmxext). Used for the horizontal filters.
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8
%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro
; MC20 op, N
;   Horizontal half-pel position: each of the N output rows is the six-tap
;   filter of the source row, rounded, shifted and clipped to
;   [0, pw_pixel_max], then stored through OP_MOV.
;   Registers: r0 = dst, r1 = src, r2 = stride, r3d = rows left,
;              m1 = pw_pixel_max, p16 = rounding constant 16 (kept in m8
;              when more than eight vector registers are available).
%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
mov r3d, %2
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
; %0 is the number of parameters this macro was invoked with. The macro is
; declared with exactly two, so the following branch is never assembled and
; the PALIGNR variant in the %else part is always the one used.
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
; Two unaligned loads per row; the shifted neighbours are built from them
; with PALIGNR. On exit m2/m3/m4 hold the outer/second/centre tap sums.
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
OP_MOV [r0], m2
add r0, r2
add r1, r2
dec r3d
jg .nextrow
rep ret
%endmacro
MC_CACHE MC20
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC30 op, N
;   Three-quarter horizontal position. Identical to mc10 except that the
;   pixel averaged with the filter result is taken one sample to the right:
;   r4 = src + 2 bytes, then control jumps into the shared mc10 body.
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
lea r4, [r1+2]
jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro
MC_CACHE MC30
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC10 op, N
;   Quarter horizontal position: the horizontal six-tap result (computed as
;   in mc20) is averaged with the source pixels at r4 before being stored
;   through OP_MOV.
;   Registers: r0 = dst, r1 = src, r2 = stride, r3d = rows left,
;              r4 = pixels to average with (src here; mc30 enters at .body
;              with r4 = src + 2), m1 = pw_pixel_max, p16 = constant 16.
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
mov r4, r1
; shared entry point: mc30 jumps here after setting up its own r4
.body:
mov r3d, %2
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
; %0 is the number of parameters this macro was invoked with. The macro is
; declared with exactly two, so the following branch is never assembled and
; the PALIGNR variant in the %else part is always the one used.
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
; average the filtered row with the pixels at r4
movu m3, [r4]
pavgw m2, m3
OP_MOV [r0], m2
add r0, r2
add r1, r2
add r4, r2
dec r3d
jg .nextrow
rep ret
%endmacro
MC_CACHE MC10
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; V_FILT <8 register names>, N, idx
;   Emits the shared subroutine v_filt<N>_<idx>_10: one row of the vertical
;   filter (FILT_V) followed by advancing r1 (src) and r0 (dst) by one
;   stride. Two entry points:
;     v_filt<N>_<idx>_10            also advances r4 by one stride first
;     v_filt<N>_<idx>_10.no_addr4   leaves r4 untouched
;   Only %9 and %10 are referenced; FILT_V is always invoked on m0..m7 under
;   the register permutation in effect where the macro is expanded.
%macro V_FILT 10
v_filt%9_%10_10:
add r4, r2
.no_addr4:
FILT_V m0, m1, m2, m3, m4, m5, m6, m7
add r1, r2
add r0, r2
ret
%endmacro
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
; MC02: emits the mc02 entry point (vertical filter only).
; PRELOAD_V is defined outside this excerpt; presumably it loads the initial
; source rows into m0..m4 - confirm against its definition.
; The row loop is fully unrolled (one iteration per output row). Each
; iteration calls the .no_addr4 entry of the v_filt helper whose index is
; the row number modulo 6, stores m0 through OP_MOV and rotates the register
; names with SWAP so they line up with the next helper.
%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
PRELOAD_V
; the helper adds the stride to r0 before returning, so start one row early
sub r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10.no_addr4
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC02
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC01: emits the mc01 entry point: vertical filter averaged with the source
; row addressed by r4 (r4 = src here; mc03 enters at .body with
; r4 = src + stride).
; Fully unrolled, one iteration per output row: call the v_filt helper
; (main entry, which advances r4, r1 and r0 by the stride), average its
; result in m0 with the row at r4, store through OP_MOV, rotate names.
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
mov r4, r1
.body:
PRELOAD_V
; the helper pre-increments r4 and post-increments r0; compensate once here
sub r4, r2
sub r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10
movu m7, [r4]
pavgw m0, m7
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC01
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC03: emits the mc03 entry point. Sets r4 = src + stride (the row below)
; and reuses the mc01 body, which averages the vertical filter output with
; the row addressed by r4.
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
lea r4, [r1+r2]
jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro
MC MC03
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; H_FILT_AVG: emits one callable helper h_filt<size>_<index>_10 that
; computes the horizontal 6-tap filter of the row at r4 with few registers
; and averages it into m0 (the FILT_V result left by the v_filt helper).
; Inputs:  r4 = row to filter, r1 and r5 = used to reload m5 at the end
;          (the visible caller sets r5 = -stride).
; Clobbers m5, m6, m7. Output: m0 = avg(m0, clipped horizontal result).
; NOTE(review): the macro takes 2 or 3 parameters, so the parameter-count
; test near the end is true for every instantiation below and the reload of
; m5 is always assembled; the optional third parameter is not referenced in
; the body. Confirm whether that is intended.
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
movu m5, [r4-4]
ADDW m5, [r4+6], m7
movu m6, [r4-2]
ADDW m6, [r4+4], m7
paddw m5, [pw_16]
psubw m5, m6 ; a-b
psraw m5, 2 ; (a-b)/4
psubw m5, m6 ; (a-b)/4-b
movu m6, [r4+0]
ADDW m6, [r4+2], m7
paddw m5, m6 ; (a-b)/4-b+c
psraw m5, 2 ; ((a-b)/4-b+c)/4
paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
psraw m5, 1
CLIPW m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
pavgw m0, m5
%if %0!=4
movu m5, [r1+r5]
%endif
ret
%endmacro
; MMX helpers: indices 0..2 inside the loop, index 3 emitted separately with
; a third argument
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0
; SSE2 helpers: indices 0..5; index 1 is the one emitted with a third argument
INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
; MC11: emits the mc11 entry point: average of the vertical filter (column
; addressed by r1) and the horizontal filter (row addressed by r4).
; mc31, mc13 and mc33 set r1/r4 differently and jump to .body.
; Register roles: r0 = dst, r1 = vertical source, r2 = stride,
; r4 = horizontal source row, r5 = -stride (used for the m5 reloads).
; Fully unrolled, one iteration per output row: v_filt helper, then h_filt
; helper (which averages into m0), then store and rotate register names.
%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
mov r4, r1
.body:
PRELOAD_V
; the v_filt helper advances r0 and r4 by one stride on every call
sub r0, r2
sub r4, r2
mov r5, r2
neg r5
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10
call h_filt%2_ %+ i %+ _10
; extra reload of m5 for the 8-wide version on helper index 1
%if %2==8 && i==1
movu m5, [r1+r5]
%endif
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC11
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC31: emits the mc31 entry point. Keeps the horizontal source row at the
; original src (r4 = src), moves the vertical source 2 bytes to the right
; (r1 = src + 2) and reuses the mc11 body.
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
mov r4, r1
add r1, 2
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
MC MC31
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC13: emits the mc13 entry point. Sets the horizontal source row to
; src + stride (r4) and reuses the mc11 body with r1 unchanged.
; NOTE(review): this entry declares 7 GPRs / 12 vector registers while mc11,
; mc31 and mc33 declare 6 / 8; confirm whether the larger counts are needed.
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
lea r4, [r1+r2]
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
MC MC13
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC33: emits the mc33 entry point. Horizontal source row = src + stride
; (r4), vertical source = src + 2 bytes (r1); then reuses the mc11 body.
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
lea r4, [r1+r2]
add r1, 2
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
MC MC33
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; FILT_H2 a, b, c: computes a - 5*b + 20*c in 16-bit lanes, with no rounding
; term and no down-shift. The result is left in the first operand; the second
; and third operands are overwritten with intermediate values.
%macro FILT_H2 3
psubw %1, %2 ; a-b
psubw %2, %3 ; b-c
psllw %2, 2
psubw %1, %2 ; a-5*b+4*c
psllw %3, 4
paddw %1, %3 ; a-5*b+20*c
%endmacro
; FILT_VNRD r0..r4, new, tmp1, tmp2: one vertical 6-tap step without rounding
; or down-shift. Loads the next source row from [r1] into the 6th operand,
; forms the three symmetric sums (1st+6th, 2nd+5th, 3rd+4th) and applies
; FILT_H2. The result replaces the 1st operand; the 7th and 8th operands are
; scratch. Operands 2..6 are left holding their rows for the next step.
%macro FILT_VNRD 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H2 %1, %7, %8
%endmacro
; HV: emits the callable helper put_hv<size>_10, which runs the unrounded
; vertical filter over COUNT column strips of mmsize bytes each and writes
; the 16-bit intermediates (minus pad20) to a buffer on the caller's stack.
; In:  r1 = src, r2 = stride, rsp = caller's aligned scratch buffer
; Out: buffer rows are mmsize*3 bytes apart; strips are mmsize bytes apart.
;      r2 is negated on entry and restored before returning.
; Clobbers r1, r3d, r4 and m0..m7.
%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
neg r2 ; This actually saves instructions
; with r2 negated this is src - 2*stride, shifted left by mmsize-PAD bytes
lea r1, [r1+r2*2-mmsize+PAD]
; gprsize presumably skips the return address pushed by the call - confirm
lea r4, [rsp+PAD+gprsize]
mov r3d, COUNT
.v_loop:
; prime the filter with five consecutive rows (sub of -stride moves down)
movu m0, [r1]
sub r1, r2
movu m1, [r1]
sub r1, r2
movu m2, [r1]
sub r1, r2
movu m3, [r1]
sub r1, r2
movu m4, [r1]
sub r1, r2
%assign i 0
%rep %1-1
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
sub r1, r2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
; last row of the strip: same as above but without advancing r1
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
; next strip: move right by mmsize bytes and rewind the rows walked above
; (8 rows for size 4, 8+4 rows for size 8)
add r4, mmsize
lea r1, [r1+r2*8+mmsize]
%if %1==8
lea r1, [r1+r2*4]
%endif
dec r3d
jg .v_loop
neg r2
ret
%endmacro
INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8
; H_LOOP: emits the callable helper h<size>_loop_op, which applies the
; horizontal pass to one row of the intermediate buffer written by put_hv.
; In:  r1 = current buffer row, m7 = upper clip bound (callers load
;      pw_pixel_max). With more than 8 vector registers the callers also
;      preload m0 = 0 and m8..m11 = tap1, tap2, tap3, depad; otherwise the
;      constants are read from memory and m0 is zeroed here before the clip.
; Out: m1 = clipped row of pixels; r1 advanced by one buffer row (mmsize*3).
; Clobbers m0 (small register file only) and m2..m6.
%macro H_LOOP 1
%if num_mmregs > 8
%define s1 m8
%define s2 m9
%define s3 m10
%define d1 m11
%else
%define s1 [tap1]
%define s2 [tap2]
%define s3 [tap3]
%define d1 [depad]
%endif
h%1_loop_op:
; six vectors at 2-byte steps around r1+mmsize; pairs feed the two dword
; accumulators m1 and m2
movu m1, [r1+mmsize-4]
movu m2, [r1+mmsize-2]
mova m3, [r1+mmsize+0]
movu m4, [r1+mmsize+2]
movu m5, [r1+mmsize+4]
movu m6, [r1+mmsize+6]
%if num_mmregs > 8
pmaddwd m1, s1
pmaddwd m2, s1
pmaddwd m3, s2
pmaddwd m4, s2
pmaddwd m5, s3
pmaddwd m6, s3
paddd m1, d1
paddd m2, d1
%else
mova m0, s1
pmaddwd m1, m0
pmaddwd m2, m0
mova m0, s2
pmaddwd m3, m0
pmaddwd m4, m0
mova m0, s3
pmaddwd m5, m0
pmaddwd m6, m0
mova m0, d1
paddd m1, m0
paddd m2, m0
%endif
paddd m3, m5
paddd m4, m6
paddd m1, m3
paddd m2, m4
psrad m1, 10
psrad m2, 10
; merge the two dword result vectors back into one vector of words:
; m1 supplies the low word of each dword, m2 the high word
pslld m2, 16
pand m1, [pd_65535]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0
%endif
CLIPW m1, m0, m7
add r1, mmsize*3
ret
%endmacro
INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8
; MC22: emits the mc22 entry point (vertical pass followed by horizontal
; pass). Reserves an aligned scratch buffer of PAD bytes on the stack, fills
; it with put_hv, then for each output row calls h<size>_loop_op and stores
; its result (m1) through OP_MOV.
; r6 keeps the original rsp; r3d counts rows; r1 walks the buffer.
%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%2_10
mov r3d, %2
mova m7, [pw_pixel_max]
; constants the horizontal helper expects in registers on large register files
%if num_mmregs > 8
pxor m0, m0
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%2_loop_op
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro
MC MC22
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC12: emits the mc12 entry point: the two-pass (hv) result averaged with a
; second intermediate row taken from the scratch buffer.
; r4 is the byte offset added when fetching that second row: 0 here; mc32
; enters at .body with r4 = 2 and mc21/mc23 with r4 = PAD-mmsize (pointing
; at the buffer written by put_h). Entries that jump to .body must already
; have set up rsp, r6 and the buffer contents.
; Per row: h<size>_loop_op yields m1 and advances r1; the second row is
; rescaled (add depad2, shift right 5, subtract unpad), clipped to
; [0, pixel_max], averaged into m1 and stored through OP_MOV.
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%2_10
xor r4d, r4d
.body:
mov r3d, %2
pxor m0, m0
mova m7, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%2_loop_op
; r1 was already advanced by the helper, hence the negative displacement
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
paddw m3, [depad2]
psrlw m3, 5
psubw m3, [unpad]
CLIPW m3, m0, m7
pavgw m1, m3
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro
MC MC12
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC32: emits the mc32 entry point. Same setup as mc12 (aligned scratch
; buffer filled by put_hv) but selects the intermediate column one sample to
; the right (r4 = 2 bytes) before reusing the mc12 body.
; NOTE(review): PAD is mmsize*8*3*2 here while mc22 and mc12 use
; mmsize*8*4*2, and the trailing comment on the define still says 4;
; confirm that the smaller buffer is sufficient for put_hv.
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%2_10
mov r4d, 2 ; sizeof(pixel)
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro
MC MC32
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; H_NRD: emits the callable helper put_h<size>_10, which writes the
; unrounded horizontal filter (a - 5*b + 20*c, minus pad20) of <size> rows to
; the caller's scratch buffer.
; In:  r5 = src row, r2 = stride, rsp = caller's buffer
; Out: rows stored mmsize*3 bytes apart starting at the caller's rsp.
; rsp is bumped by gprsize for the duration of the routine so that
; [rsp+r4] addresses the caller's buffer (presumably skipping the return
; address pushed by the call) and restored before the ret.
; Clobbers r3d, r4d, r5 and m2..m6.
%macro H_NRD 1
put_h%1_10:
add rsp, gprsize
mov r3d, %1
xor r4d, r4d
mova m6, [pad20]
.nextrow:
movu m2, [r5-4]
movu m3, [r5-2]
movu m4, [r5+0]
ADDW m2, [r5+6], m5
ADDW m3, [r5+4], m5
ADDW m4, [r5+2], m5
FILT_H2 m2, m3, m4
psubw m2, m6
mova [rsp+r4], m2
add r4d, mmsize*3
add r5, r2
dec r3d
jg .nextrow
sub rsp, gprsize
ret
%endmacro
INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8
; MC21: emits the mc21 entry point. r5 selects the row used for the
; horizontal-only intermediate (src here; mc23 enters at .body with
; r5 = src + stride).
; Two scratch buffers of PAD bytes each are carved from the aligned stack:
; the upper one is filled by put_h, the lower one by put_hv. r4d is then set
; to PAD-mmsize so that the mc12 body fetches its second operand from the
; put_h buffer, and control transfers to that body.
%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
mov r5, r1
.body:
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_h%2_10
sub rsp, PAD
call put_hv%2_10
mov r4d, PAD-mmsize ; H buffer
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro
MC MC21
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC23: emits the mc23 entry point. Uses the row below (r5 = src + stride)
; for the horizontal-only intermediate and reuses the mc21 body.
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
lea r5, [r1+r2]
jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro
MC MC23

View File

@@ -0,0 +1,862 @@
;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
cextern pw_16
cextern pw_5
cextern pb_0
SECTION .text
; op_avgh value, dst, tmp: half-register "avg" store. Loads the current
; destination contents into tmp with movh, byte-averages them into value
; (pavgb) and writes the result back with movh. Clobbers tmp and value.
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
; op_avg value, dst [, unused]: full-register "avg" store. Byte-averages the
; destination into value (pavgb with dst as source operand) and stores the
; result with mova. The optional third argument is accepted for call-site
; symmetry with op_avgh and ignored.
%macro op_avg 2-3
pavgb %1, %2
mova %2, %1
%endmacro
; op_puth value, dst [, unused]: half-register "put" store (movh). The
; optional third argument is ignored.
%macro op_puth 2-3
movh %2, %1
%endmacro
; op_put value, dst [, unused]: full-register "put" store (mova). The
; optional third argument is ignored.
%macro op_put 2-3
mova %2, %1
%endmacro
; QPEL4_H_LOWPASS_OP op: emits <op>_h264_qpel4_h_lowpass, the horizontal
; 6-tap lowpass for a block 4 pixels wide and 4 rows high.
; Args: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride (ints,
;       sign-extended on entry).
; Per pixel: (s[-2] + s[3] - 5*(s[-1] + s[2]) + 20*(s[0] + s[1]) + 16) >> 5,
; saturated to 8 bits by packuswb, then stored with the put or avg
; half-register op. m4 = pw_5, m5 = pw_16, m7 = 0, r4d = row counter.
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r4d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
; m1 = s[-1] + s[2], m2 = s[0] + s[1]
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
; m0 = s[-2] + s[3]
paddw m0, m3
; m2 = 5 * (4*m2 - m1) = 20*(s[0]+s[1]) - 5*(s[-1]+s[2])
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
psraw m0, 5
packuswb m0, m0
op_%1h m0, [r0], m6
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
; QPEL8_H_LOWPASS_OP op: emits the 8-byte-register version of
; <op>_h264_qpel8_h_lowpass: horizontal 6-tap lowpass, 8 pixels wide, 8 rows.
; Args: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride.
; Each row is processed as two groups of four 16-bit lanes (low half in
; m0/m2/m4, high half in m1/m3/m5). m6 = pw_5, m7 = 0, r4d = row counter.
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
; m0/m1 = 4 * (s[0] + s[1]) for the low/high pixel groups
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
; subtract s[-1] + s[2]; m3 and m4 are kept for the outer taps below
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
; outer taps s[-2] + s[3]: only the four missing bytes on each side are
; loaded (movd); the other operand is reused from m3 (bytes 3..6) and
; m4 (bytes 2..5)
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
packuswb m0, m1
op_%1 m0, [r0], m4
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
; QPEL8_H_LOWPASS_OP_XMM op: emits the 16-byte-register (SSSE3) version of
; <op>_h264_qpel8_h_lowpass: horizontal 6-tap lowpass, 8 pixels wide, 8 rows.
; Args: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride.
; One unaligned 16-byte load starting at src-2 supplies all taps; the
; shifted copies are produced with palignr on the zero-extended halves.
; m6 = pw_5, m7 = 0, r4d = row counter.
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
movu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
; word shifts of 1..5 samples relative to m0 (which starts at s[-2])
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
; m0 = s[-2]+s[3], m2 = s[0]+s[1], m1 = s[-1]+s[2]
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
op_%1h m2, [r0], m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
; QPEL4_H_LOWPASS_L2_OP op: emits <op>_h264_qpel4_h_lowpass_l2: the 4x4
; horizontal 6-tap lowpass whose result is byte-averaged with a second
; source (src2) before the put/avg store.
; Args: r0 = dst, r1 = src, r2 = src2, r3 = dstStride, r4 = second stride
;       (applied to src2).
; Note that both dst and src are advanced by r3 in the loop, so the routine
; relies on dst and src sharing the same stride.
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r5d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
; m1 = s[-1] + s[2], m2 = s[0] + s[1]
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
; m0 = s[-2] + s[3]
paddw m0, m3
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
; second source row, averaged in after the filter result is packed
movh m3, [r2]
psraw m0, 5
packuswb m0, m0
pavgb m0, m3
op_%1h m0, [r0], m6
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
; QPEL8_H_LOWPASS_L2_OP op: emits the 8-byte-register version of
; <op>_h264_qpel8_h_lowpass_l2: 8x8 horizontal 6-tap lowpass averaged with a
; second source (src2) before the put/avg store.
; Args: r0 = dst, r1 = src, r2 = src2, r3 = dstStride, r4 = second stride
;       (applied to src2). dst and src are both advanced by r3.
; The filter arithmetic matches the non-l2 8-wide MMX version: low pixel
; group in m0/m2/m4, high group in m1/m3/m5, m6 = pw_5, m7 = 0.
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
; m0/m1 = 4 * (s[0] + s[1])
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
; subtract s[-1] + s[2]
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
; outer taps s[-2] + s[3], reusing m3 and m4 from above
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
; average with the second source row
mova m4, [r2]
packuswb m0, m1
pavgb m0, m4
op_%1 m0, [r0], m4
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
; QPEL8_H_LOWPASS_L2_OP_XMM op: emits the SSSE3 version of
; <op>_h264_qpel8_h_lowpass_l2: 8x8 horizontal 6-tap lowpass averaged with a
; second source (src2) before the put/avg store.
; Args: r0 = dst, r1 = src, r2 = src2, r3 = dstStride, r4 = src2Stride.
; dst and src are both advanced by r3. m6 = pw_5, m7 = 0, r5d = row counter.
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
; 16 bytes starting at s[-2], zero-extended into m0 (low) and m1 (high)
lddqu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
; m0 = s[-2]+s[3], m2 = s[0]+s[1], m1 = s[-1]+s[2]
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
; second source row, averaged in after packing
movh m3, [r2]
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
pavgb m2, m3
op_%1h m2, [r0], m4
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
; FILT_V op: one output row of the vertical 6-tap lowpass.
; On entry m0..m4 hold five consecutive source rows as 16-bit lanes and
; m7 = 0. The sixth row is loaded from [r1] into m5 and zero-extended.
; Result: (m0 + m5 - 5*(m1 + m4) + 20*(m2 + m3) + 16) >> 5, packed with
; unsigned saturation and stored to [r0] with the put/avg half-register op.
; Side effects: r1 advances by r3 and r0 by r2; m0 and m6 are clobbered; the
; closing SWAP renames the registers so that m0..m4 again name the five most
; recent rows for the next invocation.
%macro FILT_V 1
mova m6, m2
movh m5, [r1]
paddw m6, m3
psllw m6, 2
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, [pw_16]
add r1, r3
paddw m0, m5
paddw m6, m0
psraw m6, 5
packuswb m6, m6
op_%1h m6, [r0], m0 ; 1
add r0, r2
SWAP 0, 1, 2, 3, 4, 5
%endmacro
; QPEL4_V_LOWPASS_OP op: emits <op>_h264_qpel4_v_lowpass, the vertical 6-tap
; lowpass for a 4x4 block.
; Args: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride.
; src is first moved up two rows, five rows are preloaded and zero-extended
; into m0..m4, and FILT_V is then invoked once per output row.
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
; src -= 2 * srcStride
sub r1, r3
sub r1, r3
pxor m7, m7
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
; r1 now points at the sixth row, which FILT_V loads itself
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
RET
%endmacro
INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
; QPEL8OR16_V_LOWPASS_OP op: emits the vertical 6-tap lowpass for a column
; strip that is 8 or 16 rows high (h in r4d selects which).
; Args: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4d = h.
; Two flavours depending on the active instruction set:
;   SSE2: symbol <op>_h264_qpel8or16_v_lowpass; moves src up two rows itself.
;   else: symbol <op>_h264_qpel8or16_v_lowpass_op; does not adjust src
;         (presumably the caller has already done so - confirm at call site).
; Eight FILT_V steps are always executed; eight more only when h == 16.
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
%endif
pxor m7, m7
; preload five rows into m0..m4 as 16-bit lanes
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
; any h other than 16 stops after the first eight rows
cmp r4d, 16
jne .end
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
.end:
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
; All functions that use this are required to have args:
; src, tmp, srcSize
; FILT_HV offset: one row of the vertical pass of the two-pass (hv) filter.
; Same arithmetic as FILT_V but the 16-bit sum
;   m0 + m5 - 5*(m1 + m4) + 20*(m2 + m3) + 16
; is stored unshifted and unpacked to [r1 + offset] (the tmp buffer).
; In: m0..m4 = five previous rows (16-bit lanes), m7 = 0, r0 = src row to
; load, r2 = srcStride. r0 advances by r2; m0 and m6 are clobbered; the SWAP
; renames the registers for the next row.
%macro FILT_HV 1 ; offset
mova m6, m2
movh m5, [r0]
paddw m6, m3
psllw m6, 2
paddw m0, [pw_16]
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, m5
add r0, r2
paddw m6, m0
mova [r1+%1], m6
SWAP 0, 1, 2, 3, 4, 5
%endmacro
; QPEL4_HV1_LOWPASS_OP op: emits the two halves of the 4x4 two-pass filter.
;
; <op>_h264_qpel4_hv_lowpass_v (r0 = src, r1 = tmp, r2 = srcStride):
;   vertical pass; writes four rows of 16-bit intermediates to tmp, 24 bytes
;   apart. The op argument only affects the symbol name here.
;
; <op>_h264_qpel4_hv_lowpass_h (r0 = tmp, r1 = dst, r2 = dstStride):
;   horizontal pass over tmp. With a = t[0]+t[5], b = t[1]+t[4],
;   c = t[2]+t[3] it evaluates ((((a-b)>>2) - b + c) >> 2) + c, shifts right
;   by 6, packs with unsigned saturation and stores with the put/avg op.
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
movsxdifnidn r2, r2d
pxor m7, m7
; preload five rows into m0..m4 as 16-bit lanes
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*24
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
RET
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
movsxdifnidn r2, r2d
mov r3d, 4
.loop:
; byte offsets 0/10, 2/8 and 4/6 are the symmetric tap pairs a, b and c
mova m0, [r0]
paddw m0, [r0+10]
mova m1, [r0+2]
paddw m1, [r0+8]
mova m2, [r0+4]
paddw m2, [r0+6]
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddsw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r1], m7
add r0, 24
add r1, r2
dec r3d
jnz .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
; QPEL8OR16_HV1_LOWPASS_OP op: emits <op>_h264_qpel8or16_hv1_lowpass_op, the
; vertical pass of the two-pass filter for strips 8 or 16 rows high.
; Args: r0 = src, r1 = tmp, r2 = srcStride, r3d = size.
; Writes one row of 16-bit intermediates per FILT_HV step, 48 bytes apart in
; tmp. Eight rows are always produced; eight more only when size == 16.
; The op argument only affects the symbol name; the body never stores to a
; destination picture. Only the put name is instantiated for SSE2 below.
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
movsxdifnidn r2, r2d
pxor m7, m7
; preload five rows into m0..m4 as 16-bit lanes
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*48
FILT_HV 1*48
FILT_HV 2*48
FILT_HV 3*48
FILT_HV 4*48
FILT_HV 5*48
FILT_HV 6*48
FILT_HV 7*48
cmp r3d, 16
jne .end
FILT_HV 8*48
FILT_HV 9*48
FILT_HV 10*48
FILT_HV 11*48
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
FILT_HV 15*48
.end:
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
; QPEL8OR16_HV2_LOWPASS_OP op: emits <op>_h264_qpel8or16_hv2_lowpass_op, the
; horizontal pass of the two-pass filter for 8-byte registers.
; Args: r0 = dst, r1 = tmp, r2 = dstStride, r3 = unused, r4d = h (rows).
; Each iteration turns one tmp row (48 bytes apart) into 8 output pixels,
; processed as two groups of four: m0/m1/m2 hold the tap-pair sums a/b/c for
; pixels 0..3 and m3/m4/m5 hold them for pixels 4..7. The value
; ((((a-b)>>2) - b + c) >> 2) + c is shifted right by 6, packed with
; unsigned saturation and stored with the put/avg op.
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
movsxdifnidn r2, r2d
.loop:
mova m0, [r1]
mova m3, [r1+8]
mova m1, [r1+2]
mova m4, [r1+10]
; the loads at +8 and +10 serve both pixel groups
paddw m0, m4
paddw m1, m3
paddw m3, [r1+18]
paddw m4, [r1+16]
mova m2, [r1+4]
mova m5, [r1+12]
paddw m2, [r1+6]
paddw m5, [r1+14]
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddsw m0, m2
paddsw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m0, m3
op_%1 m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
; QPEL8OR16_HV2_LOWPASS_OP_XMM op: emits the SSSE3
; <op>_h264_qpel8or16_hv2_lowpass, the horizontal pass of the two-pass filter.
; Args: r0 = dst, r1 = tmp, r2 = dstStride, r3 = tmpStride, r4d = size.
; r4d serves both as width selector (16 takes the .op16 path, anything else
; the 8-wide path) and as the row counter. r3 is sign-extended but not
; otherwise used: tmp rows are stepped by a fixed 48 bytes.
; Both paths evaluate ((((a-b)>>2) - b + c) >> 2) + c, then shift right by 6
; and pack with unsigned saturation, where a, b, c are the outer, middle and
; inner symmetric tap-pair sums built with palignr.
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
cmp r4d, 16
je .op16
; 8-wide path: one output vector per row, half-register store
.loop8:
mova m1, [r1+16]
mova m0, [r1]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m5, m0, 10
palignr m4, m0, 8
palignr m3, m0, 6
palignr m2, m0, 4
palignr m1, m0, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop8
jmp .done
; 16-wide path: pixels 8..15 accumulate in m0/m1/m2 and pixels 0..7 in
; m3/m4/m5; the final pack puts m3 in the low half
.op16:
mova m4, [r1+32]
mova m5, [r1+16]
mova m7, [r1]
mova m3, m4
mova m2, m4
mova m1, m4
mova m0, m4
palignr m0, m5, 10
palignr m1, m5, 8
palignr m2, m5, 6
palignr m3, m5, 4
palignr m4, m5, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
mova m6, m5
mova m4, m5
mova m3, m5
palignr m4, m7, 8
palignr m6, m7, 2
palignr m3, m7, 10
paddw m4, m6
mova m6, m5
palignr m5, m7, 6
palignr m6, m7, 4
paddw m3, m7
paddw m5, m6
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddw m0, m2
paddw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m3, m0
op_%1 m3, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .op16
.done:
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
; PIXELS4_L2_SHIFT5 op: emits <op>_pixels4_l2_shift5.
; Args: r0 = dst, r1 = src16 (16-bit rows, 24 bytes apart), r2 = src8,
;       r3 = dstStride, r4 = src8Stride, r5 = h.
; For each of four rows: shift the 16-bit row right by 5, pack to bytes with
; unsigned saturation, byte-average with the matching src8 row and store
; with the put/avg half-register op.
; The routine is straight-line code for exactly four rows; the h argument is
; loaded by the prologue but never read.
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
; rows 0 and 1
mova m0, [r1]
mova m1, [r1+24]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
lea r2, [r2+r4*2]
lea r0, [r0+r3*2]
; rows 2 and 3
mova m0, [r1+48]
mova m1, [r1+72]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
; PIXELS8_L2_SHIFT5 op: emits <op>_pixels8_l2_shift5.
; Args: r0 = dst, r1 = src16 (16-bit rows, 48 bytes apart), r2 = src8,
;       r3 = dstStride, r4 = src8Stride, r5d = h.
; Two rows per iteration: shift the 16-bit data right by 5, pack to 8 bytes
; with unsigned saturation, byte-average with src8 and store with the
; put/avg op. The loop ends when h reaches exactly zero after subtracting 2,
; so h must be a positive even number.
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
.loop:
mova m0, [r1]
mova m1, [r1+8]
mova m2, [r1+48]
mova m3, [r1+48+8]
psraw m0, 5
psraw m1, 5
psraw m2, 5
psraw m3, 5
packuswb m0, m1
packuswb m2, m3
pavgb m0, [r2]
pavgb m2, [r2+r4]
op_%1 m0, [r0], m4
op_%1 m2, [r0+r3], m5
lea r2, [r2+2*r4]
add r1, 48*2
lea r0, [r0+2*r3]
sub r5d, 2
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
; 16-wide horizontal lowpass with second-source averaging. Built only for
; x86-64 because it uses vector registers m8..m15.
%if ARCH_X86_64
; QPEL16_H_LOWPASS_L2_OP op: emits <op>_h264_qpel16_h_lowpass_l2 (16 pixels
; wide, 16 rows).
; Args: r0 = dst, r1 = src, r2 = src2, r3 = dstStride, r4 = src2Stride.
; dst and src are both advanced by r3.
; Constants: m13 = pw_16, m14 = pw_5, m15 = 0. r5d = row counter.
; The left eight pixels are computed in m6/m7/m0 and the right eight in
; m2/m11/m1, using the same formula as the 8-wide versions:
;   (s[-2] + s[3] - 5*(s[-1] + s[2]) + 20*(s[0] + s[1]) + 16) >> 5
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 16
pxor m15, m15
mova m14, [pw_5]
mova m13, [pw_16]
.loop:
; two overlapping 16-byte loads; after zero-extension m7 = s[-2..5],
; m0 = s[6..13], m1 = s[14..21]
lddqu m1, [r1+6]
lddqu m7, [r1-2]
mova m0, m1
punpckhbw m1, m15
punpcklbw m0, m15
punpcklbw m7, m15
mova m2, m1
mova m6, m0
mova m3, m1
mova m8, m0
mova m4, m1
mova m9, m0
mova m12, m0
mova m11, m1
palignr m11, m0, 10
palignr m12, m7, 10
palignr m4, m0, 2
palignr m9, m7, 2
palignr m3, m0, 4
palignr m8, m7, 4
palignr m2, m0, 6
palignr m6, m7, 6
; m0 must be consumed for the right half before it is overwritten below
paddw m11, m0
palignr m1, m0, 8
palignr m0, m7, 8
paddw m7, m12
paddw m2, m3
paddw m6, m8
paddw m1, m4
paddw m0, m9
psllw m2, 2
psllw m6, 2
psubw m2, m1
psubw m6, m0
paddw m11, m13
paddw m7, m13
pmullw m2, m14
pmullw m6, m14
; second source row
lddqu m3, [r2]
paddw m2, m11
paddw m6, m7
psraw m2, 5
psraw m6, 5
packuswb m6, m2
pavgb m6, m3
op_%1 m6, [r0], m11
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif

View File

@@ -0,0 +1,320 @@
;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; biweight pred:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
; int height, int log2_denom, int weightd,
; int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
; int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
; WEIGHT_SETUP: loads the constants used by the unidirectional weight
; functions. Expects r3d = log2_denom, r4d = weight, r5 = offset.
; Produces:
;   m3 = weight broadcast to every 16-bit lane
;   m5 = ((2*offset + 1) << log2_denom) >> 1, broadcast to every 16-bit lane
;        (offset scaled by the denominator plus the rounding term)
;   m6 = log2_denom (shift count)
;   m7 = 0 (for byte-to-word unpacking)
; Clobbers r5.
%macro WEIGHT_SETUP 0
add r5, r5
inc r5
movd m3, r4d
movd m5, r5d
movd m6, r3d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
pshuflw m3, m3, 0
pshuflw m5, m5, 0
punpcklqdq m3, m3
punpcklqdq m5, m5
%else
pshufw m3, m3, 0
pshufw m5, m5, 0
%endif
pxor m7, m7
%endmacro
; WEIGHT_OP off_a, off_b: weights two half-register groups of pixels read
; from [r0+off_a] and [r0+off_b].
; Each pixel becomes (pixel * m3 + m5) >> m6 using saturating adds, and the
; two groups are packed with unsigned saturation into m0 (off_a in the low
; half, off_b in the high half). Requires the WEIGHT_SETUP constants in
; m3, m5, m6, m7. Clobbers m0 and m1.
%macro WEIGHT_OP 2
movh m0, [r0+%1]
movh m1, [r0+%2]
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m3
pmullw m1, m3
paddsw m0, m5
paddsw m1, m5
psraw m0, m6
psraw m1, m6
packuswb m0, m1
%endmacro
; h264_weight_16 (mmxext): in-place weighted prediction for rows of 16
; pixels. r0 = dst, r1 = stride, r2d = height; log2_denom, weight and offset
; are consumed by WEIGHT_SETUP. Each row is handled as two 8-byte stores.
; A height of zero is not supported: the counter is decremented before it is
; tested.
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0, 4
mova [r0 ], m0
WEIGHT_OP 8, 12
mova [r0+8], m0
add r0, r1
dec r2d
jnz .nextrow
REP_RET
; WEIGHT_FUNC_MM width, xmm_regs: emits h264_weight_<width> for the case
; where one row exactly fills one vector register (8 pixels with 8-byte
; registers, 16 pixels with 16-byte registers).
; r0 = dst, r1 = stride, r2d = height. The second macro argument is the
; vector-register count passed to cglobal.
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0, mmsize/2
mova [r0], m0
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
; WEIGHT_FUNC_HALF_MM width, xmm_regs: emits h264_weight_<width> for rows
; that fill only half a vector register, so two rows are processed per
; iteration (row n in the low half of m0, row n+1 in the high half).
; r0 = dst, r1 = stride, r2d = height (halved to give the iteration count),
; r3 = 2 * stride. The height is expected to be even and non-zero.
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
sar r2d, 1
lea r3, [r1*2]
.nextrow:
WEIGHT_OP 0, r1
movh [r0], m0
; store the second row from the upper half of m0
%if mmsize == 16
movhps [r0+r1], m0
%else
psrlq m0, 32
movh [r0+r1], m0
%endif
add r0, r3
dec r2d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
; BIWEIGHT_SETUP: loads the constants used by the bidirectional weight
; functions. Expects r4d = log2_denom, r5d = weightd, r6d = weights and the
; offset as the eighth argument (read through r7m).
; Steps:
;   off   = (offset + 1) | 1
;   shift = log2_denom + 1
;   if either weight equals 128: both weights and off are halved
;   (arithmetic shift) and shift is reduced back by one - presumably to keep
;   the weights within the range of the signed multiplies; confirm.
; Produces:
;   m5 = (off << shift) >> 1 broadcast to 16-bit lanes, m6 = shift
;   SSSE3:     m4 = (weightd, weights) byte pairs for pmaddubsw; m0 scratch
;   otherwise: m3 = weightd and m4 = weights as 16-bit lanes, m7 = 0
; The offset is kept in r7d on x86-64 and in r3d on x86-32, so on x86-32 the
; height in r3 is overwritten and callers reload it afterwards.
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
mov off_regd, r7m
add off_regd, 1
or off_regd, 1
add r4d, 1
cmp r6d, 128
je .nonnormal
cmp r5d, 128
jne .normal
.nonnormal:
sar r5d, 1
sar r6d, 1
sar off_regd, 1
sub r4d, 1
.normal:
%if cpuflag(ssse3)
movd m4, r5d
movd m0, r6d
%else
movd m3, r5d
movd m4, r6d
%endif
movd m5, off_regd
movd m6, r4d
pslld m5, m6
psrld m5, 1
%if cpuflag(ssse3)
; interleave the low bytes of the two weights, then broadcast the pair
punpcklbw m4, m0
pshuflw m4, m4, 0
pshuflw m5, m5, 0
punpcklqdq m4, m4
punpcklqdq m5, m5
%else
%if mmsize == 16
pshuflw m3, m3, 0
pshuflw m4, m4, 0
pshuflw m5, m5, 0
punpcklqdq m3, m3
punpcklqdq m4, m4
punpcklqdq m5, m5
%else
pshufw m3, m3, 0
pshufw m4, m4, 0
pshufw m5, m5, 0
%endif
pxor m7, m7
%endif
%endmacro
; BIWEIGHT_STEPA a, b, off: loads half a register of pixels from [r0+off]
; (dst) and [r1+off] (src), widens them to 16 bits and leaves
; dst*m3 + src*m4 (saturating add) in vector register number a.
; Register number b is scratch. Requires m3, m4, m7 from BIWEIGHT_SETUP.
%macro BIWEIGHT_STEPA 3
movh m%1, [r0+%3]
movh m%2, [r1+%3]
punpcklbw m%1, m7
punpcklbw m%2, m7
pmullw m%1, m3
pmullw m%2, m4
paddsw m%1, m%2
%endmacro
; BIWEIGHT_STEPB: finishes two BIWEIGHT_STEPA results held in m0 and m1:
; adds the offset/rounding term m5 (saturating), shifts right by m6 and
; packs both with unsigned saturation into m0.
%macro BIWEIGHT_STEPB 0
paddsw m0, m5
paddsw m1, m5
psraw m0, m6
psraw m1, m6
packuswb m0, m1
%endmacro
; h264_biweight_16 (mmxext): bidirectional weighted prediction for rows of
; 16 pixels, written back to dst.
; r0 = dst, r1 = src, r2 = stride (shared by dst and src), r3d = height.
; The height is reloaded after BIWEIGHT_SETUP because that macro reuses r3d
; for the offset on x86-32. Each row is processed as two 8-byte halves.
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4
BIWEIGHT_STEPB
mova [r0], m0
BIWEIGHT_STEPA 0, 1, 8
BIWEIGHT_STEPA 1, 2, 12
BIWEIGHT_STEPB
mova [r0+8], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
; BIWEIGHT_FUNC_MM width, xmm_regs: emits h264_biweight_<width> for the case
; where one row exactly fills one vector register.
; r0 = dst, r1 = src, r2 = stride (shared by dst and src), r3d = height
; (reloaded after BIWEIGHT_SETUP, which may reuse r3d for the offset).
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2
BIWEIGHT_STEPB
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
; BIWEIGHT_FUNC_HALF_MM width, xmm_regs: emits h264_biweight_<width> for
; rows that fill only half a vector register, so two rows are processed per
; iteration (row n in the low half of m0, row n+1 in the high half).
; r0 = dst, r1 = src, r2 = stride (shared by dst and src),
; r3d = height (halved to give the iteration count), r4 = 2 * stride.
; The height is expected to be even and non-zero.
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
; height is a 32-bit int and the loop counts on r3d, so halve the 32-bit
; register: a 64-bit shift would pull bit 32 of r3 (not guaranteed to be
; zero for an int argument on x86-64) into the top bit of the counter.
; This also matches the sar on r2d in WEIGHT_FUNC_HALF_MM.
sar r3d, 1
lea r4, [r2*2]
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2
BIWEIGHT_STEPB
movh [r0], m0
; store the second row from the upper half of m0
%if mmsize == 16
movhps [r0+r2], m0
%else
psrlq m0, 32
movh [r0+r2], m0
%endif
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
; BIWEIGHT_SSSE3_OP: SSSE3 core of the bidirectional weighting.
; In: m0 and m2 hold interleaved (dst, src) byte pairs; m4 holds the matching
; (weightd, weights) byte pairs; m5 = offset/rounding term; m6 = shift.
; pmaddubsw forms dst*weightd + src*weights per pixel as a 16-bit value; the
; offset is added with saturation, the sum is shifted right and both
; vectors are packed with unsigned saturation into m0.
%macro BIWEIGHT_SSSE3_OP 0
pmaddubsw m0, m4
pmaddubsw m2, m4
paddsw m0, m5
paddsw m2, m5
psraw m0, m6
psraw m2, m6
packuswb m0, m2
%endmacro
; h264_biweight_16 (ssse3): bidirectional weighted prediction for rows of 16
; pixels, written back to dst.
; r0 = dst, r1 = src, r2 = stride (shared by dst and src), r3d = height
; (reloaded after BIWEIGHT_SETUP). Each row is split into two 8-pixel halves
; whose dst and src bytes are interleaved for pmaddubsw.
; The punpcklbw with a memory operand reads 16 bytes at [r1] and, as a
; legacy SSE memory operand, requires that address to be 16-byte aligned.
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
movh m0, [r0]
movh m2, [r0+8]
movh m3, [r1+8]
punpcklbw m0, [r1]
punpcklbw m2, m3
BIWEIGHT_SSSE3_OP
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
; h264_biweight_8 (ssse3): bidirectional weighted prediction for rows of 8
; pixels, written back to dst. Two rows are processed per iteration.
; r0 = dst, r1 = src, r2 = stride (shared by dst and src),
; r3d = height (halved to give the iteration count), r4 = 2 * stride.
; The height is expected to be even and non-zero.
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
BIWEIGHT_SETUP
movifnidn r3d, r3m
; height is a 32-bit int and the loop counts on r3d, so halve the 32-bit
; register: a 64-bit shift would pull bit 32 of r3 (not guaranteed to be
; zero for an int argument on x86-64) into the top bit of the counter.
sar r3d, 1
lea r4, [r2*2]
.nextrow:
; interleave dst and src bytes of two consecutive rows for pmaddubsw
movh m0, [r0]
movh m1, [r1]
movh m2, [r0+r2]
movh m3, [r1+r2]
punpcklbw m0, m1
punpcklbw m2, m3
BIWEIGHT_SSSE3_OP
; low half of m0 is row n, high half is row n+1
movh [r0], m0
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET

View File

@@ -0,0 +1,284 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
; 128-bit constant whose low quadword is 1 and whose high quadword is 0; it is
; added to the shift-count register to turn log2_denom into log2_denom+1.
sq_1: dq 1
dq 0
; constants defined in another object file
cextern pw_1
cextern pw_1023
; upper clip bound used by the weight functions in this file
%define pw_pixel_max pw_1023
SECTION .text
;-----------------------------------------------------------------------------
; void ff_h264_weight_16_10(uint8_t *dst, ptrdiff_t stride, int height,
;                           int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
; WEIGHT_PROLOGUE: shared entry sequence of the 10-bit weight functions.
; PROLOGUE is invoked with 0 as its first operand and the arguments are then
; fetched by hand: r0 = dst, r1d = stride, r2d = height, r4d = weight,
; r5d = offset.  Argument 3 (log2_denom) is not loaded into a general-purpose
; register here; WEIGHT_SETUP reads it straight from r3m.
; NOTE(review): PROLOGUE operand meaning (args, gprs, vector regs) is defined
; by x86inc -- confirm against x86inc.asm.
%macro WEIGHT_PROLOGUE 0
.prologue:
PROLOGUE 0,6,8
movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r2d, r2m
movifnidn r4d, r4m
movifnidn r5d, r5m
%endmacro
; WEIGHT_SETUP: build the constants consumed by WEIGHT_OP.
;   m0 = 1<<log2_denom in every word
;   m2 = log2_denom+1 (shift count)
;   m3 = every dword holds (weight<<1) in its low word and 1+(offset<<3) in
;        its high word
;   m4 = pw_pixel_max (upper clip bound)
;   m7 = 0 (lower clip bound; only needed on the non-SSE4 path)
; WEIGHT_OP interleaves each pixel with m0 and multiplies the pair by m3 with
; pmaddwd, giving  pixel*weight*2 + (1<<log2_denom)*(1 + offset*8),  which is
; then shifted right by log2_denom+1.
; Clobbers r5.
%macro WEIGHT_SETUP 0
mova m0, [pw_1]
movd m2, r3m
pslld m0, m2 ; 1<<log2_denom
SPLATW m0, m0
shl r5, 19 ; *8, move to upper half of dword
lea r5, [r5+r4*2+0x10000]
movd m3, r5d ; weight<<1 | 1+(offset<<(3))
pshufd m3, m3, 0
mova m4, [pw_pixel_max]
paddw m2, [sq_1] ; log2_denom+1
%if notcpuflag(sse4)
pxor m7, m7
%endif
%endmacro
; WEIGHT_OP off            - weight the 8 words at [r0+off]
; WEIGHT_OP off0, off1     - weight 4 words at [r0+off0] and 4 at [r0+off1]
; Each pixel word is interleaved with the matching word of m0, multiplied by
; m3 with pmaddwd, arithmetic-shifted right by m2 and packed back to words.
; The result is left in m5, clipped to [0, m4].
; Expects the registers prepared by WEIGHT_SETUP; clobbers m5 and m6.
%macro WEIGHT_OP 1-2
%if %0==1
mova m5, [r0+%1]
punpckhwd m6, m5, m0
punpcklwd m5, m0
%else
movq m5, [r0+%1]
movq m6, [r0+%2]
punpcklwd m5, m0
punpcklwd m6, m0
%endif
pmaddwd m5, m3
pmaddwd m6, m3
psrad m5, m2
psrad m6, m2
%if cpuflag(sse4)
; packusdw saturates to unsigned words, so only the upper bound is applied
packusdw m5, m6
pminsw m5, m4
%else
; packssdw saturates to signed words, so clip against both m7 and m4
packssdw m5, m6
CLIPW m5, m7, m4
%endif
%endmacro
; WEIGHT_FUNC_DBL: emits h264_weight_16_10.
; Each loop iteration weights one row in place as two 16-byte halves
; ([r0] and [r0+16]), then advances r0 by the stride in r1; r2d counts rows.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
WEIGHT_PROLOGUE
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0
mova [r0 ], m5
WEIGHT_OP 16
mova [r0+16], m5
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
; instantiate once per supported instruction set
INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL
; WEIGHT_FUNC_MM: emits h264_weight_8_10.
; Each loop iteration weights one 16-byte row at [r0] in place, then advances
; r0 by the stride in r1; r2d counts rows.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
WEIGHT_PROLOGUE
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0
mova [r0], m5
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
; instantiate once per supported instruction set
INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM
; WEIGHT_FUNC_HALF_MM: emits h264_weight_4_10.
; Rows are 8 bytes wide, so two rows ([r0] and [r0+r1]) are weighted per loop
; iteration: the row count in r2d is halved up front and r3 holds 2*stride.
; r3 is free for this because log2_denom is read from r3m by WEIGHT_SETUP.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
WEIGHT_PROLOGUE
sar r2d, 1
WEIGHT_SETUP
lea r3, [r1*2]
.nextrow:
WEIGHT_OP 0, r1
; low half of m5 is the first row, high half the second row
movh [r0], m5
movhps [r0+r1], m5
add r0, r3
dec r2d
jnz .nextrow
REP_RET
%endmacro
; instantiate once per supported instruction set
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM
;-----------------------------------------------------------------------------
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
;                             int height, int log2_denom, int weightd,
;                             int weights, int offset);
;-----------------------------------------------------------------------------
; Temporary register t0 holds the offset argument: it is mapped to r3 on
; x86-32 and to r7 on x86-64.
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif
; BIWEIGHT_PROLOGUE: shared entry sequence of the 10-bit biweight functions.
; Loads r0 = dst, r1 = src, r2d = stride, r5d = weightd, r6d = weights and
; t0d = offset.  Height (r3) and log2_denom (r4) are not loaded here:
; BIWEIGHT_SETUP reads log2_denom from r4m and loads r3d from r3m only after
; it has consumed t0, which matters on x86-32 where t0 is r3.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
PROLOGUE 0,8,8
movifnidn r0, r0mp
movifnidn r1, r1mp
movifnidn r2d, r2m
movifnidn r5d, r5m
movifnidn r6d, r6m
movifnidn t0d, r7m
%endmacro
; BIWEIGHT_SETUP: build the constants consumed by BIWEIGHT.
;   m4 = every dword holds weightd in its low word and weights in its high word
;   m5 = ((offset<<2)+1)<<log2_denom in every dword (rounding + offset term)
;   m6 = log2_denom+1 (shift count)
;   m3 = pw_pixel_max (upper clip bound)
;   m7 = 0 (lower clip bound; only needed on the non-SSE4 path)
;   r3d = height, loaded last because t0 may alias r3
; Clobbers t0, r5 and r6.
; NOTE(review): r5 is ORed with weights<<16 without first being masked to
; 16 bits, so a negative weightd would leave its sign-extension bits in the
; weights half of the dword -- confirm whether callers can pass a negative
; weightd.
%macro BIWEIGHT_SETUP 0
lea t0, [t0*4+1] ; (offset<<2)+1
or t0, 1
shl r6, 16
or r5, r6
movd m4, r5d ; weightd | weights
movd m5, t0d ; (offset+1)|1
movd m6, r4m ; log2_denom
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
paddd m6, [sq_1]
pshufd m4, m4, 0
pshufd m5, m5, 0
mova m3, [pw_pixel_max]
movifnidn r3d, r3m
%if notcpuflag(sse4)
pxor m7, m7
%endif
%endmacro
; BIWEIGHT off           - bi-weight the 8 words at [r0+off] and [r1+off]
; BIWEIGHT off0, off1    - bi-weight 4 words at off0 and 4 words at off1
; The words read through r0 and r1 are interleaved pairwise, multiplied by m4
; with pmaddwd, m5 is added, the sums are arithmetic-shifted right by m6 and
; packed back to words.  The result is left in m0, clipped to [0, m3].
; Expects the registers prepared by BIWEIGHT_SETUP; clobbers m0, m1 and m2.
%macro BIWEIGHT 1-2
%if %0==1
mova m0, [r0+%1]
mova m1, [r1+%1]
punpckhwd m2, m0, m1
punpcklwd m0, m1
%else
movq m0, [r0+%1]
movq m1, [r1+%1]
punpcklwd m0, m1
movq m2, [r0+%2]
movq m1, [r1+%2]
punpcklwd m2, m1
%endif
pmaddwd m0, m4
pmaddwd m2, m4
paddd m0, m5
paddd m2, m5
psrad m0, m6
psrad m2, m6
%if cpuflag(sse4)
; packusdw saturates to unsigned words, so only the upper bound is applied
packusdw m0, m2
pminsw m0, m3
%else
; packssdw saturates to signed words, so clip against both m7 and m3
packssdw m0, m2
CLIPW m0, m7, m3
%endif
%endmacro
; BIWEIGHT_FUNC_DBL: emits h264_biweight_16_10.
; Each loop iteration bi-weights one row as two 16-byte halves and stores it
; through r0, then advances r0 and r1 by the stride in r2; r3d counts rows.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
.nextrow:
BIWEIGHT 0
mova [r0 ], m0
BIWEIGHT 16
mova [r0+16], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
; instantiate once per supported instruction set
INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL
; BIWEIGHT_FUNC: emits h264_biweight_8_10.
; Each loop iteration bi-weights one 16-byte row and stores it through r0,
; then advances r0 and r1 by the stride in r2; r3d counts rows.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
.nextrow:
BIWEIGHT 0
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
; instantiate once per supported instruction set
INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC
; BIWEIGHT_FUNC_HALF: emits h264_biweight_4_10.
; Rows are 8 bytes wide, so two rows (offsets 0 and r2) are bi-weighted per
; loop iteration: the row count in r3d is halved and r4 holds 2*stride.
; r4 is free for this because BIWEIGHT_SETUP has already read log2_denom
; from r4m.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
sar r3d, 1
lea r4, [r2*2]
.nextrow:
BIWEIGHT 0, r2
; low half of m0 is the first row, high half the second row
movh [r0 ], m0
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET
%endmacro
; instantiate once per supported instruction set
INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF

View File

@@ -0,0 +1,117 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264chroma.h"
/*
 * Prototypes of the 8-bit chroma motion-compensation routines that
 * ff_h264chroma_init_x86() installs. They are only declared here; the
 * definitions live outside this file. All of them share one signature:
 * (dst, src, stride, h, x, y).
 */
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
/*
 * CHROMA_MC(OP, NUM, DEPTH, OPT) declares
 *     void ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>(dst, src, stride, h, x, y);
 * It is used below for the 10-bit variants installed by
 * ff_h264chroma_init_x86().
 */
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride, int h, int x, int y);
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
/**
 * Install the x86 chroma motion-compensation routines into @p c.
 *
 * Table index 0 receives the mc8 routines, index 1 the mc4 routines and
 * index 2 the mc2 routines. Within one bit-depth group the instruction sets
 * are tested from oldest to newest, so a later match overwrites the entries
 * written by an earlier one. Entries for which no routine matches are left
 * untouched.
 *
 * @param c         context whose put/avg tables are filled in
 * @param bit_depth sample bit depth; 8 or less selects the 8-bit routines,
 *                  9 and 10 select the 10-bit routines, anything above 10
 *                  installs nothing
 */
av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
{
    int cpu_flags          = av_get_cpu_flags();
    const int depth_is_8   = !(bit_depth > 8);
    const int depth_is_10  = bit_depth > 8 && bit_depth <= 10;

    if (depth_is_8) {
        if (EXTERNAL_MMX(cpu_flags)) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
        }

        if (EXTERNAL_AMD3DNOW(cpu_flags)) {
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
            c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
        }

        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
            c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
            c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
            c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
        }

        if (EXTERNAL_SSSE3(cpu_flags)) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
            c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
        }
    }

    if (depth_is_10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
            c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
            c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
        }

        if (EXTERNAL_SSE2(cpu_flags)) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }

        if (EXTERNAL_AVX(cpu_flags)) {
            // AVX implies !cache64.
            // TODO: Port cache(32|64) detection from x264.
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
}

448
externals/ffmpeg/libavcodec/x86/h264dsp_init.c vendored Executable file
View File

@@ -0,0 +1,448 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"
/***********************************/
/* IDCT */
/*
 * IDCT_ADD_FUNC(NUM, DEPTH, OPT) declares
 *     void ff_h264_idct<NUM>_add_<DEPTH>_<OPT>(uint8_t *dst, int16_t *block,
 *                                              int stride);
 * NUM may be empty, "_dc", "8" or "8_dc".
 */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int16_t *block, \
int stride);
IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
/*
 * IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) declares
 *     void ff_h264_idct<NUM>_add<REP>_<DEPTH>_<OPT>(uint8_t *dst,
 *             const int *block_offset, int16_t *block, int stride,
 *             const uint8_t nnzc[6 * 8]);
 * i.e. the variants that take a single destination pointer.
 */
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
/*
 * IDCT_ADD_REP_FUNC2 is identical to IDCT_ADD_REP_FUNC except that the first
 * parameter is "uint8_t **dst" (an array of destination pointers) instead of
 * "uint8_t *dst". It is used for the add8 / add8_422 variants.
 */
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t **dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
/* Luma DC dequantisation + IDCT; declared here, defined outside this file. */
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
/***********************************/
/* deblocking */
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40],
int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv0, int mask_mv1, int field);
/*
 * LF_FUNC declares a deblocking filter that takes a tc0 table:
 *     void ff_deblock_<DIR>_<TYPE>_<DEPTH>_<OPT>(uint8_t *pix, ptrdiff_t stride,
 *                                                int alpha, int beta,
 *                                                int8_t *tc0);
 */
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
ptrdiff_t stride, \
int alpha, \
int beta, \
int8_t *tc0);
/*
 * LF_IFUNC declares the same filter name pattern without the tc0 parameter;
 * it is used for the "_intra" filter types.
 */
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
ptrdiff_t stride, \
int alpha, \
int beta);
/*
 * LF_FUNCS(type, depth) declares the whole mmxext / sse2 / avx filter family
 * for one bit depth. The "type" argument is not referenced by the expansion;
 * every prototype uses uint8_t *pix.
 */
#define LF_FUNCS(type, depth) \
LF_FUNC(h, chroma, depth, mmxext) \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(h, chroma422, depth, mmxext) \
LF_IFUNC(h, chroma422_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext) \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext) \
LF_IFUNC(h, luma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
LF_IFUNC(v, luma_intra, depth, sse2) \
LF_FUNC(h, chroma, depth, sse2) \
LF_IFUNC(h, chroma_intra, depth, sse2) \
LF_FUNC(h, chroma422, depth, sse2) \
LF_IFUNC(h, chroma422_intra, depth, sse2) \
LF_FUNC(v, chroma, depth, sse2) \
LF_IFUNC(v, chroma_intra, depth, sse2) \
LF_FUNC(h, luma, depth, avx) \
LF_IFUNC(h, luma_intra, depth, avx) \
LF_FUNC(v, luma, depth, avx) \
LF_IFUNC(v, luma_intra, depth, avx) \
LF_FUNC(h, chroma, depth, avx) \
LF_IFUNC(h, chroma_intra, depth, avx) \
LF_FUNC(h, chroma422, depth, avx) \
LF_IFUNC(h, chroma422_intra, depth, avx) \
LF_FUNC(v, chroma, depth, avx) \
LF_IFUNC(v, chroma_intra, depth, avx)
/* MBAFF horizontal luma filter: declared for 8-bit only. */
LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)
/* Declare the full filter family for 8-bit and 10-bit. */
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
/**
 * 16-pixel-wide vertical luma deblocking built from two calls to the
 * 8-pixel-wide ff_deblock_v8_luma_8_mmxext().
 *
 * The stride parameter is ptrdiff_t so that this wrapper has the same
 * prototype as the ff_deblock_*_luma_* routines that are assigned to the
 * same function pointer.
 *
 * @param pix    first pixel of the edge; the second half starts at pix + 8
 * @param stride distance in bytes between rows
 * @param alpha  filter threshold, passed through unchanged
 * @param beta   filter threshold, passed through unchanged
 * @param tc0    four per-segment values; tc0[0..1] belong to the first
 *               8 pixels and tc0[2..3] to the second 8 pixels
 */
static void deblock_v_luma_8_mmxext(uint8_t *pix, ptrdiff_t stride, int alpha,
                                    int beta, int8_t *tc0)
{
    /* (a & b) is negative only when both values are negative, so a half is
     * skipped exactly when both of its tc0 entries are negative. */
    if ((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
    if ((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
/**
 * 16-pixel-wide vertical intra luma deblocking built from two unconditional
 * calls to the 8-pixel-wide ff_deblock_v8_luma_intra_8_mmxext().
 *
 * @param pix    first pixel of the edge; the second half starts at pix + 8
 * @param stride distance in bytes between rows (ptrdiff_t, matching the
 *               prototypes of the routines sharing the function pointer)
 * @param alpha  filter threshold, passed through unchanged
 * @param beta   filter threshold, passed through unchanged
 */
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, ptrdiff_t stride,
                                          int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
/* 10-bit vertical luma filters for mmxext (not part of LF_FUNCS). */
LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
/***********************************/
/* weighted prediction */
/* Declares void ff_h264_weight_<W>_<OPT>(dst, stride, height, log2_denom,
 * weight, offset) -- the 8-bit single-source weighting routine. */
#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride, \
int height, int log2_denom, \
int weight, int offset);
/* Declares void ff_h264_biweight_<W>_<OPT>(dst, src, stride, height,
 * log2_denom, weightd, weights, offset) -- the 8-bit two-source routine. */
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride, int height, \
int log2_denom, int weightd, \
int weights, int offset);
/* weight + biweight, mmxext only */
#define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)
/* mmxext and sse2 weight + biweight, plus an ssse3 biweight */
#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)
H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
/* width 4 is declared for mmxext only */
H264_BIWEIGHT_MMX(4)
/* Same prototypes as H264_WEIGHT / H264_BIWEIGHT with the bit depth added to
 * the function name: ff_h264_[bi]weight_<W>_<DEPTH>_<OPT>. */
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
ptrdiff_t stride, \
int height, \
int log2_denom, \
int weight, \
int offset);
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
uint8_t *src, \
ptrdiff_t stride, \
int height, \
int log2_denom, \
int weightd, \
int weights, \
int offset);
/* sse2 and sse4 weight + biweight for one width */
#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10(W, DEPTH, sse2) \
H264_WEIGHT_10(W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, DEPTH, sse4)
H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)
/**
 * Install the x86 implementations of the H.264 DSP routines into @p c.
 *
 * Nothing is done unless HAVE_X86ASM is set. Only bit depths 8 and 10 have
 * x86 routines; for any other depth only the loop-filter-strength routine may
 * be installed. Within one bit depth the instruction sets are tested from
 * oldest to newest, so a later match overwrites the pointers written by an
 * earlier one.
 *
 * @param c                 context whose function pointers are filled in
 * @param bit_depth         sample bit depth
 * @param chroma_format_idc values <= 1 select the plain chroma routines,
 *                          larger values select the "422" routines where
 *                          one is assigned
 */
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
/* bit-depth independent */
if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
/* the full IDCT routines are also installed as the DC-only entry points */
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_8_mmx;
c->h264_idct8_dc_add =
c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
/* this routine is only installed when the CPU also reports CMOV */
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
/* for chroma_format_idc > 1 the add8 pointer set above is kept */
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
/* the vertical luma entries use the static 2 x 8-pixel wrappers */
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
/* weight tables: index 0 = width 16, 1 = width 8, 2 = width 4 */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
/* only widths 16 and 8 are replaced; index 2 keeps its earlier value */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#if ARCH_X86_64
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_sse2;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
}
c->h264_idct_add = ff_h264_idct_add_8_sse2;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
/* ssse3 only provides biweight for widths 16 and 8 */
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#if ARCH_X86_64
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
}
c->h264_idct_add = ff_h264_idct_add_8_avx;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
/* the 10-bit mmxext deblocking routines are only installed on x86-32 */
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
}
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct_add = ff_h264_idct_add_10_sse2;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
/* these routines are only installed when HAVE_ALIGNED_STACK is set */
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
/* weight tables: index 0 = width 16, 1 = width 8, 2 = width 4 */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSE4(cpu_flags)) {
/* sse4 replaces all three widths of both weight tables */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
/* the full IDCT routine is also installed as the DC-only entry point */
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_10_avx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_avx;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
}
}
#endif
}

View File

@@ -0,0 +1,369 @@
; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
cextern pw_1023
%define max_pixels_10 pw_1023
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
; ADD_RES_MMX_4_8: add residuals to two rows of four 8-bit pixels.
; Reads 4 residual words from [r1] and 4 from [r1+8], widens the 4 pixel
; bytes at [r0] and [r0+r2] to words by interleaving with m4, adds with
; signed saturation, packs to unsigned-saturated bytes and stores both rows.
; Expects m4 = 0 (the caller clears it); clobbers m0-m3.
%macro ADD_RES_MMX_4_8 0
mova m0, [r1]
mova m2, [r1+8]
movd m1, [r0]
movd m3, [r0+r2]
punpcklbw m1, m4
punpcklbw m3, m4
paddsw m0, m1
paddsw m2, m3
packuswb m0, m4
packuswb m2, m4
movd [r0], m0
movd [r0+r2], m2
%endmacro
INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
; 4x4 block: two ADD_RES_MMX_4_8 invocations of two rows each.
cglobal hevc_add_residual_4_8, 3, 3, 6
; m4 = 0, used by the macro for byte->word widening
pxor m4, m4
ADD_RES_MMX_4_8
; skip the 16 bytes of residuals and the two rows just processed
add r1, 16
lea r0, [r0+r2*2]
ADD_RES_MMX_4_8
RET
; ADD_RES_SSE_8_8: add residuals to four rows of eight 8-bit pixels.
; Rows are at [r0], [r0+r2], [r0+r2*2] and [r0+r3]; their residuals are the
; four 16-byte groups starting at [r1].  Pixels are widened to words with m4,
; the residuals are added with signed saturation and each pair of rows is
; packed to unsigned-saturated bytes into one register (low half = first row,
; high half = second row).
; Expects m4 = 0 and r3 = 3*stride (set by the caller); clobbers m0-m3, m6, m7.
%macro ADD_RES_SSE_8_8 0
movq m0, [r0]
movq m1, [r0+r2]
punpcklbw m0, m4
punpcklbw m1, m4
mova m2, [r1]
mova m3, [r1+16]
paddsw m0, m2
paddsw m1, m3
packuswb m0, m1
movq m2, [r0+r2*2]
movq m3, [r0+r3]
punpcklbw m2, m4
punpcklbw m3, m4
mova m6, [r1+32]
mova m7, [r1+48]
paddsw m2, m6
paddsw m3, m7
packuswb m2, m3
movq [r0], m0
movhps [r0+r2], m0
movq [r0+r2*2], m2
movhps [r0+r3], m2
%endmacro
; ADD_RES_SSE_16_32_8 res_off, dst0, dst1
; Adds residuals to two mmsize-byte runs of 8-bit pixels at [dst0] and [dst1].
;   %1 = byte offset of the residuals relative to r1
;   %2 = address expression of the first pixel run
;   %3 = address expression of the second pixel run
; Each run is split into low/high byte halves widened with m0, the residual
; words are added with signed saturation, and the halves are packed back to
; unsigned-saturated bytes.
; On AVX2 the byte unpacks work per 128-bit lane, so the residual registers
; are assembled from two 16-byte loads (xm5/xm6 plus vinserti128) to match
; that lane layout.
; Expects m0 = 0 (the caller clears it); clobbers m1-m6.
%macro ADD_RES_SSE_16_32_8 3
mova m1, [%2]
mova m2, m1
punpcklbw m1, m0
punpckhbw m2, m0
mova xm5, [r1+%1]
mova xm6, [r1+%1+16]
%if cpuflag(avx2)
vinserti128 m5, m5, [r1+%1+32], 1
vinserti128 m6, m6, [r1+%1+48], 1
%endif
paddsw m1, m5
paddsw m2, m6
mova m3, [%3]
mova m4, m3
punpcklbw m3, m0
punpckhbw m4, m0
; residuals of the second run start 2*mmsize bytes after those of the first
mova xm5, [r1+%1+mmsize*2]
mova xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
vinserti128 m5, m5, [r1+%1+96], 1
vinserti128 m6, m6, [r1+%1+112], 1
%endif
paddsw m3, m5
paddsw m4, m6
packuswb m1, m2
packuswb m3, m4
mova [%2], m1
mova [%3], m3
%endmacro
; TRANSFORM_ADD_8: emits the 8-bit add_residual functions for 8x8, 16x16 and
; 32x32 blocks for the currently selected instruction set.
; In all of them r0 = dst, r1 = res, r2 = stride.
%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
; 8 rows: two ADD_RES_SSE_8_8 invocations of four rows each.
cglobal hevc_add_residual_8_8, 3, 4, 8
pxor m4, m4
lea r3, [r2*3]
ADD_RES_SSE_8_8
add r1, 64
lea r0, [r0+r2*4]
ADD_RES_SSE_8_8
RET
; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
; 16 rows: 4 loop iterations of four rows each (two macro invocations, each
; covering two rows of 16 pixels).
cglobal hevc_add_residual_16_8, 3, 5, 7
pxor m0, m0
lea r3, [r2*3]
mov r4d, 4
.loop:
ADD_RES_SSE_16_32_8 0, r0, r0+r2
ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
add r1, 128
lea r0, [r0+r2*4]
dec r4d
jg .loop
RET
; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
; 32 rows: 16 loop iterations of two rows each (one macro invocation per row,
; covering the two 16-pixel halves of that row).
cglobal hevc_add_residual_32_8, 3, 5, 7
pxor m0, m0
mov r4d, 16
.loop:
ADD_RES_SSE_16_32_8 0, r0, r0+16
ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
add r1, 128
lea r0, [r0+r2*2]
dec r4d
jg .loop
RET
%endmacro
; instantiate once per supported instruction set
INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
; 32 rows: 8 loop iterations of four rows each; with 32-byte registers one
; macro invocation covers two full rows of 32 pixels.
cglobal hevc_add_residual_32_8, 3, 5, 7
pxor m0, m0
lea r3, [r2*3]
mov r4d, 8
.loop:
ADD_RES_SSE_16_32_8 0, r0, r0+r2
ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
add r1, 256
lea r0, [r0+r2*4]
dec r4d
jg .loop
RET
%endif ;HAVE_AVX2_EXTERNAL
; ADD_RES_SSE_8_10 dst, stride, stride3, res
; Adds residuals to four rows of 16 bytes (eight words each).
;   %1 = dst pointer, %2 = stride, %3 = 3*stride, %4 = residual pointer
; Residuals are read from four consecutive 16-byte groups at [%4]; the sums
; are clipped to [m4, m5] and written back to the four rows.
; Expects m4/m5 = lower/upper clip bounds; clobbers m0-m3.
%macro ADD_RES_SSE_8_10 4
mova m0, [%4]
mova m1, [%4+16]
mova m2, [%4+32]
mova m3, [%4+48]
paddw m0, [%1+0]
paddw m1, [%1+%2]
paddw m2, [%1+%2*2]
paddw m3, [%1+%3]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
mova [%1+0], m0
mova [%1+%2], m1
mova [%1+%2*2], m2
mova [%1+%3], m3
%endmacro
; ADD_RES_MMX_4_10 dst, stride, res
; Adds residuals to two rows of four words.
;   %1 = dst pointer, %2 = stride, %3 = residual pointer
; The residuals for the second row are at [%3+8]; the sums are clipped to
; [m2, m3] and written back.
; Expects m2/m3 = lower/upper clip bounds; clobbers m0 and m1.
%macro ADD_RES_MMX_4_10 3
mova m0, [%1+0]
mova m1, [%1+%2]
paddw m0, [%3]
paddw m1, [%3+8]
CLIPW m0, m2, m3
CLIPW m1, m2, m3
mova [%1+0], m0
mova [%1+%2], m1
%endmacro
; ADD_RES_SSE_16_10 dst, stride, res
; Adds residuals to two rows of 32 bytes (sixteen words each).
;   %1 = dst pointer, %2 = stride, %3 = residual pointer
; Each row is handled as two 16-byte halves; residuals are read from four
; consecutive 16-byte groups at [%3].  Sums are clipped to [m4, m5].
; Expects m4/m5 = lower/upper clip bounds; clobbers m0-m3.
%macro ADD_RES_SSE_16_10 3
mova m0, [%3]
mova m1, [%3+16]
mova m2, [%3+32]
mova m3, [%3+48]
paddw m0, [%1]
paddw m1, [%1+16]
paddw m2, [%1+%2]
paddw m3, [%1+%2+16]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
mova [%1], m0
mova [%1+16], m1
mova [%1+%2], m2
mova [%1+%2+16], m3
%endmacro
; ADD_RES_SSE_32_10 dst, res
; Adds residuals to one row of 64 bytes (thirty-two words).
;   %1 = dst pointer, %2 = residual pointer
; The row is handled as four 16-byte groups; sums are clipped to [m4, m5].
; Expects m4/m5 = lower/upper clip bounds; clobbers m0-m3.
%macro ADD_RES_SSE_32_10 2
mova m0, [%2]
mova m1, [%2+16]
mova m2, [%2+32]
mova m3, [%2+48]
paddw m0, [%1]
paddw m1, [%1+16]
paddw m2, [%1+32]
paddw m3, [%1+48]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
mova [%1], m0
mova [%1+16], m1
mova [%1+32], m2
mova [%1+48], m3
%endmacro
; ADD_RES_AVX2_16_10 dst, stride, stride3, res
; Adds residuals to four rows of 32 bytes (sixteen words each).
;   %1 = dst pointer, %2 = stride, %3 = 3*stride, %4 = residual pointer
; Residuals are read from four consecutive 32-byte groups at [%4]; the sums
; are clipped to [m4, m5] and written back to the four rows.
; Expects m4/m5 = lower/upper clip bounds; clobbers m0-m3.
%macro ADD_RES_AVX2_16_10 4
mova m0, [%4]
mova m1, [%4+32]
mova m2, [%4+64]
mova m3, [%4+96]
paddw m0, [%1+0]
paddw m1, [%1+%2]
paddw m2, [%1+%2*2]
paddw m3, [%1+%3]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
mova [%1+0], m0
mova [%1+%2], m1
mova [%1+%2*2], m2
mova [%1+%3], m3
%endmacro
; ADD_RES_AVX2_32_10 dst, stride, res
; Adds residuals to two rows of 64 bytes (thirty-two words each).
;   %1 = dst pointer, %2 = stride, %3 = residual pointer
; Each row is handled as two 32-byte halves; residuals are read from four
; consecutive 32-byte groups at [%3].  Sums are clipped to [m4, m5].
; Expects m4/m5 = lower/upper clip bounds; clobbers m0-m3.
%macro ADD_RES_AVX2_32_10 3
mova m0, [%3]
mova m1, [%3+32]
mova m2, [%3+64]
mova m3, [%3+96]
paddw m0, [%1]
paddw m1, [%1+32]
paddw m2, [%1+%2]
paddw m3, [%1+%2+32]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
mova [%1], m0
mova [%1+32], m1
mova [%1+%2], m2
mova [%1+%2+32], m3
%endmacro
; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
; In all 10-bit functions below r0 = dst, r1 = block, r2 = stride, and the
; clip bounds are 0 and max_pixels_10.
INIT_MMX mmxext
; 4 rows: two ADD_RES_MMX_4_10 invocations of two rows each.
cglobal hevc_add_residual_4_10, 3, 3, 6
pxor m2, m2
mova m3, [max_pixels_10]
ADD_RES_MMX_4_10 r0, r2, r1
add r1, 16
lea r0, [r0+2*r2]
ADD_RES_MMX_4_10 r0, r2, r1
RET
INIT_XMM sse2
; 8 rows: two ADD_RES_SSE_8_10 invocations of four rows each.
; m4 = 0 and m5 = max_pixels_10 are the clip bounds; r3 = 3*stride.
cglobal hevc_add_residual_8_10, 3, 4, 6
pxor m4, m4
mova m5, [max_pixels_10]
lea r3, [r2*3]
ADD_RES_SSE_8_10 r0, r2, r3, r1
lea r0, [r0+r2*4]
add r1, 64
ADD_RES_SSE_8_10 r0, r2, r3, r1
RET
; 16 rows: 8 loop iterations of two rows each (64 bytes of residuals per
; iteration).  m4 = 0 and m5 = max_pixels_10 are the clip bounds.
cglobal hevc_add_residual_16_10, 3, 5, 6
pxor m4, m4
mova m5, [max_pixels_10]
mov r4d, 8
.loop:
ADD_RES_SSE_16_10 r0, r2, r1
lea r0, [r0+r2*2]
add r1, 64
dec r4d
jg .loop
RET
; 32 rows: 32 loop iterations of one row each (64 bytes of residuals per
; iteration).  m4 = 0 and m5 = max_pixels_10 are the clip bounds.
cglobal hevc_add_residual_32_10, 3, 5, 6
pxor m4, m4
mova m5, [max_pixels_10]
mov r4d, 32
.loop:
ADD_RES_SSE_32_10 r0, r1
lea r0, [r0+r2]
add r1, 64
dec r4d
jg .loop
RET
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; AVX2 16x16: 4 loop iterations of four rows each (128 bytes of residuals per
; iteration).  m4 = 0 and m5 = max_pixels_10 are the clip bounds; r3 = 3*stride.
cglobal hevc_add_residual_16_10, 3, 5, 6
pxor m4, m4
mova m5, [max_pixels_10]
lea r3, [r2*3]
mov r4d, 4
.loop:
ADD_RES_AVX2_16_10 r0, r2, r3, r1
lea r0, [r0+r2*4]
add r1, 128
dec r4d
jg .loop
RET
; AVX2 32x32: 16 loop iterations of two rows each (128 bytes of residuals per
; iteration).
cglobal hevc_add_residual_32_10, 3, 5, 6
pxor m4, m4
mova m5, [max_pixels_10]
mov r4d, 16
.loop:
ADD_RES_AVX2_32_10 r0, r2, r1
lea r0, [r0+r2*2]
add r1, 128
dec r4d
jg .loop
RET
%endif ;HAVE_AVX2_EXTERNAL

View File

@@ -0,0 +1,871 @@
;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; Read-only constants used by the deblocking filters below.
SECTION_RODATA
; 10-bit pixel maximum (1023) is shared with another object file
cextern pw_1023
%define pw_pixel_max_10 pw_1023
; 12-bit pixel maximum (4095), eight words
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
; eight words of -2; multiplied with tc to form -2 * tc in the luma filter
pw_m2: times 8 dw -2
; four dwords of 1; compared against to expand 0/1 flags into lane masks
pd_1 : times 4 dd 1
; word constants 4, 8 and -1 defined in another object file
cextern pw_4
cextern pw_8
cextern pw_m1
SECTION .text
INIT_XMM sse2
; Load eight 4-byte rows, transpose them and widen the bytes to words.
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
; clobbers: m4, m5, m6
%macro TRANSPOSE4x8B_LOAD 8
; rows 0-3: interleave bytes, then words
movd m0, %1
movd m2, %2
movd m1, %3
movd m3, %4
punpcklbw m0, m2
punpcklbw m1, m3
punpcklwd m0, m1
; rows 4-7: same interleave
movd m4, %5
movd m6, %6
movd m5, %7
movd m3, %8
punpcklbw m4, m6
punpcklbw m5, m3
punpcklwd m4, m5
; merge both halves at dword granularity
punpckhdq m2, m0, m4
punpckldq m0, m4
; zero-extend bytes to words (m5 = 0)
pxor m5, m5
punpckhbw m1, m0, m5
punpcklbw m0, m5
punpckhbw m3, m2, m5
punpcklbw m2, m5
%endmacro
; Pack four rows of words to unsigned bytes, transpose and store as eight
; 4-byte rows.
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m0, m1, m2
%macro TRANSPOSE8x4B_STORE 8
; packuswb saturates to 0..255, so no separate clip is needed for 8-bit
packuswb m0, m2
packuswb m1, m3
SBUTTERFLY bw, 0, 1, 2
SBUTTERFLY wd, 0, 1, 2
; pshufd 0x39 rotates the register right by one dword so that movd can
; store each of the four dwords in turn
movd %1, m0
pshufd m0, m0, 0x39
movd %2, m0
pshufd m0, m0, 0x39
movd %3, m0
pshufd m0, m0, 0x39
movd %4, m0
movd %5, m1
pshufd m1, m1, 0x39
movd %6, m1
pshufd m1, m1, 0x39
movd %7, m1
pshufd m1, m1, 0x39
movd %8, m1
%endmacro
; Load eight 4-word rows and transpose them.
; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
; clobbers: m4, m5, m6
%macro TRANSPOSE4x8W_LOAD 8
; rows 0-3: interleave words, then dwords
movq m0, %1
movq m2, %2
movq m1, %3
movq m3, %4
punpcklwd m0, m2
punpcklwd m1, m3
punpckhdq m2, m0, m1
punpckldq m0, m1
; rows 4-7: same interleave
movq m4, %5
movq m6, %6
movq m5, %7
movq m3, %8
punpcklwd m4, m6
punpcklwd m5, m3
punpckhdq m6, m4, m5
punpckldq m4, m5
; merge both halves at qword granularity
punpckhqdq m1, m0, m4
punpcklqdq m0, m4
punpckhqdq m3, m2, m6
punpcklqdq m2, m6
%endmacro
; Transpose four rows of words, clip to [0, %9] and store as eight 4-word rows.
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
; %9: upper clip bound (maximum pixel value for the bit depth)
; clobbers: m4 (transpose temporary), m5 (zero)
%macro TRANSPOSE8x4W_STORE 9
TRANSPOSE4x4W 0, 1, 2, 3, 4
pxor m5, m5; zeros reg
CLIPW m0, m5, %9
CLIPW m1, m5, %9
CLIPW m2, m5, %9
CLIPW m3, m5, %9
; low qword -> even-numbered output row, high qword -> odd-numbered row
movq %1, m0
movhps %2, m0
movq %3, m1
movhps %4, m1
movq %5, m2
movhps %6, m2
movq %7, m3
movhps %8, m3
%endmacro
; Load eight 8-byte rows, transpose them and widen the bytes to words.
; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
; clobbers: m9, m13, m15 (so this needs the 16-register x86-64 set)
%macro TRANSPOSE8x8B_LOAD 8
; rows 0-3: interleave bytes, then words
movq m7, %1
movq m2, %2
movq m1, %3
movq m3, %4
punpcklbw m7, m2
punpcklbw m1, m3
punpcklwd m3, m7, m1
punpckhwd m7, m1
; rows 4-7: same interleave
movq m4, %5
movq m6, %6
movq m5, %7
movq m15, %8
punpcklbw m4, m6
punpcklbw m5, m15
punpcklwd m9, m4, m5
punpckhwd m4, m5
; merge at dword granularity; each register now holds two output rows
punpckldq m1, m3, m9; 0, 1
punpckhdq m3, m9; 2, 3
punpckldq m5, m7, m4; 4, 5
punpckhdq m7, m4; 6, 7
; zero-extend bytes to words (m13 = 0)
pxor m13, m13
punpcklbw m0, m1, m13; 0 in 16 bit
punpckhbw m1, m13; 1 in 16 bit
punpcklbw m2, m3, m13; 2
punpckhbw m3, m13; 3
punpcklbw m4, m5, m13; 4
punpckhbw m5, m13; 5
punpcklbw m6, m7, m13; 6
punpckhbw m7, m13; 7
%endmacro
; Pack eight rows of words to unsigned bytes, transpose and store.
; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
; clobbers: m0..m4
%macro TRANSPOSE8x8B_STORE 8
; packuswb saturates to 0..255; rows n and n+4 share one register afterwards
packuswb m0, m4
packuswb m1, m5
packuswb m2, m6
packuswb m3, m7
TRANSPOSE2x4x4B 0, 1, 2, 3, 4
; low qword -> odd-numbered argument, high qword -> the following one
movq %1, m0
movhps %2, m0
movq %3, m1
movhps %4, m1
movq %5, m2
movhps %6, m2
movq %7, m3
movhps %8, m3
%endmacro
; Load eight 8-word rows with unaligned loads and transpose them in registers.
; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
; clobbers: m8 (transpose temporary)
%macro TRANSPOSE8x8W_LOAD 8
movdqu m0, %1
movdqu m1, %2
movdqu m2, %3
movdqu m3, %4
movdqu m4, %5
movdqu m5, %6
movdqu m6, %7
movdqu m7, %8
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro
; Transpose eight rows of words, clip to [0, %9] and store with unaligned stores.
; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8
; %9: upper clip bound (maximum pixel value for the bit depth)
; clobbers: m8 (transpose temporary, then zero)
%macro TRANSPOSE8x8W_STORE 9
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
; m8 = 0 serves as the lower clip bound
pxor m8, m8
CLIPW m0, m8, %9
CLIPW m1, m8, %9
CLIPW m2, m8, %9
CLIPW m3, m8, %9
CLIPW m4, m8, %9
CLIPW m5, m8, %9
CLIPW m6, m8, %9
CLIPW m7, m8, %9
movdqu %1, m0
movdqu %2, m1
movdqu %3, m2
movdqu %4, m3
movdqu %5, m4
movdqu %6, m5
movdqu %7, m6
movdqu %8, m7
%endmacro
; Blend under a mask: %1 = (%2 AND m11) OR (%1 AND NOT m11)
; in: %2 = new values (clobbered), %1 = old values
; out: %1
; mask in m11 (left unchanged)
; clobbers m10
%macro MASKED_COPY 2
pand %2, m11 ; and mask
pandn m10, m11, %1; and ~mask (keep old values where the mask is clear)
por %2, m10
mova %1, %2
%endmacro
; Blend under an explicit mask: %1 = (%2 AND %3) OR (%1 AND NOT %3)
; in: %2 = new values (clobbered), %1 = old values
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
pand %2, %3 ; and mask
pandn %3, %1; and ~mask (keep old values where the mask is clear)
por %2, %3
mova %1, %2
%endmacro
ALIGN 16
; Chroma deblocking core.
; input in m0 ... m3 (p1, p0, q0, q1 as words) and tcs in r2. Output in m1 and m2
; %1 = bit depth; for depths above 8 the tc bounds are scaled by << (depth - 8)
; Computes delta = clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc),
; then p0 += delta and q0 -= delta.
; clobbers: m4, m5, m6
%macro CHROMA_DEBLOCK_BODY 1
psubw m4, m2, m1; q0 - p0
psubw m5, m0, m3; p1 - q1
psllw m4, 2; << 2
paddw m5, m4;
;tc calculations
; reads two 32-bit tc values and broadcasts the low word of each across
; four word lanes (tc0 in the low half, tc1 in the high half)
movq m6, [tcq]; tc0
punpcklwd m6, m6
pshufd m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
psignw m4, m6, [pw_m1]; -tc0, -tc1
%else
pmullw m4, m6, [pw_m1]; -tc0, -tc1
%endif
;end tc calculations
paddw m5, [pw_4]; +4
psraw m5, 3; >> 3
%if %1 > 8
psllw m4, %1-8; << (BIT_DEPTH - 8)
psllw m6, %1-8; << (BIT_DEPTH - 8)
%endif
; clamp delta to [-tc, tc]
pmaxsw m5, m4
pminsw m5, m6
paddw m1, m5; p0 + delta0
psubw m2, m5; q0 - delta0
%endmacro
; Luma deblocking core: filter decision plus strong and weak filters.
; input in m0 ... m7 (p3, p2, p1, p0, q0, q1, q2, q3 as words), beta in r2 tcs in r3.
; Output in m1...m6
; %1 = bit depth (beta and tc are scaled by << (depth - 8) above 8 bits)
; %2 = direction tag passed by callers (v / h); not referenced in this body
; Jumps to the caller-provided local labels .bypassluma (nothing to filter)
; and .store (write back results); defines .weakfilter itself.
; Uses m8-m15 and r6-r13 as scratch; overwrites betaq (r2) and r3 (the tc
; pointer is replaced by the tc1 value once both tc values have been read).
%macro LUMA_DEBLOCK_BODY 2
; dp = abs(p2 - 2 * p1 + p0)
psllw m9, m2, 1; *2
psubw m10, m1, m9
paddw m10, m3
ABS1 m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3
; dq = abs(q2 - 2 * q1 + q0)
psllw m9, m5, 1; *2
psubw m11, m6, m9
paddw m11, m4
ABS1 m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3
;beta calculations
%if %1 > 8
shl betaq, %1 - 8
%endif
; broadcast the low word of beta to all lanes
movd m13, betad
SPLATW m13, m13, 0
;end beta calculations
paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3
pshufhw m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
pshuflw m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low
pshufhw m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
pshuflw m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
paddw m14, m9; 0d0+0d3, 1d0+1d3
;compare
pcmpgtw m15, m13, m14
movmskps r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
; neither 4-line segment passes the beta test: nothing to filter
test r13, r13
je .bypassluma
;weak / strong decision compare to beta_2
psraw m15, m13, 2; beta >> 2
psllw m8, m9, 1;
pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
movmskps r6, m15;
;end weak / strong decision
; weak filter nd_p/q calculation
; the dp / dq sums are parked in r7-r10 because m10 / m11 are reused below
pshufd m8, m10, 0x31
psrld m8, 16
paddw m8, m10
movd r7d, m8
pshufd m8, m8, 0x4E
movd r8d, m8
pshufd m8, m11, 0x31
psrld m8, 16
paddw m8, m11
movd r9d, m8
pshufd m8, m8, 0x4E
movd r10d, m8
; end calc for weak filter
; filtering mask
; m11 = per-half lane mask built from bit 0 and bit 3 of r13;
; r13 is compacted to bit 0 | (bit 3 << 1)
mov r11, r13
shr r11, 3
movd m15, r11d
and r13, 1
movd m11, r13d
shufps m11, m15, 0
shl r11, 1
or r13, r11
pcmpeqd m11, [pd_1]; filtering mask
;decide between strong and weak filtering
;tc25 calculations
mov r11d, [tcq];
%if %1 > 8
shl r11, %1 - 8
%endif
movd m8, r11d; tc0
; r3 held the tc pointer; from here on it holds tc1
mov r3d, [tcq+4];
%if %1 > 8
shl r3, %1 - 8
%endif
add r11d, r3d; tc0 + tc1
; both tc values zero: skip filtering entirely
jz .bypassluma
movd m9, r3d; tc1
punpcklwd m8, m8
punpcklwd m9, m9
shufps m8, m9, 0; tc0, tc1
mova m9, m8
psllw m8, 2; tc << 2
pavgw m8, m9; tc25 = ((tc * 5 + 1) >> 1)
;end tc25 calculations
;----beta_3 comparison-----
psubw m12, m0, m3; p3 - p0
ABS1 m12, m14; abs(p3 - p0)
psubw m15, m7, m4; q3 - q0
ABS1 m15, m14; abs(q3 - q0)
paddw m12, m15; abs(p3 - p0) + abs(q3 - q0)
pshufhw m12, m12, 0xf0 ;0b11110000;
pshuflw m12, m12, 0xf0 ;0b11110000;
psraw m13, 3; beta >> 3
pcmpgtw m13, m12;
movmskps r11, m13;
and r6, r11; strong mask , beta_2 and beta_3 comparisons
;----beta_3 comparison end-----
;----tc25 comparison---
psubw m12, m3, m4; p0 - q0
ABS1 m12, m14; abs(p0 - q0)
pshufhw m12, m12, 0xf0 ;0b11110000;
pshuflw m12, m12, 0xf0 ;0b11110000;
pcmpgtw m8, m12; tc25 comparisons
movmskps r11, m8;
and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
;----tc25 comparison end---
; a segment is strong only if both of its tested lines pass: AND each
; bit with its upper neighbour, then keep bits 0 and 2
mov r11, r6;
shr r11, 1;
and r6, r11; strong mask, bits 2 and 0
; m14 = -2 * tc and m9 = 2 * tc: clip bounds of the strong filter
pmullw m14, m9, [pw_m2]; -tc * 2
paddw m9, m9
and r6, 5; 0b101
mov r11, r6; strong mask
shr r6, 2;
movd m12, r6d; store to xmm for mask generation
shl r6, 1
and r11, 1
movd m10, r11d; store to xmm for mask generation
or r6, r11; final strong mask, bits 1 and 0
jz .weakfilter
; ---- strong filter ----
shufps m10, m12, 0
pcmpeqd m10, [pd_1]; strong mask
mova m13, [pw_4]; 4 in every cell
pand m11, m10; combine filtering mask and strong mask
paddw m12, m2, m3; p1 + p0
paddw m12, m4; p1 + p0 + q0
mova m10, m12; copy
paddw m12, m12; 2*p1 + 2*p0 + 2*q0
paddw m12, m1; p2 + 2*p1 + 2*p0 + 2*q0
paddw m12, m5; p2 + 2*p1 + 2*p0 + 2*q0 + q1
paddw m12, m13; p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
psraw m12, 3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
psubw m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
pmaxsw m12, m14
pminsw m12, m9; av_clip( , -2 * tc, 2 * tc)
paddw m12, m3; p0'
paddw m15, m1, m10; p2 + p1 + p0 + q0
psrlw m13, 1; 2 in every cell
paddw m15, m13; p2 + p1 + p0 + q0 + 2
psraw m15, 2; (p2 + p1 + p0 + q0 + 2) >> 2
psubw m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
pmaxsw m15, m14
pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
paddw m15, m2; p1'
paddw m8, m1, m0; p3 + p2
paddw m8, m8; 2*p3 + 2*p2
paddw m8, m1; 2*p3 + 3*p2
paddw m8, m10; 2*p3 + 3*p2 + p1 + p0 + q0
paddw m13, m13; back to 4 in every cell
paddw m8, m13; 2*p3 + 3*p2 + p1 + p0 + q0 + 4
psraw m8, 3; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
psubw m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
pmaxsw m8, m14
pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
paddw m8, m1; p2'
MASKED_COPY m1, m8
paddw m8, m3, m4; p0 + q0
paddw m8, m5; p0 + q0 + q1
paddw m8, m8; 2*p0 + 2*q0 + 2*q1
paddw m8, m2; p1 + 2*p0 + 2*q0 + 2*q1
paddw m8, m6; p1 + 2*p0 + 2*q0 + 2*q1 + q2
paddw m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
psraw m8, 3; (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
psubw m8, m4;
pmaxsw m8, m14
pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
paddw m8, m4; q0'
MASKED_COPY m2, m15
paddw m15, m3, m4; p0 + q0
paddw m15, m5; p0 + q0 + q1
mova m10, m15;
paddw m15, m6; p0 + q0 + q1 + q2
psrlw m13, 1; 2 in every cell
paddw m15, m13; p0 + q0 + q1 + q2 + 2
psraw m15, 2; (p0 + q0 + q1 + q2 + 2) >> 2
psubw m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
pmaxsw m15, m14
pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
paddw m15, m5; q1'
paddw m13, m7; q3 + 2
paddw m13, m6; q3 + q2 + 2
paddw m13, m13; 2*q3 + 2*q2 + 4
paddw m13, m6; 2*q3 + 3*q2 + 4
paddw m13, m10; 2*q3 + 3*q2 + q1 + q0 + p0 + 4
psraw m13, 3; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
psubw m13, m6; ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
pmaxsw m13, m14
pminsw m13, m9; av_clip( , -2 * tc, 2 * tc)
paddw m13, m6; q2'
; p0 and q0 are written last because the sums above still read the
; unfiltered m3 / m4
MASKED_COPY m6, m13
MASKED_COPY m5, m15
MASKED_COPY m4, m8
MASKED_COPY m3, m12
.weakfilter:
; ---- weak filter ----
not r6; strong mask -> weak mask
and r6, r13; final weak filtering mask, bits 0 and 1
jz .store
; weak filtering mask
mov r11, r6
shr r11, 1
movd m12, r11d
and r6, 1
movd m11, r6d
shufps m11, m12, 0
pcmpeqd m11, [pd_1]; filtering mask
; side threshold for the p1 / q1 updates
mov r13, betaq
shr r13, 1;
add betaq, r13
shr betaq, 3; ((beta + (beta >> 1)) >> 3))
mova m13, [pw_8]
psubw m12, m4, m3 ; q0 - p0
psllw m10, m12, 3; 8 * (q0 - p0)
paddw m12, m10 ; 9 * (q0 - p0)
psubw m10, m5, m2 ; q1 - p1
psllw m8, m10, 1; 2 * ( q1 - p1 )
paddw m10, m8; 3 * ( q1 - p1 )
psubw m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
paddw m12, m13; + 8
psraw m12, 4; >> 4 , delta0
PABSW m13, m12; abs(delta0)
; m9 still holds 2 * tc here
psllw m10, m9, 2; 8 * tc
paddw m10, m9; 10 * tc
; lanes with abs(delta0) >= 10 * tc are dropped from the filter mask
pcmpgtw m10, m13
pand m11, m10
psraw m9, 1; tc * 2 -> tc
psraw m14, 1; -tc * 2 -> -tc
pmaxsw m12, m14
pminsw m12, m9; av_clip(delta0, -tc, tc)
psraw m9, 1; tc -> tc / 2
%if cpuflag(ssse3)
psignw m14, m9, [pw_m1]; -tc / 2
%else
pmullw m14, m9, [pw_m1]; -tc / 2
%endif
pavgw m15, m1, m3; (p2 + p0 + 1) >> 1
psubw m15, m2; ((p2 + p0 + 1) >> 1) - p1
paddw m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
psraw m15, 1; (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
pmaxsw m15, m14
pminsw m15, m9; av_clip(deltap1, -tc/2, tc/2)
paddw m15, m2; p1'
;beta calculations
movd m10, betad
SPLATW m10, m10, 0
; reload the dp sums saved in r7 / r8 before the filter decision
movd m13, r7d; 1dp0 + 1dp3
movd m8, r8d; 0dp0 + 0dp3
punpcklwd m8, m8
punpcklwd m13, m13
shufps m13, m8, 0;
pcmpgtw m8, m10, m13
pand m8, m11
;end beta calculations
MASKED_COPY2 m2, m15, m8; write p1'
pavgw m8, m6, m4; (q2 + q0 + 1) >> 1
psubw m8, m5; ((q2 + q0 + 1) >> 1) - q1
psubw m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
psraw m8, 1; ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
pmaxsw m8, m14
pminsw m8, m9; av_clip(deltaq1, -tc/2, tc/2)
paddw m8, m5; q1'
; reload the dq sums saved in r9 / r10
movd m13, r9d;
movd m15, r10d;
punpcklwd m15, m15
punpcklwd m13, m13
shufps m13, m15, 0; dq0 + dq3
pcmpgtw m10, m13; compare to ((beta+(beta>>1))>>3)
pand m10, m11
MASKED_COPY2 m5, m8, m10; write q1'
paddw m15, m3, m12 ; p0 + delta0
MASKED_COPY m3, m15
psubw m8, m4, m12 ; q0 - delta0
MASKED_COPY m4, m8
%endmacro
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
; uint8_t *_no_p, uint8_t *_no_q);
;
; Only the first three arguments are loaded (pix, stride, tc); _no_p and
; _no_q are not read by this implementation.
; The vertical-edge variants load 8 rows of 4 pixels starting two pixels to
; the left of the edge, transpose them into m0..m3, filter, and transpose
; back on store.
;-----------------------------------------------------------------------------
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
; step back two 8-bit pixels; pix0 = rows 0..3, pix = rows 3..7 base
sub pixq, 2
lea r3strideq, [3*strideq]
mov pix0q, pixq
add pixq, r3strideq
TRANSPOSE4x8B_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 8
TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
RET
cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
; step back two 16-bit pixels (4 bytes)
sub pixq, 4
lea r3strideq, [3*strideq]
mov pix0q, pixq
add pixq, r3strideq
TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 10
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
RET
cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
; step back two 16-bit pixels (4 bytes)
sub pixq, 4
lea r3strideq, [3*strideq]
mov pix0q, pixq
add pixq, r3strideq
TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 12
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
RET
;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
; uint8_t *_no_p, uint8_t *_no_q);
;
; The horizontal-edge variants read the two rows above and the two rows below
; the edge directly (no transpose) and write back only p0 and q0.
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
; pix0 = pix - 2 * stride (row p1)
mov pix0q, pixq
sub pix0q, strideq
sub pix0q, strideq
movq m0, [pix0q]; p1
movq m1, [pix0q+strideq]; p0
movq m2, [pixq]; q0
movq m3, [pixq+strideq]; q1
pxor m5, m5; zeros reg
; widen the 8-bit pixels to words
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m3, m5
CHROMA_DEBLOCK_BODY 8
; pack with unsigned saturation: p0 in the low half, q0 in the high half
packuswb m1, m2
movh[pix0q+strideq], m1
movhps [pixq], m1
RET
cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
; pix0 = pix - 2 * stride (row p1)
mov pix0q, pixq
sub pix0q, strideq
sub pix0q, strideq
movu m0, [pix0q]; p1
movu m1, [pix0q+strideq]; p0
movu m2, [pixq]; q0
movu m3, [pixq+strideq]; q1
CHROMA_DEBLOCK_BODY 10
pxor m5, m5; zeros reg
; clip results to the 10-bit pixel range
CLIPW m1, m5, [pw_pixel_max_10]
CLIPW m2, m5, [pw_pixel_max_10]
movu [pix0q+strideq], m1
movu [pixq], m2
RET
cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
; pix0 = pix - 2 * stride (row p1)
mov pix0q, pixq
sub pix0q, strideq
sub pix0q, strideq
movu m0, [pix0q]; p1
movu m1, [pix0q+strideq]; p0
movu m2, [pixq]; q0
movu m3, [pixq+strideq]; q1
CHROMA_DEBLOCK_BODY 12
pxor m5, m5; zeros reg
; clip results to the 12-bit pixel range
CLIPW m1, m5, [pw_pixel_max_12]
CLIPW m2, m5, [pw_pixel_max_12]
movu [pix0q+strideq], m1
movu [pixq], m2
RET
%endmacro
; instantiate the chroma filters for SSE2 and AVX
INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA
; The luma filters use m8-m15 and up to 14 general-purpose registers, hence
; they are only assembled for x86-64.
%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;
; Only the first four arguments are loaded; _no_p and _no_q are not read.
; NOTE: in the three vertical variants the named registers hold swapped
; roles: pix0q holds 3 * stride and src3strideq holds the base pointer of
; the first row.
; LUMA_DEBLOCK_BODY jumps to .store / .bypassluma defined in each function.
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
; step back four 8-bit pixels (r1 is the stride register)
sub pixq, 4
lea pix0q, [3 * r1]
mov src3strideq, pixq
add pixq, pix0q
TRANSPOSE8x8B_LOAD PASS8ROWS(src3strideq, pixq, r1, pix0q)
LUMA_DEBLOCK_BODY 8, v
.store:
TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
RET
cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
; step back four 16-bit pixels (8 bytes)
sub pixq, 8
lea pix0q, [3 * strideq]
mov src3strideq, pixq
add pixq, pix0q
TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
LUMA_DEBLOCK_BODY 10, v
.store:
TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
.bypassluma:
RET
cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
; step back four 16-bit pixels (8 bytes)
sub pixq, 8
lea pix0q, [3 * strideq]
mov src3strideq, pixq
add pixq, pix0q
TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
LUMA_DEBLOCK_BODY 12, v
.store:
TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
RET
;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;
; Horizontal-edge variants: pix0 = pix - 4 * stride (row p3); the eight rows
; p3..q3 are loaded directly and rows p2..q2 are written back.
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
sub pix0q, strideq
movq m0, [pix0q]; p3
movq m1, [pix0q + strideq]; p2
movq m2, [pix0q + 2 * strideq]; p1
movq m3, [pix0q + src3strideq]; p0
movq m4, [pixq]; q0
movq m5, [pixq + strideq]; q1
movq m6, [pixq + 2 * strideq]; q2
movq m7, [pixq + src3strideq]; q3
; widen the 8-bit pixels to words
pxor m8, m8
punpcklbw m0, m8
punpcklbw m1, m8
punpcklbw m2, m8
punpcklbw m3, m8
punpcklbw m4, m8
punpcklbw m5, m8
punpcklbw m6, m8
punpcklbw m7, m8
LUMA_DEBLOCK_BODY 8, h
.store:
; pack pairs of rows with unsigned saturation and store each half
packuswb m1, m2
packuswb m3, m4
packuswb m5, m6
movh [pix0q + strideq], m1
movhps [pix0q + 2 * strideq], m1
movh [pix0q + src3strideq], m3
movhps [pixq ], m3
movh [pixq + strideq], m5
movhps [pixq + 2 * strideq], m5
.bypassluma:
RET
cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
sub pix0q, strideq
movdqu m0, [pix0q]; p3
movdqu m1, [pix0q + strideq]; p2
movdqu m2, [pix0q + 2 * strideq]; p1
movdqu m3, [pix0q + src3strideq]; p0
movdqu m4, [pixq]; q0
movdqu m5, [pixq + strideq]; q1
movdqu m6, [pixq + 2 * strideq]; q2
movdqu m7, [pixq + src3strideq]; q3
LUMA_DEBLOCK_BODY 10, h
.store:
; clip results to the 10-bit pixel range before writing back
pxor m8, m8; zeros reg
CLIPW m1, m8, [pw_pixel_max_10]
CLIPW m2, m8, [pw_pixel_max_10]
CLIPW m3, m8, [pw_pixel_max_10]
CLIPW m4, m8, [pw_pixel_max_10]
CLIPW m5, m8, [pw_pixel_max_10]
CLIPW m6, m8, [pw_pixel_max_10]
movdqu [pix0q + strideq], m1; p2
movdqu [pix0q + 2 * strideq], m2; p1
movdqu [pix0q + src3strideq], m3; p0
movdqu [pixq ], m4; q0
movdqu [pixq + strideq], m5; q1
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
sub pix0q, strideq
movdqu m0, [pix0q]; p3
movdqu m1, [pix0q + strideq]; p2
movdqu m2, [pix0q + 2 * strideq]; p1
movdqu m3, [pix0q + src3strideq]; p0
movdqu m4, [pixq]; q0
movdqu m5, [pixq + strideq]; q1
movdqu m6, [pixq + 2 * strideq]; q2
movdqu m7, [pixq + src3strideq]; q3
LUMA_DEBLOCK_BODY 12, h
.store:
; clip results to the 12-bit pixel range before writing back
pxor m8, m8; zeros reg
CLIPW m1, m8, [pw_pixel_max_12]
CLIPW m2, m8, [pw_pixel_max_12]
CLIPW m3, m8, [pw_pixel_max_12]
CLIPW m4, m8, [pw_pixel_max_12]
CLIPW m5, m8, [pw_pixel_max_12]
CLIPW m6, m8, [pw_pixel_max_12]
movdqu [pix0q + strideq], m1; p2
movdqu [pix0q + 2 * strideq], m2; p1
movdqu [pix0q + src3strideq], m3; p0
movdqu [pixq ], m4; q0
movdqu [pixq + strideq], m5; q1
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
%endmacro
; instantiate the luma filters for SSE2, SSSE3 and AVX
INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif

853
externals/ffmpeg/libavcodec/x86/hevc_idct.asm vendored Executable file
View File

@@ -0,0 +1,853 @@
;*******************************************************************************
;* SIMD-optimized IDCT functions for HEVC decoding
;* Copyright (c) 2014 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Rounding constants, selected by name as pd_<1 << (shift - 1)>:
; pd_64 for shift 7, pd_2048 for shift 12, pd_512 for shift 10
pd_64: times 4 dd 64
pd_2048: times 4 dd 2048
pd_512: times 4 dd 512
; 4x4 transform coeffs
; each constant is a word pair repeated four times, for use with pmaddwd
; on word-interleaved rows
cextern pw_64
pw_64_m64: times 4 dw 64, -64
pw_83_36: times 4 dw 83, 36
pw_36_m83: times 4 dw 36, -83
; 8x8 transform coeffs
pw_89_75: times 4 dw 89, 75
pw_50_18: times 4 dw 50, 18
pw_75_m18: times 4 dw 75, -18
pw_m89_m50: times 4 dw -89, -50
pw_50_m89: times 4 dw 50, -89
pw_18_75: times 4 dw 18, 75
pw_18_m50: times 4 dw 18, -50
pw_75_m89: times 4 dw 75, -89
; 16x16 transformation coeffs
; Eight groups of four 16-byte entries (64 bytes per group). E16_O16 reads
; one group per odd output term at trans_coeffs16 + k * 64, using the
; entries at +0, +16, +32 and +48.
; group 0
trans_coeffs16: times 4 dw 90, 87
times 4 dw 80, 70
times 4 dw 57, 43
times 4 dw 25, 9
; group 1
times 4 dw 87, 57
times 4 dw 9, -43
times 4 dw -80, -90
times 4 dw -70, -25
; group 2
times 4 dw 80, 9
times 4 dw -70, -87
times 4 dw -25, 57
times 4 dw 90, 43
; group 3
times 4 dw 70, -43
times 4 dw -87, 9
times 4 dw 90, 25
times 4 dw -80, -57
; group 4
times 4 dw 57, -80
times 4 dw -25, 90
times 4 dw -9, -87
times 4 dw 43, 70
; group 5
times 4 dw 43, -90
times 4 dw 57, 25
times 4 dw -87, 70
times 4 dw 9, -80
; group 6
times 4 dw 25, -70
times 4 dw 90, -80
times 4 dw 43, 9
times 4 dw -57, 87
; group 7
times 4 dw 9, -25
times 4 dw 43, -57
times 4 dw 70, -80
times 4 dw 87, -90
; 32x32 transform coeffs
; 128 entries of 16 bytes, each a word pair repeated four times (the first
; entry is the pair 90, 90). The "group" markers below split the table every
; eight entries (16 coefficients, 128 bytes); presumably one group per odd
; output term of the 32-point transform -- the code that reads this table is
; outside this part of the file, confirm there.
; group 0
trans_coeff32: times 8 dw 90
times 4 dw 88, 85
times 4 dw 82, 78
times 4 dw 73, 67
times 4 dw 61, 54
times 4 dw 46, 38
times 4 dw 31, 22
times 4 dw 13, 4
; group 1
times 4 dw 90, 82
times 4 dw 67, 46
times 4 dw 22, -4
times 4 dw -31, -54
times 4 dw -73, -85
times 4 dw -90, -88
times 4 dw -78, -61
times 4 dw -38, -13
; group 2
times 4 dw 88, 67
times 4 dw 31, -13
times 4 dw -54, -82
times 4 dw -90, -78
times 4 dw -46, -4
times 4 dw 38, 73
times 4 dw 90, 85
times 4 dw 61, 22
; group 3
times 4 dw 85, 46
times 4 dw -13, -67
times 4 dw -90, -73
times 4 dw -22, 38
times 4 dw 82, 88
times 4 dw 54, -4
times 4 dw -61, -90
times 4 dw -78, -31
; group 4
times 4 dw 82, 22
times 4 dw -54, -90
times 4 dw -61, 13
times 4 dw 78, 85
times 4 dw 31, -46
times 4 dw -90, -67
times 4 dw 4, 73
times 4 dw 88, 38
; group 5
times 4 dw 78, -4
times 4 dw -82, -73
times 4 dw 13, 85
times 4 dw 67, -22
times 4 dw -88, -61
times 4 dw 31, 90
times 4 dw 54, -38
times 4 dw -90, -46
; group 6
times 4 dw 73, -31
times 4 dw -90, -22
times 4 dw 78, 67
times 4 dw -38, -90
times 4 dw -13, 82
times 4 dw 61, -46
times 4 dw -88, -4
times 4 dw 85, 54
; group 7
times 4 dw 67, -54
times 4 dw -78, 38
times 4 dw 85, -22
times 4 dw -90, 4
times 4 dw 90, 13
times 4 dw -88, -31
times 4 dw 82, 46
times 4 dw -73, -61
; group 8
times 4 dw 61, -73
times 4 dw -46, 82
times 4 dw 31, -88
times 4 dw -13, 90
times 4 dw -4, -90
times 4 dw 22, 85
times 4 dw -38, -78
times 4 dw 54, 67
; group 9
times 4 dw 54, -85
times 4 dw -4, 88
times 4 dw -46, -61
times 4 dw 82, 13
times 4 dw -90, 38
times 4 dw 67, -78
times 4 dw -22, 90
times 4 dw -31, -73
; group 10
times 4 dw 46, -90
times 4 dw 38, 54
times 4 dw -90, 31
times 4 dw 61, -88
times 4 dw 22, 67
times 4 dw -85, 13
times 4 dw 73, -82
times 4 dw 4, 78
; group 11
times 4 dw 38, -88
times 4 dw 73, -4
times 4 dw -67, 90
times 4 dw -46, -31
times 4 dw 85, -78
times 4 dw 13, 61
times 4 dw -90, 54
times 4 dw 22, -82
; group 12
times 4 dw 31, -78
times 4 dw 90, -61
times 4 dw 4, 54
times 4 dw -88, 82
times 4 dw -38, -22
times 4 dw 73, -90
times 4 dw 67, -13
times 4 dw -46, 85
; group 13
times 4 dw 22, -61
times 4 dw 85, -90
times 4 dw 73, -38
times 4 dw -4, 46
times 4 dw -78, 90
times 4 dw -82, 54
times 4 dw -13, -31
times 4 dw 67, -88
; group 14
times 4 dw 13, -38
times 4 dw 61, -78
times 4 dw 88, -90
times 4 dw 85, -73
times 4 dw 54, -31
times 4 dw 4, 22
times 4 dw -46, 67
times 4 dw -82, 90
; group 15
times 4 dw 4, -13
times 4 dw 22, -31
times 4 dw 38, -46
times 4 dw 54, -61
times 4 dw 67, -73
times 4 dw 78, -82
times 4 dw 85, -88
times 4 dw 90, -90
SECTION .text
; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; DC-only inverse transform: the block is filled with one value derived from
; the first coefficient, (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth).
; %1 = HxW
; %2 = number of loops (each iteration stores 8 * mmsize bytes)
; %3 = bitdepth
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
movsx tmpd, word [coeffq]
add tmpd, (1 << (14 - %3)) + 1
sar tmpd, (15 - %3)
; movd targets the xmm-sized alias, SPLATW then broadcasts to the full width
movd xm0, tmpd
SPLATW m0, xm0
; tmp is dead from here on; its register is reused as the loop counter
DEFINE_ARGS coeff, cnt
mov cntd, %2
.loop:
mova [coeffq+mmsize*0], m0
mova [coeffq+mmsize*1], m0
mova [coeffq+mmsize*2], m0
mova [coeffq+mmsize*3], m0
; the pointer is advanced halfway through so that the second group of
; stores uses negative offsets (presumably to keep displacements short)
add coeffq, mmsize*8
mova [coeffq+mmsize*-4], m0
mova [coeffq+mmsize*-3], m0
mova [coeffq+mmsize*-2], m0
mova [coeffq+mmsize*-1], m0
dec cntd
jg .loop
RET
%endmacro
; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; Loop-free DC-only inverse transform for small blocks: the block is filled
; with (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth).
; Stores 4 * mmsize bytes, or 8 * mmsize bytes when mmsize == 16.
; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx     tmpd, word [coeffq]
    add       tmpd, (1 << (14 - %2)) + 1
    sar       tmpd, (15 - %2)
    ; movd has no ymm form: always target the xmm-sized alias, exactly as
    ; IDCT_DC does and as the SPLATW source operand below already assumes.
    ; For the MMX and XMM instantiations xm0 is the same register as m0.
    movd      xm0, tmpd
    SPLATW    m0, xm0
    mova      [coeffq+mmsize*0], m0
    mova      [coeffq+mmsize*1], m0
    mova      [coeffq+mmsize*2], m0
    mova      [coeffq+mmsize*3], m0
%if mmsize == 16
    mova      [coeffq+mmsize*4], m0
    mova      [coeffq+mmsize*5], m0
    mova      [coeffq+mmsize*6], m0
    mova      [coeffq+mmsize*7], m0
%endif
    RET
%endmacro
; IDCT 4x4, expects input in m0, m1
; %1 - shift
; %2 - 1/0 - SCALE and Transpose or not
; %3 - 1/0 add constant or not
; With scaling enabled the packed 16-bit result is left in m0, m1; without
; it the four 32-bit sums e0+o0, e1+o1, e1-o1, e0-o0 end up in m0..m3.
; clobbers: m2, m3, m4
%macro TR_4x4 3
; interleaves src0 with src2 to m0
; and src1 with src3 to m2
; src0: 00 01 02 03 m0: 00 20 01 21 02 22 03 23
; src1: 10 11 12 13 -->
; src2: 20 21 22 23 m1: 10 30 11 31 12 32 13 33
; src3: 30 31 32 33
SBUTTERFLY wd, 0, 1, 2
pmaddwd m2, m0, [pw_64] ; e0
pmaddwd m3, m1, [pw_83_36] ; o0
pmaddwd m0, [pw_64_m64] ; e1
pmaddwd m1, [pw_36_m83] ; o1
%if %3 == 1
; rounding constant 1 << (shift - 1), looked up by name (pd_64, pd_512, pd_2048)
%assign %%add 1 << (%1 - 1)
mova m4, [pd_ %+ %%add]
paddd m2, m4
paddd m0, m4
%endif
SUMSUB_BADC d, 3, 2, 1, 0, 4
%if %2 == 1
psrad m3, %1 ; e0 + o0
psrad m1, %1 ; e1 + o1
psrad m2, %1 ; e0 - o0
psrad m0, %1 ; e1 - o1
;clip16
packssdw m3, m1
packssdw m0, m2
; Transpose
SBUTTERFLY wd, 3, 0, 1
SBUTTERFLY wd, 3, 0, 1
SWAP 3, 1, 0
%else
SWAP 3, 2, 0
%endif
%endmacro
; Define the second-pass scaling parameters for a bit depth.
; %1 - bit_depth
; Sets: shift   = 20 - bit_depth
;       c_add   = 1 << (shift - 1)
;       arr_add = name of the matching rounding constant, pd_<c_add>
%macro DEFINE_BIAS 1
%assign shift (20 - %1)
%assign c_add (1 << (shift - 1))
%define arr_add pd_ %+ c_add
%endmacro
; Load the second-pass rounding constant for a bit depth into a register.
; %1 - bit_depth
; %2 - register the add constant is loaded to
; shift = 20 - bit_depth (defined as a side effect through DEFINE_BIAS)
%macro LOAD_BIAS 2
DEFINE_BIAS %1
mova %2, [arr_add]
%endmacro
; Load four rows of four 16-bit coefficients from the buffer at r0.
; %1, %2 - registers to load packed 16 bit values to
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
; Resulting layout: %1 = [row at %3 | row at %5], %2 = [row at %4 | row at %6]
; (low qword | high qword).
%macro LOAD_BLOCK 7
movq %1, [r0 + %3 + %7]
movhps %1, [r0 + %5 + %7]
movq %2, [r0 + %4 + %7]
movhps %2, [r0 + %6 + %7]
%endmacro
; void ff_hevc_idct_4x4__{8,10}_<opt>(int16_t *coeffs, int col_limit)
; Only the first argument (coeffs) is loaded; col_limit is not read here.
; Two TR_4x4 passes, each with rounding, scaling and transpose: the first
; with shift 7, the second with shift 20 - bitdepth. Works in place.
; %1 = bitdepth
%macro IDCT_4x4 1
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
mova m0, [coeffsq]
mova m1, [coeffsq + 16]
TR_4x4 7, 1, 1
TR_4x4 20 - %1, 1, 1
mova [coeffsq], m0
mova [coeffsq + 16], m1
RET
%endmacro
; Scale, pack (clip16) and store one pair of mirrored result rows,
; four columns at a time.
; %1 - byte offset in coeffsq for the e + o row (rows 0, 1, ... from the top)
; %2 - byte offset in coeffsq for the e - o row (rows 7, 6, ... from the bottom)
; %3 - register holding e - o (32-bit sums, clobbered)
; %4 - shift
; %5 - register holding e + o (32-bit sums, clobbered)
; %6, %7 - not used here; present so the argument list matches STORE_16
%macro STORE_8 7
psrad %5, %4
psrad %3, %4
; saturating pack: low qword = e + o, high qword = e - o
packssdw %5, %3
movq [coeffsq + %1], %5
movhps [coeffsq + %2], %5
%endmacro
; Compute one odd term o8 from the interleaved odd rows in m4 / m5 and
; combine it with the matching even term e8.
; %1 - horizontal offset
; %2 - shift
; %3, %4 - transform coeffs
; %5 - vertical offset for e8 + o8
; %6 - vertical offset for e8 - o8
; %7 - register with e8 inside
; %8 - block_size
; %9 - register to store e8 +o8
; %10 - register to store e8 - o8
; For block_size 8 the results are scaled and stored through STORE_8;
; otherwise they are kept in registers %9 / %10 for the caller.
; clobbers: m6, m7
%macro E8_O8 10
pmaddwd m6, m4, %3
pmaddwd m7, m5, %4
paddd m6, m7
paddd m7, m6, %7 ; o8 + e8
psubd %7, m6 ; e8 - o8
%if %8 == 8
STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
%else
SWAP m7, %9
SWAP %7, %10
%endif
%endmacro
; 8x4 residuals are processed and stored
; %1 - horizontal offset
; %2 - shift
; %3 - offset of the even row
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
; %5 - offset of the odd row
; %6 - block size
; %7 - 1/0 add a constant in TR_4x4 or not
; The constant is added for the 8x8 transform but not for 16x16 and 32x32.
; For block size 8 the results are written back to the coefficient buffer;
; for larger sizes they are left in m8..m15 (see E8_O8).
%macro TR_8x4 7
; load 4 columns of even rows
LOAD_BLOCK m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1
TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only
; load 4 columns of odd rows
LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1
; 00 01 02 03
; 10 11 12 13 m4: 10 30 11 31 12 32 13 33
; ... -- >
; m5: 50 70 51 71 52 72 53 73
; 70 71 72 73
SBUTTERFLY wd, 4, 5, 6
E8_O8 %1, %2, [pw_89_75], [pw_50_18], 0, %5 * 7, m0, %6, m8, m15
E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5, %5 * 6, m1, %6, m9, m14
E8_O8 %1, %2, [pw_50_m89], [pw_18_75], %5 * 2, %5 * 5, m2, %6, m10, m13
E8_O8 %1, %2, [pw_18_m50], [pw_75_m89], %5 * 3, %5 * 4, m3, %6, m11, m12
%endmacro
; Store two registers as four rows of four 16-bit values to the buffer at r0.
; %1, %2 - registers holding the packed 16 bit values
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
; Layout: low / high qword of %1 go to the rows at %3 / %4, low / high qword
; of %2 to the rows at %5 / %6. Note this pairing differs from LOAD_BLOCK,
; which fills its first register from the first and third offsets.
%macro STORE_PACKED 7
movq [r0 + %3 + %7], %1
movhps [r0 + %4 + %7], %1
movq [r0 + %5 + %7], %2
movhps [r0 + %6 + %7], %2
%endmacro
; transpose 4x4 block packed
; in %1 and %2 registers (register numbers, as SBUTTERFLY expects)
; %3 - temporary register
%macro TRANSPOSE_4x4 3
SBUTTERFLY wd, %1, %2, %3
SBUTTERFLY dq, %1, %2, %3
%endmacro
; Exchange two 4x4 blocks of the buffer at r0, transposing each on the way.
; %1 - horizontal offset of the block i
; %2 - vertical offset of the block i
; %3 - width in bytes
; %4 - vertical offset for the block j
; %5 - horizontal offset for the block j
; clobbers: m4, m5, m6, m7
%macro SWAP_BLOCKS 5
; M_j
LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
TRANSPOSE_4x4 4, 5, 6
; M_i
; read block i before its location is overwritten with the transposed block j
LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
; transpose and store M_i
SWAP m6, m4
SWAP m7, m5
TRANSPOSE_4x4 4, 5, 6
STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
%endmacro
; Transpose one 4x4 block of the buffer at r0 in place.
; %1 - horizontal offset
; %2 - vertical offset of the block
; %3 - width in bytes
; clobbers: m4, m5, m6
%macro TRANSPOSE_BLOCK 3
LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
TRANSPOSE_4x4 4, 5, 6
STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
%endmacro
; Emit the helper hevc_idct_transpose_8x8: in-place transpose of the 8x8
; block of 16-bit values at r0, done as four 4x4 blocks (row width 16 bytes).
; Declared with no arguments, so r0 must already hold the coefficient pointer.
%macro TRANSPOSE_8x8 0
cglobal hevc_idct_transpose_8x8, 0, 0, 0
; M1 M2 ^T = M1^t M3^t
; M3 M4 M2^t M4^t
; M1 4x4 block
TRANSPOSE_BLOCK 0, 0, 16
; M2 and M3
SWAP_BLOCKS 0, 64, 16, 0, 8
; M4
TRANSPOSE_BLOCK 8, 64, 16
ret
%endmacro
; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; Only the first argument (coeffs) is loaded; col_limit is not read here.
; Two passes over the block, four columns per TR_8x4 call, each followed by
; an in-place transpose: the first pass uses shift 7, the second uses
; shift = 20 - bitdepth (set by DEFINE_BIAS).
; %1 = bitdepth
%macro IDCT_8x8 1
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
TR_8x4 0, 7, 32, 1, 16, 8, 1
TR_8x4 8, 7, 32, 1, 16, 8, 1
call hevc_idct_transpose_8x8_ %+ cpuname
DEFINE_BIAS %1
TR_8x4 0, shift, 32, 1, 16, 8, 1
TR_8x4 8, shift, 32, 1, 16, 8, 1
; the final transpose is a tail call, so its return ends this function
TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
%endmacro
; store intermediate e32 coeffs on stack
; as 16x4 matrix
; from %5: e16 + o16, with %6 offset
; and %3: e16 - o16, with %7 offset
; %4 - shift, unused here
; %1, %2 - offsets into the coefficient buffer, unused here; the argument
; list matches STORE_8 so that E16_O16 can invoke either macro
%macro STORE_16 7
mova [rsp + %6], %5
mova [rsp + %7], %3
%endmacro
; Multiply-accumulate two interleaved row pairs with their coefficients.
; %1, %2 - transform constants
; %3, %4 - regs with interleaved coeffs
; %5 - 1/0 SWAP or add: 1 starts a new sum in the accumulator,
;      0 adds to the sum already there
; %6, %7 - registers for intermediate sums
; %8 - accumulator register
%macro ADD_ROWS 8
pmaddwd %6, %3, %1
pmaddwd %7, %4, %2
paddd %6, %7
%if %5 == 1
SWAP %6, %8
%else
paddd %8, %6
%endif
%endmacro
; Compute one odd term o16 from the interleaved odd rows in m0..m3 and
; combine it with the matching even term e16.
; %1 - transform coeffs (address of four consecutive 16-byte entries)
; %2, %3 offsets for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - add
; %6 - block_size; also selects the store macro (STORE_8 or STORE_16).
;      The rounding constant is added only when this is 8.
; %7 - register with e16
; %8, %9 - stack offsets for storing e+o/e-o
; clobbers: m4, m5, m6, m7
%macro E16_O16 9
ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m5, m6, m7
ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7
%if %6 == 8
paddd %7, %5
%endif
paddd m4, m7, %7 ; o16 + e16
psubd %7, m7 ; e16 - o16
STORE_%6 %2, %3, %7, %4, m4, %8, %9
%endmacro
; 16-point transform of four columns.
; Parameters, as far as they can be read from this body:
; %1  - horizontal offset (passed to TR_8x4 / LOAD_BLOCK, added to store offsets)
; %2  - shift, forwarded to E16_O16
; %3  - add term, forwarded to E16_O16 (first call directly, later calls via m8)
; %4, %5, %8 - forwarded unchanged to TR_8x4 (meaning defined there)
; %6  - row step in bytes used for the load and store offsets (also forwarded
;       to TR_8x4)
; %7  - block size, selects the STORE_ macro used by E16_O16
; %9  - extra multiplier on the row step when loading; the rows loaded are
;       the odd multiples 1, 3, ..., 15 of (%9 * %6)
; %10 - not referenced in this body
; The eight E16_O16 calls consume m8-m15 as the e16 inputs; these are
; presumably produced by the preceding TR_8x4 call - confirm there.
%macro TR_16x4 10
; produce 8x4 matrix of e16 coeffs
; for 4 first rows and store it on stack (128 bytes)
TR_8x4 %1, 7, %4, %5, %6, %8, 0
; load 8 even rows
LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1
SBUTTERFLY wd, 0, 1, 4
SBUTTERFLY wd, 2, 3, 4
E16_O16 trans_coeffs16, 0 + %1, 15 * %6 + %1, %2, %3, %7, m8, 0, 15 * 16
; m8 has been consumed by the call above; reuse it to hold the add term
mova m8, %3
E16_O16 trans_coeffs16 + 64, %6 + %1, 14 * %6 + %1, %2, m8, %7, m9, 16, 14 * 16
E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1, 9 * %6 + %1, %2, m8, %7, m14, 6 * 16, 9 * 16
E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1, 8 * %6 + %1, %2, m8, %7, m15, 7 * 16, 8 * 16
%endmacro
; Emits hevc_idct_transpose_16x16_<opt>: an in-place transpose of a 16x16
; matrix with a 32-byte row width, processed as a 4x4 grid of 4x4 blocks.
; Diagonal blocks are transposed in place with TRANSPOSE_BLOCK; each pair of
; off-diagonal blocks is handled by one SWAP_BLOCKS call (defined elsewhere).
; Declared with no arguments and ends in a plain ret; it is reached via
; call / TAIL_CALL from IDCT_16x16.
%macro TRANSPOSE_16x16 0
cglobal hevc_idct_transpose_16x16, 0, 0, 0
; M1 M2 M3 M4 ^T m1 m5 m9 m13 M_i^T = m_i
; M5 M6 M7 M8 --> m2 m6 m10 m14
; M9 M10 M11 M12 m3 m7 m11 m15
; M13 M14 M15 M16 m4 m8 m12 m16
; M1 4x4 block
TRANSPOSE_BLOCK 0, 0, 32
; M5, M2
SWAP_BLOCKS 0, 128, 32, 0, 8
; M9, M3
SWAP_BLOCKS 0, 256, 32, 0, 16
; M13, M4
SWAP_BLOCKS 0, 384, 32, 0, 24
;M6
TRANSPOSE_BLOCK 8, 128, 32
; M10, M7
SWAP_BLOCKS 8, 256, 32, 128, 16
; M14, M8
SWAP_BLOCKS 8, 384, 32, 128, 24
;M11
TRANSPOSE_BLOCK 16, 256, 32
; M15, M12
SWAP_BLOCKS 16, 384, 32, 256, 24
;M16
TRANSPOSE_BLOCK 24, 384, 32
ret
%endmacro
; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
; Two passes of TR_16x4, each iterating over four groups of four columns
; (r1 counts 3 down to 0, horizontal offset 8 * r1 bytes) and each followed
; by the 16x16 transpose. Only coeffs is loaded; col_limit is not read here.
%macro IDCT_16x16 1
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
mov r1d, 3
.loop16:
; first pass: shift 7 with pd_64 as the add term
TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
dec r1d
jge .loop16
call hevc_idct_transpose_16x16_ %+ cpuname
; second pass: shift / arr_add are presumably set up by DEFINE_BIAS for this
; bitdepth (defined elsewhere) - confirm
DEFINE_BIAS %1
mov r1d, 3
.loop16_2:
TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
dec r1d
jge .loop16_2
TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
%endmacro
; scale, pack (clip16) and store the residuals 0 e32[0] + o32[0] --> %1
; 4 at one time (4 columns) 1 e32[1] + o32[1]
; %1 - address to store e32 + o32
; %2 - address to store e32 - o32
; %5 - reg with e32 + o32 ...
; %3 - reg with e32 - o32 30 e32[1] - o32[1]
; %4 - shift 31 e32[0] - o32[0] --> %2
; Both inputs are shifted right arithmetically and packed with signed
; saturation into %5: the low 8 bytes (sums) go to %1, the high 8 bytes
; (differences) go to %2. %3 and %5 are modified.
%macro STORE_32 5
psrad %5, %4
psrad %3, %4
packssdw %5, %3
movq [%1], %5
movhps [%2], %5
%endmacro
; Compute one odd term o32 from the interleaved rows in m0-m7, combine it
; with the even term e32 read back from the stack, then scale and store
; e32 + o32 and e32 - o32 through STORE_32.
; %1 - transform coeffs
; %2, %3 - addresses for storing e+o/e-o back to the coefficient buffer
; %4 - shift
; %5 - stack offset of e32
; Expects m14 to hold the add term (it is added to e32 before combining).
; Clobbers m8-m12.
%macro E32_O32 5
ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m8, m9, m10
ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10
paddd m11, m14, [rsp + %5]
paddd m12, m10, m11 ; o32 + e32
psubd m11, m10 ; e32 - o32
STORE_32 %2, %3, m11, %4, m12
%endmacro
; 32-point transform of four columns.
; %1 - horizontal offset
; %2 - bitdepth
; %3 - 1 selects a fixed shift of 7 with pd_64 as the add term; any other
;      value takes the add term from LOAD_BIAS for the given bitdepth
;      (which presumably also defines "shift" - confirm)
; The even half comes from TR_16x4, which leaves its results on the stack
; (block size 16 selects STORE_16). The odd half is accumulated from the 16
; odd rows (row step 64 bytes) in a loop of 16 iterations.
; Uses r2-r5 in addition to coeffsq.
%macro TR_32x4 3
TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0
LOAD_BLOCK m0, m1, 64, 3 * 64, 5 * 64, 7 * 64, %1
LOAD_BLOCK m2, m3, 9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1
SBUTTERFLY wd, 0, 1, 8
SBUTTERFLY wd, 2, 3, 8
SBUTTERFLY wd, 4, 5, 8
SBUTTERFLY wd, 6, 7, 8
%if %3 == 1
%assign shift 7
mova m14, [pd_64]
%else
LOAD_BIAS %2, m14
%endif
; r2 = current set of transform constants, walking down from the last set
; r3 = base of this column group, r4 = address for e32 - o32 (walks upward
; from row 16), r5 = stack offset of e32 (row index * 16, 15 rows down to 0)
lea r2, [trans_coeff32 + 15 * 128]
lea r3, [coeffsq + %1]
lea r4, [r3 + 16 * 64]
mov r5d, 15 * 16
%%loop:
; e32 + o32 goes to row r5 / 16 (r5 * 4 is that row times the 64-byte step)
E32_O32 r2, r3 + r5 * 4, r4, shift, r5
sub r2, 128
add r4, 64
sub r5d, 16
jge %%loop
%endmacro
; Emits hevc_idct_transpose_32x32_<opt>: an in-place transpose of a 32x32
; matrix with a 64-byte row width, processed as an 8x8 grid of 4x4 blocks
; (block row step 256 bytes, block column step 8 bytes).
; Each diagonal block is transposed with TRANSPOSE_BLOCK; the off-diagonal
; pairs of the same block row/column are handled with SWAP_BLOCKS, in loops
; for the first five diagonals and unrolled for the rest.
; Uses r1-r3 as loop counters / offsets although cglobal declares none;
; it is reached via call / TAIL_CALL from IDCT_32x32, which declares them.
%macro TRANSPOSE_32x32 0
cglobal hevc_idct_transpose_32x32, 0, 0, 0
; M0 M1 ... M7
; M8 M15
;
; ...
;
; M56 M63
TRANSPOSE_BLOCK 0, 0, 64 ; M0
; pairs for diagonal 0: r1 = block index 7 down to 1, r2 = r1 * 256
mov r1d, 7
mov r2d, 7 * 256
.loop_transpose:
SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
sub r2d, 256
dec r1d
jg .loop_transpose
TRANSPOSE_BLOCK 8, 256, 64 ; M9
; pairs for diagonal 1: r1 = remaining count, r2 / r3 walk upward
mov r1d, 6
mov r2d, 512
mov r3d, 16
.loop_transpose2:
SWAP_BLOCKS 8, r2, 64, 256, r3
add r3d, 8
add r2d, 256
dec r1d
jg .loop_transpose2
TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
mov r1d, 5
mov r2d, 768
mov r3d, 24
.loop_transpose3:
SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
add r3d, 8
add r2d, 256
dec r1d
jg .loop_transpose3
TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
mov r1d, 4
mov r2d, 1024
mov r3d, 32
.loop_transpose4:
SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
add r3d, 8
add r2d, 256
dec r1d
jg .loop_transpose4
TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
mov r1d, 3
mov r2d, 1280
mov r3d, 40
.loop_transpose5:
SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
add r3d, 8
add r2d, 256
dec r1d
jg .loop_transpose5
; the last three diagonals have at most two pairs each and are unrolled
TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8
TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8
TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
ret
%endmacro
; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
; Two passes of TR_32x4, each iterating over eight groups of four columns
; (r1 counts 7 down to 0, horizontal offset 8 * r1 bytes) and each followed
; by the 32x32 transpose. Reserves 256 bytes of stack, used for the
; intermediate e32 values. Only coeffs is loaded; col_limit is not read here.
%macro IDCT_32x32 1
cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
mov r1d, 7
.loop32:
; first pass (last argument 1: fixed shift 7)
TR_32x4 8 * r1, %1, 1
dec r1d
jge .loop32
call hevc_idct_transpose_32x32_ %+ cpuname
mov r1d, 7
.loop32_2:
; second pass (last argument 0: bitdepth-dependent bias)
TR_32x4 8 * r1, %1, 0
dec r1d
jge .loop32_2
TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
%endmacro
; Instantiate the DC-only IDCT functions for one bitdepth.
; %1 - bitdepth
; Emits mmxext, sse2 and, when HAVE_AVX2_EXTERNAL is set, avx2 variants.
; The meaning of the numeric arguments is defined by IDCT_DC / IDCT_DC_NL,
; which live outside this section.
%macro INIT_IDCT_DC 1
INIT_MMX mmxext
IDCT_DC_NL 4, %1
IDCT_DC 8, 2, %1
INIT_XMM sse2
IDCT_DC_NL 8, %1
IDCT_DC 16, 4, %1
IDCT_DC 32, 16, %1
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC 16, 2, %1
IDCT_DC 32, 8, %1
%endif ;HAVE_AVX2_EXTERNAL
%endmacro
; Instantiate the full IDCT functions for one bitdepth and instruction set.
; %1 - bitdepth
; %2 - cpu flag name passed to INIT_XMM
; The transpose helpers carry no bitdepth in their name, so they are emitted
; only once per instruction set (with the bitdepth 8 instantiation).
; The 16x16 and 32x32 variants are built only on x86-64 (they declare 16
; vector registers).
%macro INIT_IDCT 2
INIT_XMM %2
%if %1 == 8
TRANSPOSE_8x8
%if ARCH_X86_64
TRANSPOSE_16x16
TRANSPOSE_32x32
%endif
%endif
%if ARCH_X86_64
IDCT_32x32 %1
IDCT_16x16 %1
%endif
IDCT_8x8 %1
IDCT_4x4 %1
%endmacro
; DC-only variants for 8, 10 and 12 bit
INIT_IDCT_DC 8
INIT_IDCT_DC 10
INIT_IDCT_DC 12
; full IDCT for 8 and 10 bit, sse2 and avx
INIT_IDCT 8, sse2
INIT_IDCT 8, avx
INIT_IDCT 10, sse2
INIT_IDCT 10, avx
; 12 bit full IDCT is disabled
;INIT_IDCT 12, sse2
;INIT_IDCT 12, avx

1672
externals/ffmpeg/libavcodec/x86/hevc_mc.asm vendored Executable file

File diff suppressed because it is too large Load Diff

340
externals/ffmpeg/libavcodec/x86/hevc_sao.asm vendored Executable file
View File

@@ -0,0 +1,340 @@
;******************************************************************************
;* SIMD optimized SAO functions for HEVC 8bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
; Shuffle control applied with pshufb to the packed SAO offsets in the edge
; filter: reorders the first five offsets to 1, 2, 0, 3, 4 (index -1 yields
; zero). Repeated twice to fill 32 bytes.
pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
; Neighbour positions, four bytes per eo value: (a_x, a_y, b_x, b_y).
; HEVC_SAO_EDGE_FILTER_INIT scales the y entries by the source row step and
; adds the x entries to form the two neighbour offsets.
pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pb_1
cextern pb_2
SECTION .text
;******************************************************************************
;SAO Band Filter
;******************************************************************************
; Set up the constants of the 8-bit band filter.
; m0-m3: the four consecutive band indices left, left+1, left+2, left+3,
;        each masked to 0..31, splatted across the register with SPLATW
; m4-m7: the four 16-bit offsets at offsetq+2 .. offsetq+8, splatted likewise
; x86-64: m14 is zeroed.
; x86-32: m0-m6 are spilled to the stack (m7 stays in a register), m0 is
;         zeroed and m14/m13/m9/m8 are aliased to m0-m3.
; Finally the argument names are redefined so that the sixth register is
; named height, and height is loaded from the eighth argument slot.
%macro HEVC_SAO_BAND_FILTER_INIT 0
and leftq, 31
movd xm0, leftd
add leftq, 1
and leftq, 31
movd xm1, leftd
add leftq, 1
and leftq, 31
movd xm2, leftd
add leftq, 1
and leftq, 31
movd xm3, leftd
SPLATW m0, xm0
SPLATW m1, xm1
SPLATW m2, xm2
SPLATW m3, xm3
%if mmsize > 16
SPLATW m4, [offsetq + 2]
SPLATW m5, [offsetq + 4]
SPLATW m6, [offsetq + 6]
SPLATW m7, [offsetq + 8]
%else
; load the four offsets at once and splat each word separately
movq m7, [offsetq + 2]
SPLATW m4, m7, 0
SPLATW m5, m7, 1
SPLATW m6, m7, 2
SPLATW m7, m7, 3
%endif
%if ARCH_X86_64
pxor m14, m14
%else ; ARCH_X86_32
mova [rsp+mmsize*0], m0
mova [rsp+mmsize*1], m1
mova [rsp+mmsize*2], m2
mova [rsp+mmsize*3], m3
mova [rsp+mmsize*4], m4
mova [rsp+mmsize*5], m5
mova [rsp+mmsize*6], m6
pxor m0, m0
; remember the register size used for the spill slots; the width-48 variant
; switches to INIT_XMM inside the loop, which changes mmsize
%assign MMSIZE mmsize
%define m14 m0
%define m13 m1
%define m9 m2
%define m8 m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
mov heightd, r7m
%endmacro
; Apply the band offsets to one register of 16-bit samples.
; %1 - scratch register (receives sample >> 3, i.e. the band index)
; %2 - samples as words; updated in place with the selected offset added
; The band index is compared against the four active bands; each match mask
; selects the corresponding offset and the (at most one) selected offset is
; added to the sample. Samples outside the four bands are left unchanged.
; x86-64 clobbers m10-m12; x86-32 clobbers m4-m6 and reads the band indices
; and first three offsets from the stack slots written by the INIT macro.
%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
psraw %1, %2, 3
%if ARCH_X86_64
pcmpeqw m10, %1, m0
pcmpeqw m11, %1, m1
pcmpeqw m12, %1, m2
pcmpeqw %1, m3
pand m10, m4
pand m11, m5
pand m12, m6
pand %1, m7
por m10, m11
por m12, %1
por m10, m12
paddw %2, m10
%else ; ARCH_X86_32
pcmpeqw m4, %1, [rsp+MMSIZE*0]
pcmpeqw m5, %1, [rsp+MMSIZE*1]
pcmpeqw m6, %1, [rsp+MMSIZE*2]
pcmpeqw %1, [rsp+MMSIZE*3]
pand m4, [rsp+MMSIZE*4]
pand m5, [rsp+MMSIZE*5]
pand m6, [rsp+MMSIZE*6]
pand %1, m7
por m4, m5
por m6, %1
por m4, m6
paddw %2, m4
%endif ; ARCH
%endmacro
;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
; int16_t *sao_offset_val, int sao_left_class, int width, int height);
; Emits one 8-bit band filter function for a fixed block width.
; %1 - block width in pixels (also part of the function name)
; %2 - number of full vectors (mmsize bytes each) processed per row
; Width 8 is handled with a single 8-byte load/store; width 48 processes an
; extra 16-byte vector after the full vectors. On x86-32 the function
; reserves seven vector-sized stack slots for the spilled constants.
; The width argument of the C prototype is not read: the width is fixed per
; instantiation.
%macro HEVC_SAO_BAND_FILTER 2
cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
HEVC_SAO_BAND_FILTER_INIT
align 16
.loop:
%if %1 == 8
; 8 pixels: widen to words, filter, narrow with unsigned saturation
movq m8, [srcq]
punpcklbw m8, m14
HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
packuswb m8, m14
movq [dstq], m8
%endif ; %1 == 8
%assign i 0
%rep %2
; one full vector: filter low and high halves as words, then repack
mova m13, [srcq + i]
punpcklbw m8, m13, m14
HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
punpckhbw m13, m14
HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
packuswb m8, m13
mova [dstq + i], m8
%assign i i+mmsize
%endrep
%if %1 == 48
; trailing 16 pixels are always processed with xmm registers
INIT_XMM cpuname
mova m13, [srcq + i]
punpcklbw m8, m13, m14
HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
punpckhbw m13, m14
HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
packuswb m8, m13
mova [dstq + i], m8
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif ; %1 == 48
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; one row done
; signed test, matching the other SAO row loops: a non-positive height ends
; after the first row instead of wrapping around
jg .loop ; height loop
REP_RET
%endmacro
; Emit the 8-bit band filter for all supported widths using 16-byte vectors.
; Second argument = number of full 16-byte vectors per row; width 8 uses the
; 8-byte path and width 48 adds one trailing 16-byte vector.
%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER 8, 0
HEVC_SAO_BAND_FILTER 16, 1
HEVC_SAO_BAND_FILTER 32, 2
HEVC_SAO_BAND_FILTER 48, 2
HEVC_SAO_BAND_FILTER 64, 4
%endmacro
INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS
%if HAVE_AVX2_EXTERNAL
; avx2: widths 8 and 16 stay on xmm, wider blocks use 32-byte vectors
INIT_XMM avx2
HEVC_SAO_BAND_FILTER 8, 0
HEVC_SAO_BAND_FILTER 16, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 32, 1
HEVC_SAO_BAND_FILTER 48, 1
HEVC_SAO_BAND_FILTER 64, 2
%endif
;******************************************************************************
;SAO Edge Filter
;******************************************************************************
; Constants describing the source buffer layout used by the edge filter.
%define MAX_PB_SIZE 64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
; Step added to srcq for each row. Parenthesised so the expression keeps its
; value when it is expanded next to other operators.
%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
; Compute the byte offsets of the two neighbours used by the edge filter.
; Loads eo (from the stack on WIN64 and x86-32, sign-extended from the
; register otherwise), then reads the four pb_eo bytes for that eo value:
;   a_stride = pb_eo[eo*4 + 1] * EDGE_SRCSTRIDE + pb_eo[eo*4 + 0]
;   b_stride = pb_eo[eo*4 + 3] * EDGE_SRCSTRIDE + pb_eo[eo*4 + 2]
; Uses tmpq and tmp2q as scratch; the caller defines what they alias.
%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
movsxd eoq, dword eom
%elif ARCH_X86_64
movsxd eoq, eod
%else
mov eoq, r4m
%endif
lea tmp2q, [pb_eo]
movsx a_strideq, byte [tmp2q+eoq*4+1]
movsx b_strideq, byte [tmp2q+eoq*4+3]
imul a_strideq, EDGE_SRCSTRIDE
imul b_strideq, EDGE_SRCSTRIDE
movsx tmpq, byte [tmp2q+eoq*4]
add a_strideq, tmpq
movsx tmpq, byte [tmp2q+eoq*4+2]
add b_strideq, tmpq
%endmacro
; Core of the 8-bit edge filter for one register of pixels.
; %1 - block width; widths above 8 process both halves of the register
; In:  m1 = current pixels, m2 = neighbour a, m3 = neighbour b,
;      m0 = shuffled offset table, m6 = pb_2, m7 = pb_1
; Out: m3 = filtered pixels packed to bytes with unsigned saturation
; Clobbers m2, m4, m5.
%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
; sign(cur - a) and sign(cur - b) built from unsigned min and equality masks
pminub m4, m1, m2
pminub m5, m1, m3
pcmpeqb m2, m4
pcmpeqb m3, m5
pcmpeqb m4, m1
pcmpeqb m5, m1
psubb m4, m2
psubb m5, m3
; table index = 2 + sign(cur - a) + sign(cur - b), in the range 0..4
paddb m4, m6
paddb m4, m5
pshufb m2, m0, m4
; interleave (1, pixel) with (offset, 1) so that pmaddubsw yields
; pixel + offset as signed words
%if %1 > 8
punpckhbw m5, m7, m1
punpckhbw m4, m2, m7
punpcklbw m3, m7, m1
punpcklbw m2, m7
pmaddubsw m5, m4
pmaddubsw m3, m2
packuswb m3, m5
%else
punpcklbw m3, m7, m1
punpcklbw m2, m7
pmaddubsw m3, m2
packuswb m3, m3
%endif
%endmacro
;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
; int eo, int width, int height);
; Emits one 8-bit edge filter function for a fixed block width.
; %1 - block width in pixels (also part of the function name)
; %2 - number of full vectors (mmsize bytes each) processed per row
; %3 - suffix for the store instruction of the full vectors (a or u);
;      may be omitted when the second argument is 0
; The source advances by EDGE_SRCSTRIDE per row, the destination by
; dststride. The width argument of the C prototype is not read.
%macro HEVC_SAO_EDGE_FILTER 2-3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
; height is not loaded yet, so its register doubles as scratch for INIT
%define tmp2q heightq
HEVC_SAO_EDGE_FILTER_INIT
mov heightd, r6m
%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
; only six registers are available: src, height and dststride serve as
; eo / tmp / tmp2 during INIT and are (re)loaded afterwards
%define eoq srcq
%define tmpq heightq
%define tmp2q dststrideq
%define offsetq heightq
HEVC_SAO_EDGE_FILTER_INIT
mov srcq, srcm
mov offsetq, r3m
mov dststrideq, dststridem
%endif ; ARCH
%if mmsize > 16
vbroadcasti128 m0, [offsetq]
%else
movu m0, [offsetq]
%endif
; pack the 16-bit offsets to bytes and reorder them into lookup order
mova m1, [pb_edge_shuffle]
packsswb m0, m0
mova m7, [pb_1]
pshufb m0, m1
mova m6, [pb_2]
%if ARCH_X86_32
; the offsets have been consumed, the shared register can now hold height
mov heightd, r6m
%endif
align 16
.loop:
%if %1 == 8
movq m1, [srcq]
movq m2, [srcq + a_strideq]
movq m3, [srcq + b_strideq]
HEVC_SAO_EDGE_FILTER_COMPUTE %1
movq [dstq], m3
%endif
%assign i 0
%rep %2
mova m1, [srcq + i]
movu m2, [srcq + a_strideq + i]
movu m3, [srcq + b_strideq + i]
HEVC_SAO_EDGE_FILTER_COMPUTE %1
mov%3 [dstq + i], m3
%assign i i+mmsize
%endrep
%if %1 == 48
; trailing 16 pixels are always processed with xmm registers
INIT_XMM cpuname
mova m1, [srcq + i]
movu m2, [srcq + a_strideq + i]
movu m3, [srcq + b_strideq + i]
HEVC_SAO_EDGE_FILTER_COMPUTE %1
mova [dstq + i], m3
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif
add dstq, dststrideq
add srcq, EDGE_SRCSTRIDE
dec heightd
jg .loop
RET
%endmacro
; ssse3: all widths with 16-byte vectors and aligned stores
INIT_XMM ssse3
HEVC_SAO_EDGE_FILTER 8, 0
HEVC_SAO_EDGE_FILTER 16, 1, a
HEVC_SAO_EDGE_FILTER 32, 2, a
HEVC_SAO_EDGE_FILTER 48, 2, a
HEVC_SAO_EDGE_FILTER 64, 4, a
%if HAVE_AVX2_EXTERNAL
; avx2: 32-byte vectors; width 48 stores its full vector unaligned
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 32, 1, a
HEVC_SAO_EDGE_FILTER 48, 1, u
HEVC_SAO_EDGE_FILTER 64, 2, a
%endif

View File

@@ -0,0 +1,370 @@
;******************************************************************************
;* SIMD optimized SAO functions for HEVC 10/12bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
; word constant -2, compared against the edge class in the edge filter
pw_m2: times 16 dw -2
; upper clip bounds for 10-bit and 12-bit samples, selected by bitdepth
pw_mask10: times 16 dw 0x03FF
pw_mask12: times 16 dw 0x0FFF
; Neighbour positions, four bytes per eo value: (a_x, a_y, b_x, b_y),
; consumed by HEVC_SAO_EDGE_FILTER_INIT.
pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pw_m1
cextern pw_1
cextern pw_2
SECTION .text
;******************************************************************************
;SAO Band Filter
;******************************************************************************
; Set up the constants of the 10/12-bit band filter.
; %1 - bitdepth (selects the pw_mask constant used as upper clip bound)
; m0-m3: the four consecutive band indices left .. left+3, each masked to
;        0..31, splatted across the register with SPLATW
; m4-m7: the four 16-bit offsets at offsetq+2 .. offsetq+8, splatted likewise
; x86-64: m13 = clip mask, m14 = 0.
; x86-32: m0-m6 are spilled to the stack (m7 stays in a register), then
;         m1 = clip mask, m0 = 0 and m14/m13/m9/m8 are aliased to m0-m3.
; Finally the argument names are redefined so that the sixth register is
; named height, and height is loaded from the eighth argument slot.
%macro HEVC_SAO_BAND_FILTER_INIT 1
and leftq, 31
movd xm0, leftd
add leftq, 1
and leftq, 31
movd xm1, leftd
add leftq, 1
and leftq, 31
movd xm2, leftd
add leftq, 1
and leftq, 31
movd xm3, leftd
SPLATW m0, xm0
SPLATW m1, xm1
SPLATW m2, xm2
SPLATW m3, xm3
%if mmsize > 16
SPLATW m4, [offsetq + 2]
SPLATW m5, [offsetq + 4]
SPLATW m6, [offsetq + 6]
SPLATW m7, [offsetq + 8]
%else
; load the four offsets at once and splat each word separately
movq m7, [offsetq + 2]
SPLATW m4, m7, 0
SPLATW m5, m7, 1
SPLATW m6, m7, 2
SPLATW m7, m7, 3
%endif
%if ARCH_X86_64
mova m13, [pw_mask %+ %1]
pxor m14, m14
%else ; ARCH_X86_32
mova [rsp+mmsize*0], m0
mova [rsp+mmsize*1], m1
mova [rsp+mmsize*2], m2
mova [rsp+mmsize*3], m3
mova [rsp+mmsize*4], m4
mova [rsp+mmsize*5], m5
mova [rsp+mmsize*6], m6
mova m1, [pw_mask %+ %1]
pxor m0, m0
%define m14 m0
%define m13 m1
%define m9 m2
%define m8 m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
mov heightd, r7m
%endmacro
;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
; int16_t *sao_offset_val, int sao_left_class, int width, int height);
; Emits one 10/12-bit band filter function for a fixed block width.
; %1 - bitdepth
; %2 - block width in pixels (part of the function name)
; %3 - number of vectors (mmsize bytes each) processed per row
; For every vector the band index is sample >> (bitdepth - 5); the offset of
; the matching band (if any) is added and the result is passed through CLIPW
; with m14 (zero) and m13 (pw_mask for the bitdepth) as bounds.
; The data / scratch registers alternate between m8 and m9 from one vector
; to the next. The width argument of the C prototype is not read.
%macro HEVC_SAO_BAND_FILTER 3
cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
HEVC_SAO_BAND_FILTER_INIT %1
align 16
.loop:
%assign i 0
%assign j 0
%rep %3
; k = data register, l = scratch register; they swap roles every iteration
%assign k 8+(j&1)
%assign l 9-(j&1)
mova m %+ k, [srcq + i]
psraw m %+ l, m %+ k, %1-5
%if ARCH_X86_64
pcmpeqw m10, m %+ l, m0
pcmpeqw m11, m %+ l, m1
pcmpeqw m12, m %+ l, m2
pcmpeqw m %+ l, m3
pand m10, m4
pand m11, m5
pand m12, m6
pand m %+ l, m7
por m10, m11
por m12, m %+ l
por m10, m12
paddw m %+ k, m10
%else ; ARCH_X86_32
pcmpeqw m4, m %+ l, [rsp+mmsize*0]
pcmpeqw m5, m %+ l, [rsp+mmsize*1]
pcmpeqw m6, m %+ l, [rsp+mmsize*2]
pcmpeqw m %+ l, [rsp+mmsize*3]
pand m4, [rsp+mmsize*4]
pand m5, [rsp+mmsize*5]
pand m6, [rsp+mmsize*6]
pand m %+ l, m7
por m4, m5
por m6, m %+ l
por m4, m6
paddw m %+ k, m4
%endif ; ARCH
CLIPW m %+ k, m14, m13
mova [dstq + i], m %+ k
%assign i i+mmsize
%assign j j+1
%endrep
add dstq, dststrideq
add srcq, srcstrideq
dec heightd
jg .loop
REP_RET
%endmacro
; Emit the 10-bit and 12-bit band filters for all supported widths using
; 16-byte vectors. Arguments: bitdepth, width, vectors per row (samples are
; two bytes each, so a row is width * 2 bytes).
%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER 10, 8, 1
HEVC_SAO_BAND_FILTER 10, 16, 2
HEVC_SAO_BAND_FILTER 10, 32, 4
HEVC_SAO_BAND_FILTER 10, 48, 6
HEVC_SAO_BAND_FILTER 10, 64, 8
HEVC_SAO_BAND_FILTER 12, 8, 1
HEVC_SAO_BAND_FILTER 12, 16, 2
HEVC_SAO_BAND_FILTER 12, 32, 4
HEVC_SAO_BAND_FILTER 12, 48, 6
HEVC_SAO_BAND_FILTER 12, 64, 8
%endmacro
INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS
%if HAVE_AVX2_EXTERNAL
; avx2: width 8 fits one xmm register, wider blocks use 32-byte vectors
INIT_XMM avx2
HEVC_SAO_BAND_FILTER 10, 8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 10, 16, 1
HEVC_SAO_BAND_FILTER 10, 32, 2
HEVC_SAO_BAND_FILTER 10, 48, 3
HEVC_SAO_BAND_FILTER 10, 64, 4
INIT_XMM avx2
HEVC_SAO_BAND_FILTER 12, 8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 12, 16, 1
HEVC_SAO_BAND_FILTER 12, 32, 2
HEVC_SAO_BAND_FILTER 12, 48, 3
HEVC_SAO_BAND_FILTER 12, 64, 4
%endif
;******************************************************************************
;SAO Edge Filter
;******************************************************************************
; Constants describing the source buffer layout used by the edge filter.
%define MAX_PB_SIZE 64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
; Step added to srcq for each row. Parenthesised so that uses such as
; "EDGE_SRCSTRIDE >> 1" do not depend on operator precedence.
%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
; Unsigned 16-bit minimum with a fallback for CPUs without pminuw.
; %1 - destination
; %2, %3 - sources
; %4 - scratch register, only written by the fallback path
; Fallback: min(a, b) = a - saturating_sub(a, b).
%macro PMINUW 4
%if cpuflag(sse4)
pminuw %1, %2, %3
%else
psubusw %4, %2, %3
psubw %1, %2, %4
%endif
%endmacro
; Compute the offsets of the two neighbours used by the edge filter, in
; units of samples (the caller doubles them to get byte offsets).
; Loads eo (from the stack on WIN64 and x86-32, sign-extended from the
; register otherwise), then reads the four pb_eo bytes for that eo value:
;   a_stride = pb_eo[eo*4 + 1] * (EDGE_SRCSTRIDE >> 1) + pb_eo[eo*4 + 0]
;   b_stride = pb_eo[eo*4 + 3] * (EDGE_SRCSTRIDE >> 1) + pb_eo[eo*4 + 2]
; Uses tmpq and tmp2q as scratch; the caller defines what they alias.
%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
movsxd eoq, dword eom
%elif ARCH_X86_64
movsxd eoq, eod
%else
mov eoq, r4m
%endif
lea tmp2q, [pb_eo]
movsx a_strideq, byte [tmp2q+eoq*4+1]
movsx b_strideq, byte [tmp2q+eoq*4+3]
imul a_strideq, EDGE_SRCSTRIDE >> 1
imul b_strideq, EDGE_SRCSTRIDE >> 1
movsx tmpq, byte [tmp2q+eoq*4]
add a_strideq, tmpq
movsx tmpq, byte [tmp2q+eoq*4+2]
add b_strideq, tmpq
%endmacro
;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
; int eo, int width, int height);
; Emits one 10/12-bit edge filter function for a fixed block width.
; %1 - bitdepth (selects the pw_mask constant used as upper clip bound)
; %2 - block width in pixels (part of the function name)
; %3 - number of vectors (mmsize bytes each) processed per row
; Per sample: class = sign(cur - a) + sign(cur - b), in the range -2..2; the
; offset selected for that class is added and the result is passed through
; CLIPW with zero and pw_mask as bounds.
; Offset registers: m8 = offset[1] (class -2), m9 = offset[2] (class -1),
; m10 = offset[0] (class 0), m11 = offset[3] (class 1), m12 = offset[4]
; (class 2). On x86-32 they alias m1-m5 and are spilled to five stack slots.
; The width argument of the C prototype is not read.
%macro HEVC_SAO_EDGE_FILTER 3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
; height is not loaded yet, so its register doubles as scratch for INIT
%define tmp2q heightq
HEVC_SAO_EDGE_FILTER_INIT
mov heightd, r6m
; convert the neighbour offsets from samples to bytes
add a_strideq, a_strideq
add b_strideq, b_strideq
%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
; only six registers are available: src, height and dststride serve as
; eo / tmp / tmp2 during INIT and are (re)loaded afterwards
%define eoq srcq
%define tmpq heightq
%define tmp2q dststrideq
%define offsetq heightq
%define m8 m1
%define m9 m2
%define m10 m3
%define m11 m4
%define m12 m5
HEVC_SAO_EDGE_FILTER_INIT
mov srcq, srcm
mov offsetq, r3m
mov dststrideq, dststridem
; convert the neighbour offsets from samples to bytes
add a_strideq, a_strideq
add b_strideq, b_strideq
%endif ; ARCH
%if mmsize > 16
SPLATW m8, [offsetq+2]
SPLATW m9, [offsetq+4]
SPLATW m10, [offsetq+0]
SPLATW m11, [offsetq+6]
SPLATW m12, [offsetq+8]
%else
; m10 gets offset[0..3], m12 gets offset[3..4]; splat the individual words
movq m10, [offsetq+0]
movd m12, [offsetq+6]
SPLATW m8, xm10, 1
SPLATW m9, xm10, 2
SPLATW m10, xm10, 0
SPLATW m11, xm12, 0
SPLATW m12, xm12, 1
%endif
pxor m0, m0
%if ARCH_X86_64
mova m13, [pw_m1]
mova m14, [pw_1]
mova m15, [pw_2]
%else
; the offsets have been consumed, the shared register can now hold height
mov heightd, r6m
mova [rsp+mmsize*0], m8
mova [rsp+mmsize*1], m9
mova [rsp+mmsize*2], m10
mova [rsp+mmsize*3], m11
mova [rsp+mmsize*4], m12
%endif
align 16
.loop:
%assign i 0
%rep %3
mova m1, [srcq + i]
movu m2, [srcq+a_strideq + i]
movu m3, [srcq+b_strideq + i]
; sign(cur - a) and sign(cur - b) built from unsigned min and equality masks
PMINUW m4, m1, m2, m6
PMINUW m5, m1, m3, m7
pcmpeqw m2, m4
pcmpeqw m3, m5
pcmpeqw m4, m1
pcmpeqw m5, m1
psubw m4, m2
psubw m5, m3
; m4 = edge class in the range -2..2
paddw m4, m5
; one mask per class, each selecting the matching offset
pcmpeqw m2, m4, [pw_m2]
%if ARCH_X86_64
pcmpeqw m3, m4, m13
pcmpeqw m5, m4, m0
pcmpeqw m6, m4, m14
pcmpeqw m7, m4, m15
pand m2, m8
pand m3, m9
pand m5, m10
pand m6, m11
pand m7, m12
%else
pcmpeqw m3, m4, [pw_m1]
pcmpeqw m5, m4, m0
pcmpeqw m6, m4, [pw_1]
pcmpeqw m7, m4, [pw_2]
pand m2, [rsp+mmsize*0]
pand m3, [rsp+mmsize*1]
pand m5, [rsp+mmsize*2]
pand m6, [rsp+mmsize*3]
pand m7, [rsp+mmsize*4]
%endif
; exactly one mask is set per sample, so the sum is the selected offset
paddw m2, m3
paddw m5, m6
paddw m2, m7
paddw m2, m1
paddw m2, m5
CLIPW m2, m0, [pw_mask %+ %1]
mova [dstq + i], m2
%assign i i+mmsize
%endrep
add dstq, dststrideq
add srcq, EDGE_SRCSTRIDE
dec heightd
jg .loop
RET
%endmacro
; sse2: 10-bit and 12-bit edge filters for all widths with 16-byte vectors.
; Arguments: bitdepth, width, vectors per row.
INIT_XMM sse2
HEVC_SAO_EDGE_FILTER 10, 8, 1
HEVC_SAO_EDGE_FILTER 10, 16, 2
HEVC_SAO_EDGE_FILTER 10, 32, 4
HEVC_SAO_EDGE_FILTER 10, 48, 6
HEVC_SAO_EDGE_FILTER 10, 64, 8
HEVC_SAO_EDGE_FILTER 12, 8, 1
HEVC_SAO_EDGE_FILTER 12, 16, 2
HEVC_SAO_EDGE_FILTER 12, 32, 4
HEVC_SAO_EDGE_FILTER 12, 48, 6
HEVC_SAO_EDGE_FILTER 12, 64, 8
%if HAVE_AVX2_EXTERNAL
; avx2: width 8 fits one xmm register, wider blocks use 32-byte vectors
INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 10, 8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 10, 16, 1
HEVC_SAO_EDGE_FILTER 10, 32, 2
HEVC_SAO_EDGE_FILTER 10, 48, 3
HEVC_SAO_EDGE_FILTER 10, 64, 4
INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 12, 8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 12, 16, 1
HEVC_SAO_EDGE_FILTER 12, 32, 2
HEVC_SAO_EDGE_FILTER 12, 48, 3
HEVC_SAO_EDGE_FILTER 12, 64, 4
%endif

259
externals/ffmpeg/libavcodec/x86/hevcdsp.h vendored Executable file
View File

@@ -0,0 +1,259 @@
/*
* HEVC video decoder
*
* Copyright (C) 2012 - 2013 Guillaume Martres
* Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
*
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_HEVCDSP_H
#define AVCODEC_X86_HEVCDSP_H
#include <stddef.h>
#include <stdint.h>
/*
 * Store the five variants (plain, bi, uni, uni_w, bi_w) of one motion
 * compensation function into the tables dst, dst_bi, dst_uni, dst_uni_w and
 * dst_bi_w at [idx1][idx2][idx3]. The function names are built from
 * name, D and opt. The last assignment has no trailing semicolon; the
 * caller supplies it.
 */
#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
/*
 * Declare the five variants (plain, bi, uni, uni_w, bi_w) of one motion
 * compensation function; these are the same names that PEL_LINK assigns.
 * The last declaration has no trailing semicolon; the caller supplies it.
 */
#define PEL_PROTOTYPE(name, D, opt) \
void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
///////////////////////////////////////////////////////////////////////////////
// MC functions
///////////////////////////////////////////////////////////////////////////////
/*
 * Declare all PEL_PROTOTYPE variants of fname for the widths
 * 4, 6, 8, 12, 16, 24, 32, 48 and 64 at the given bit depth and
 * instruction set suffix.
 */
#define EPEL_PROTOTYPES(fname, bitd, opt) \
PEL_PROTOTYPE(fname##4, bitd, opt); \
PEL_PROTOTYPE(fname##6, bitd, opt); \
PEL_PROTOTYPE(fname##8, bitd, opt); \
PEL_PROTOTYPE(fname##12, bitd, opt); \
PEL_PROTOTYPE(fname##16, bitd, opt); \
PEL_PROTOTYPE(fname##24, bitd, opt); \
PEL_PROTOTYPE(fname##32, bitd, opt); \
PEL_PROTOTYPE(fname##48, bitd, opt); \
PEL_PROTOTYPE(fname##64, bitd, opt)
/*
 * Declare all PEL_PROTOTYPE variants of fname for the widths
 * 4, 8, 12, 16, 24, 32, 48 and 64 (same list as EPEL_PROTOTYPES without
 * width 6).
 */
#define QPEL_PROTOTYPES(fname, bitd, opt) \
PEL_PROTOTYPE(fname##4, bitd, opt); \
PEL_PROTOTYPE(fname##8, bitd, opt); \
PEL_PROTOTYPE(fname##12, bitd, opt); \
PEL_PROTOTYPE(fname##16, bitd, opt); \
PEL_PROTOTYPE(fname##24, bitd, opt); \
PEL_PROTOTYPE(fname##32, bitd, opt); \
PEL_PROTOTYPE(fname##48, bitd, opt); \
PEL_PROTOTYPE(fname##64, bitd, opt)
/*
 * Declare the uni_w and bi_w weighting functions for one width, bit depth
 * and instruction set suffix. Unlike the PEL_PROTOTYPE variants these take
 * int16_t sources and no srcstride / mx / my / width arguments.
 */
#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)
/*
 * Declare the weighting functions for the widths
 * 2, 4, 6, 8, 12, 16, 24, 32, 48 and 64.
 */
#define WEIGHTING_PROTOTYPES(bitd, opt) \
WEIGHTING_PROTOTYPE(2, bitd, opt); \
WEIGHTING_PROTOTYPE(4, bitd, opt); \
WEIGHTING_PROTOTYPE(6, bitd, opt); \
WEIGHTING_PROTOTYPE(8, bitd, opt); \
WEIGHTING_PROTOTYPE(12, bitd, opt); \
WEIGHTING_PROTOTYPE(16, bitd, opt); \
WEIGHTING_PROTOTYPE(24, bitd, opt); \
WEIGHTING_PROTOTYPE(32, bitd, opt); \
WEIGHTING_PROTOTYPE(48, bitd, opt); \
WEIGHTING_PROTOTYPE(64, bitd, opt)
///////////////////////////////////////////////////////////////////////////////
// QPEL_PIXELS EPEL_PIXELS
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(pel_pixels , 8, sse4);
EPEL_PROTOTYPES(pel_pixels , 10, sse4);
EPEL_PROTOTYPES(pel_pixels , 12, sse4);
void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
///////////////////////////////////////////////////////////////////////////////
// EPEL
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(epel_h , 8, sse4);
EPEL_PROTOTYPES(epel_h , 10, sse4);
EPEL_PROTOTYPES(epel_h , 12, sse4);
EPEL_PROTOTYPES(epel_v , 8, sse4);
EPEL_PROTOTYPES(epel_v , 10, sse4);
EPEL_PROTOTYPES(epel_v , 12, sse4);
EPEL_PROTOTYPES(epel_hv , 8, sse4);
EPEL_PROTOTYPES(epel_hv , 10, sse4);
EPEL_PROTOTYPES(epel_hv , 12, sse4);
PEL_PROTOTYPE(epel_h16, 8, avx2);
PEL_PROTOTYPE(epel_h24, 8, avx2);
PEL_PROTOTYPE(epel_h32, 8, avx2);
PEL_PROTOTYPE(epel_h48, 8, avx2);
PEL_PROTOTYPE(epel_h64, 8, avx2);
PEL_PROTOTYPE(epel_h16,10, avx2);
PEL_PROTOTYPE(epel_h24,10, avx2);
PEL_PROTOTYPE(epel_h32,10, avx2);
PEL_PROTOTYPE(epel_h48,10, avx2);
PEL_PROTOTYPE(epel_h64,10, avx2);
PEL_PROTOTYPE(epel_v16, 8, avx2);
PEL_PROTOTYPE(epel_v24, 8, avx2);
PEL_PROTOTYPE(epel_v32, 8, avx2);
PEL_PROTOTYPE(epel_v48, 8, avx2);
PEL_PROTOTYPE(epel_v64, 8, avx2);
PEL_PROTOTYPE(epel_v16,10, avx2);
PEL_PROTOTYPE(epel_v24,10, avx2);
PEL_PROTOTYPE(epel_v32,10, avx2);
PEL_PROTOTYPE(epel_v48,10, avx2);
PEL_PROTOTYPE(epel_v64,10, avx2);
PEL_PROTOTYPE(epel_hv16, 8, avx2);
PEL_PROTOTYPE(epel_hv24, 8, avx2);
PEL_PROTOTYPE(epel_hv32, 8, avx2);
PEL_PROTOTYPE(epel_hv48, 8, avx2);
PEL_PROTOTYPE(epel_hv64, 8, avx2);
PEL_PROTOTYPE(epel_hv16,10, avx2);
PEL_PROTOTYPE(epel_hv24,10, avx2);
PEL_PROTOTYPE(epel_hv32,10, avx2);
PEL_PROTOTYPE(epel_hv48,10, avx2);
PEL_PROTOTYPE(epel_hv64,10, avx2);
///////////////////////////////////////////////////////////////////////////////
// QPEL
///////////////////////////////////////////////////////////////////////////////
QPEL_PROTOTYPES(qpel_h , 8, sse4);
QPEL_PROTOTYPES(qpel_h , 10, sse4);
QPEL_PROTOTYPES(qpel_h , 12, sse4);
QPEL_PROTOTYPES(qpel_v, 8, sse4);
QPEL_PROTOTYPES(qpel_v, 10, sse4);
QPEL_PROTOTYPES(qpel_v, 12, sse4);
QPEL_PROTOTYPES(qpel_hv, 8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 12, sse4);
PEL_PROTOTYPE(qpel_h16, 8, avx2);
PEL_PROTOTYPE(qpel_h24, 8, avx2);
PEL_PROTOTYPE(qpel_h32, 8, avx2);
PEL_PROTOTYPE(qpel_h48, 8, avx2);
PEL_PROTOTYPE(qpel_h64, 8, avx2);
PEL_PROTOTYPE(qpel_h16,10, avx2);
PEL_PROTOTYPE(qpel_h24,10, avx2);
PEL_PROTOTYPE(qpel_h32,10, avx2);
PEL_PROTOTYPE(qpel_h48,10, avx2);
PEL_PROTOTYPE(qpel_h64,10, avx2);
PEL_PROTOTYPE(qpel_v16, 8, avx2);
PEL_PROTOTYPE(qpel_v24, 8, avx2);
PEL_PROTOTYPE(qpel_v32, 8, avx2);
PEL_PROTOTYPE(qpel_v48, 8, avx2);
PEL_PROTOTYPE(qpel_v64, 8, avx2);
PEL_PROTOTYPE(qpel_v16,10, avx2);
PEL_PROTOTYPE(qpel_v24,10, avx2);
PEL_PROTOTYPE(qpel_v32,10, avx2);
PEL_PROTOTYPE(qpel_v48,10, avx2);
PEL_PROTOTYPE(qpel_v64,10, avx2);
PEL_PROTOTYPE(qpel_hv16, 8, avx2);
PEL_PROTOTYPE(qpel_hv24, 8, avx2);
PEL_PROTOTYPE(qpel_hv32, 8, avx2);
PEL_PROTOTYPE(qpel_hv48, 8, avx2);
PEL_PROTOTYPE(qpel_hv64, 8, avx2);
PEL_PROTOTYPE(qpel_hv16,10, avx2);
PEL_PROTOTYPE(qpel_hv24,10, avx2);
PEL_PROTOTYPE(qpel_hv32,10, avx2);
PEL_PROTOTYPE(qpel_hv48,10, avx2);
PEL_PROTOTYPE(qpel_hv64,10, avx2);
WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
WEIGHTING_PROTOTYPES(12, sse4);
///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////
void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
#endif // AVCODEC_X86_HEVCDSP_H

1151
externals/ffmpeg/libavcodec/x86/hevcdsp_init.c vendored Executable file

File diff suppressed because it is too large Load Diff

591
externals/ffmpeg/libavcodec/x86/hpeldsp.asm vendored Executable file
View File

@@ -0,0 +1,591 @@
;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; Read-only constants. pb_1, pw_2 and pw_8192 are imported from another object
; (cextern); only the two shuffle tables below are defined in this file.
SECTION_RODATA
cextern pb_1
cextern pw_2
; pshufb control masks used by SSSE3_PIXELS_XY2 right after packuswb: they
; weave the bytes of the low half of the register with the bytes of the high
; half (0,8,1,9,... for 16-byte registers, 0,4,1,5,... for 8-byte registers).
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
cextern pw_8192
SECTION .text
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Horizontal half-pel copy: every destination byte is the byte average of a
; source pixel and the pixel to its right. When assembled with the sse2
; cpuflag the function is emitted as put_pixels16_x2 instead.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size.
; Four rows are written per pass and the loop ends only when the counter hits
; exactly zero, so h is expected to be a positive multiple of 4.
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.four_rows:
    ; rows 0 and 1
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    ; rows 2 and 3
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; 16-pixel-wide horizontal half-pel copy done as two 8-byte halves per row
; (offsets +0 and +8). Each output byte is PAVGB of a source pixel and the
; pixel to its right.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size.
; Four rows per pass; the loop exits only on a counter of exactly zero, so h
; is expected to be a positive multiple of 4.
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]
.four_rows:
    ; rows 0 and 1: left halves in m0/m1, right halves in m2/m3
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    ; rows 2 and 3
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
; (under the sse2 cpuflag it emits put_pixels16_x2 with 16-byte registers)
INIT_XMM sse2
PUT_PIXELS8_X2
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Horizontal half-pel copy without round-up, approximated: pb_1 is subtracted
; (with unsigned saturation) from the left pixel before PAVGB is applied with
; the right pixel. The init code in hpeldsp_init.c installs this routine only
; when AV_CODEC_FLAG_BITEXACT is not set.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size; m6 = pb_1.
; Four rows per pass; h is expected to be a positive multiple of 4 because the
; loop exits only on a counter of exactly zero.
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
.four_rows:
    ; rows 0 and 1 (source pointer is advanced before the arithmetic)
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    ; rows 2 and 3
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Vertical half-pel copy: every destination row is PAVGB of a source row and
; the row below it. Under the sse2 cpuflag the function is emitted as
; put_pixels16_y2.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size.
; r0 is biased by -line_size up front so that the stores can use the same
; [base+r2] / [base+r4] addressing as the loads.
; The bottom source row of one pass is kept in a register (m0) and reused as
; the top row of the next pass. Four rows per pass; h is expected to be a
; positive multiple of 4.
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.four_rows:
    ; m0 = row n; produce output rows n and n+1
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    ; m2 = row n+2; produce output rows n+2 and n+3, leave row n+4 in m0
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2
; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Vertical half-pel copy without round-up, approximated: pb_1 is subtracted
; (with unsigned saturation) from every second source row before that row
; takes part in the two PAVGB operations that use it. The init code in
; hpeldsp_init.c installs this routine only when AV_CODEC_FLAG_BITEXACT is
; not set.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size; m6 = pb_1.
; r0 is biased by -line_size so stores use [base+r2] / [base+r4].
; Four rows per pass; h is expected to be a positive multiple of 4.
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]
    mova         m0, [r1]
    sub          r0, r2
.four_rows:
    ; m0 = row n; m1 = row n+1 (decremented), m2 = row n+2
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    ; m1 = row n+3 (decremented), m0 = row n+4 (carried into the next pass)
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2
; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Full-pel "avg": each destination row is replaced by PAVGB of itself and the
; matching source row. Only a 3dnow instance is generated here.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size.
; Four rows per pass; h is expected to be a positive multiple of 4 because the
; loop exits only on a counter of exactly zero.
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea          r4, [r2*2]
.four_rows:
    ; rows 0 and 1
    mova         m0, [r0]
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    ; rows 2 and 3
    mova         m0, [r0]
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Horizontal half-pel "avg": the byte average of each source pixel and its
; right-hand neighbour is itself averaged with the existing destination byte.
; Under the sse2 cpuflag the function is emitted as avg_pixels16_x2.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size.
; When the mmxext cpuflag is absent, m5 is set to 0xFE in every byte
; (all-ones doubled) and m3/m4 are handed to PAVGB as extra operands -
; presumably the mask and scratch registers its fallback needs; PAVGB itself
; is defined in x86util.asm, not here.
; Four rows per pass; h is expected to be a positive multiple of 4.
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
%if notcpuflag(mmxext)
    pcmpeqd      m5, m5
    paddb        m5, m5
%endif
.four_rows:
    ; rows 0 and 1
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    ; rows 2 and 3
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmx
AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2
; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Vertical half-pel "avg": PAVGB of each source row with the row below it is
; averaged once more with the existing destination row. Under the sse2
; cpuflag the function is emitted as avg_pixels16_y2.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size.
; r0 is biased by -line_size so destination rows are addressed as
; [r0+r2] / [r0+r4]. The last source row of a pass stays in m0 for the next
; pass. Four rows per pass; h is expected to be a positive multiple of 4.
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.four_rows:
    ; m0 = row n; output rows n and n+1
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    ; m2 = row n+2; output rows n+2 and n+3, row n+4 is left in m0
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2
; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Approximate diagonal (x+y) half-pel "avg". Instead of summing four pixels
; with a single rounding step, it chains PAVGB: horizontal average per row,
; then average of two such rows, then average with the destination. pb_1 is
; subtracted (saturating) from every second source row before averaging.
; Note this is not correctly rounded, and is therefore used only for
; not-bitexact output.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = 2 * line_size; m6 = pb_1.
; m0 enters the loop holding the horizontal average of the first source row.
; Four rows per pass; h is expected to be a positive multiple of 4.
%macro AVG_APPROX_PIXELS8_XY2 0
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
    mova         m0, [r1]
    PAVGB        m0, [r1+1]
.four_rows:
    ; horizontal averages of rows n+1 (m1) and n+2 (m2, after the -1 bias)
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    ; horizontal averages of rows n+3 (m1) and n+4 (m0, carried over)
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m0, [r1+r4+1]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jnz .four_rows
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2
; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Diagonal (x+y) half-pel filter with word precision. %1 selects the
; operation: "put" stores the filtered value, "avg" additionally averages it
; with the existing destination (PAVGB). The name is %1_pixels16_xy2 under
; the sse2 cpuflag and %1_pixels8_xy2 otherwise.
;
; Per output pixel: (a + b + c + d + pw_2) >> 2, where a,b are horizontally
; adjacent pixels of one row and c,d the same columns of the next row. Bytes
; are widened to words by unpacking against the zero register m7.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = running byte offset of the current row.
; r1 is advanced by one row before the loop, so [r1+r4] is the row below the
; one being written at [r0+r4]. m6 = pw_2, m7 = 0.
; The horizontal pair sums of the most recent row stay in registers
; (m4/m5 or m0/m1, low/high half) and are reused for the following row.
; Two rows per pass; the loop exits only on a counter of exactly zero, so h
; is expected to be a positive multiple of 2.
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor         m7, m7
    mova         m6, [pw_2]
    ; horizontal pair sums of the first row -> m4 (low half), m5 (high half)
    movu         m0, [r1]
    movu         m4, [r1+1]
    mova         m1, m0
    mova         m5, m4
    punpcklbw    m0, m7
    punpcklbw    m4, m7
    punpckhbw    m1, m7
    punpckhbw    m5, m7
    paddusw      m4, m0
    paddusw      m5, m1
    xor          r4, r4
    add          r1, r2
.two_rows:
    ; pair sums of the next row -> m0/m1; combine with m4/m5 from the row above
    movu         m0, [r1+r4]
    movu         m2, [r1+r4+1]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpcklbw    m2, m7
    punpckhbw    m1, m7
    punpckhbw    m3, m7
    paddusw      m0, m2
    paddusw      m1, m3
    paddusw      m4, m6
    paddusw      m5, m6
    paddusw      m4, m0
    paddusw      m5, m1
    psrlw        m4, 2
    psrlw        m5, 2
    packuswb     m4, m5
%ifidn %1, avg
    mova         m3, [r0+r4]
    PAVGB        m4, m3
%endif
    mova    [r0+r4], m4
    add          r4, r2
    ; pair sums of the row after that -> m4/m5; combine with m0/m1
    movu         m2, [r1+r4]
    movu         m4, [r1+r4+1]
    mova         m3, m2
    mova         m5, m4
    punpcklbw    m2, m7
    punpcklbw    m4, m7
    punpckhbw    m3, m7
    punpckhbw    m5, m7
    paddusw      m4, m2
    paddusw      m5, m3
    paddusw      m0, m6
    paddusw      m1, m6
    paddusw      m0, m4
    paddusw      m1, m5
    psrlw        m0, 2
    psrlw        m1, 2
    packuswb     m0, m1
%ifidn %1, avg
    mova         m3, [r0+r4]
    PAVGB        m0, m3
%endif
    mova    [r0+r4], m0
    add          r4, r2
    sub         r3d, 2
    jnz .two_rows
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_MMX 3dnow
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
; void ff_{put,avg}_pixels{8,16}_xy2_ssse3(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; SSSE3 diagonal (x+y) half-pel filter.
;   %1 = put | avg   (avg also averages with the existing destination, pavgb)
;   %2 = optional XMM register count; when present the 16-pixel variant
;        (%1_pixels16_xy2) is built, otherwise the 8-pixel one (%1_pixels8_xy2).
;
; pmaddubsw against pb_1 adds neighbouring byte pairs into words: applied to
; [row] it yields the sums that start on even columns, applied to [row+1] the
; sums that start on odd columns. Two vertically adjacent rows are added and
; scaled with pmulhrsw by pw_8192 (presumably words of 8192, which makes the
; step equal to (x + 2) >> 2). packuswb then leaves all even-column results in
; the low half and all odd-column results in the high half; pshufb with the
; interleave table (m4) restores pixel order.
;
; Registers (arguments in prototype order): r0 = block, r1 = pixels,
; r2 = line_size, r3d = h; r4 = running byte offset of the current row.
; m5 = pb_1; m0/m1 and m2/m3 alternately carry the pair sums of the last row.
; Two rows per pass; h is expected to be a positive multiple of 2.
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; register count given: 16-pixel XMM variant
cglobal %1_pixels16_xy2, 4,5,%2
    mova         m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova         m4, [pb_interleave8]
%endif
    mova         m5, [pb_1]
    movu         m0, [r1]
    movu         m1, [r1+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    xor          r4, r4
    add          r1, r2
.two_rows:
    ; next row -> m2/m3; output built in m0/m1
    movu         m2, [r1+r4]
    movu         m3, [r1+r4+1]
    pmaddubsw    m2, m5
    pmaddubsw    m3, m5
    paddusw      m0, m2
    paddusw      m1, m3
    pmulhrsw     m0, [pw_8192]
    pmulhrsw     m1, [pw_8192]
    packuswb     m0, m1
    pshufb       m0, m4
%ifidn %1, avg
    mova         m6, [r0+r4]
    pavgb        m0, m6
%endif
    mova    [r0+r4], m0
    add          r4, r2
    ; row after that -> m0/m1; output built in m2/m3
    movu         m0, [r1+r4]
    movu         m1, [r1+r4+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    paddusw      m2, m0
    paddusw      m3, m1
    pmulhrsw     m2, [pw_8192]
    pmulhrsw     m3, [pw_8192]
    packuswb     m2, m3
    pshufb       m2, m4
%ifidn %1, avg
    mova         m6, [r0+r4]
    pavgb        m2, m6
%endif
    mova    [r0+r4], m2
    add          r4, r2
    sub         r3d, 2
    jnz .two_rows
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7

57
externals/ffmpeg/libavcodec/x86/hpeldsp.h vendored Executable file
View File

@@ -0,0 +1,57 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_HPELDSP_H
#define AVCODEC_X86_HPELDSP_H
#include <stddef.h>
#include <stdint.h>
#include "libavcodec/hpeldsp.h"
/*
 * Half-pel primitives shared between x86 translation units. All of them take
 * the same arguments: destination block, source pixels, the row stride in
 * bytes (line_size) and the number of rows (h).
 * Naming: {put,avg}_pixels{8,16}[_x2|_y2|_xy2]_<cpu>, where the number is the
 * block width and the optional suffix names the half-pel direction.
 */

/* "avg" variants */
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/* "put" variants */
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/* VP3-specific initialisation; called from ff_hpeldsp_init_x86() when
 * CONFIG_VP3_DECODER is set. */
void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
#endif /* AVCODEC_X86_HPELDSP_H */

313
externals/ffmpeg/libavcodec/x86/hpeldsp_init.c vendored Executable file
View File

@@ -0,0 +1,313 @@
/*
* SIMD-optimized halfpel functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"
/*
 * Prototypes for routines assembled from hpeldsp.asm that are used only by
 * this file. The trailing _mmxext / _3dnow / _sse2 suffix corresponds to the
 * INIT_* instantiation that emits the symbol.
 */

/* put, horizontal half-pel */
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/* 16-wide SSE2 variants (x2 and y2, put and avg) */
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/* 8-wide put / put_no_rnd, x2 and y2 */
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/* 8-wide avg: full-pel, x2, y2, xy2 */
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/* approximate (not bit-exact) xy2 averages; only installed when
 * AV_CODEC_FLAG_BITEXACT is not requested */
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx ff_avg_pixels16_mmx
#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx
#define put_pixels8_mmx ff_put_pixels8_mmx
#define put_pixels16_mmx ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
/*
 * Inline-asm MMX functions are generated by including template files several
 * times with different macro settings:
 *   DEF(x, y)     - builds the name of each generated function
 *   SET_RND       - rounding-constant setup used by the templates
 *   PAVGBP, PAVGB - averaging primitives (no-round or rounding flavour)
 *   STATIC        - storage class used by rnd_template.c
 * The order of the #define / #include / #undef lines below is significant.
 */
#if HAVE_INLINE_ASM
/***********************************/
/* MMX no rounding */
/* Pass 1: names of the form x_no_rnd_y_mmx, all functions static. */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static
#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef STATIC
#if HAVE_MMX
/* 16-wide no_rnd wrappers built from the 8-wide functions via CALL_2X_PIXELS
 * (defined elsewhere; presumably it calls the 8-wide function twice, the
 * second time 8 bytes further right). */
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif
/***********************************/
/* MMX rounding */
/* Pass 2a: names of the form x_y_mmx for hpeldsp_rnd_template.c. */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "hpeldsp_rnd_template.c"
#undef DEF
/* Pass 2b: rnd_template.c is instantiated with ff_-prefixed names and an
 * empty STATIC, so its functions get external linkage (hpeldsp.h declares
 * ff_put_pixels8_xy2_mmx and ff_avg_pixels8_xy2_mmx). STATIC is not
 * #undef'd after this pass. */
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC
#include "rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#if HAVE_MMX
/* 16-wide rounding wrappers; the xy2 ones are exported (declared in
 * hpeldsp.h). */
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
#endif
#endif /* HAVE_INLINE_ASM */
#if HAVE_X86ASM
/*
 * For one CPU suffix (CPUEXT, e.g. _mmxext), define the 16-pixel-wide
 * functions that have no dedicated assembly by wrapping the matching 8-wide
 * ff_* routine with CALL_2X_PIXELS. The generated names carry no ff_ prefix;
 * they are the ones referenced by hpeldsp_init_mmxext() and
 * hpeldsp_init_3dnow().
 */
#define HPELDSP_AVG_PIXELS16(CPUEXT) \
CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
/* Instantiate the wrappers for both CPU flavours. */
HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
#endif /* HAVE_X86ASM */
/*
 * SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU): when HAVE_MMX_EXTERNAL is nonzero,
 * store the full-pel function PFX_pixels<SIZE>_<CPU> in slot 0 of
 * c->PFX_pixels_tab IDX. IDX is either a bracketed row index such as [0] or
 * empty for a one-dimensional table.
 * NOTE(review): the expansion is a bare `if` statement including its
 * semicolon; it is only safe where a following `else` cannot bind to it.
 */
#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
if (HAVE_MMX_EXTERNAL) \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;
/*
 * SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU): fill one table row. Slot 0 (full-pel)
 * goes through SET_HPEL_FUNCS_EXT; with HAVE_MMX_INLINE, slots 1..3 are also
 * set to the _x2, _y2 and _xy2 functions. Without HAVE_MMX_INLINE only
 * slot 0 is touched.
 */
#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
#else
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
} while (0)
#endif
/*
 * Install the baseline MMX implementations into the context's function
 * tables. Row [0] of a table holds the 16-pixel-wide functions and row [1]
 * the 8-pixel-wide ones; avg_no_rnd_pixels_tab has a single row.
 * The flags argument is not used here.
 */
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
    /* 8-wide "avg" row: filled by hand because its entries come from a mix
     * of external-asm and inline-asm builds. */
#if HAVE_MMX_INLINE
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
#endif
    if (HAVE_MMX_EXTERNAL) {
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
    }

    /* 8-wide "put" rows */
    SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
    SET_HPEL_FUNCS(put, [1], 8, mmx);

    /* 16-wide rows */
    SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
    SET_HPEL_FUNCS(avg, [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(put, [0], 16, mmx);
}
/*
 * Install the MMXEXT assembly routines (compiled in only when
 * HAVE_MMXEXT_EXTERNAL is set). Row [0] = 16 pixels wide, row [1] = 8 pixels
 * wide; column 1 = x2, 2 = y2, 3 = xy2. The approximate no_rnd / xy2
 * variants are installed only if the caller did not ask for bit-exact
 * output.
 */
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
    const int approx_ok = !(flags & AV_CODEC_FLAG_BITEXACT);

    /* 8-wide: direct assembly entry points */
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;

    /* 16-wide: native x2 put, the rest are 2x8 wrappers from
     * HPELDSP_AVG_PIXELS16 */
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;

    c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;

    if (approx_ok) {
        /* approximations replace the xy2 averages set just above */
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;

        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}
/*
 * Install the 3DNow! assembly routines (compiled in only when
 * HAVE_AMD3DNOW_EXTERNAL is set). Same table layout as the MMXEXT variant:
 * row [0] = 16 wide, row [1] = 8 wide; column 1 = x2, 2 = y2, 3 = xy2.
 * The approximate no_rnd / xy2 variants are installed only if bit-exact
 * output was not requested.
 */
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
    const int approx_ok = !(flags & AV_CODEC_FLAG_BITEXACT);

    /* 8-wide: direct assembly entry points */
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;

    /* 16-wide: native x2 put, the rest are 2x8 wrappers from
     * HPELDSP_AVG_PIXELS16 */
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

    c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;

    if (approx_ok) {
        /* approximations replace the xy2 averages set just above */
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;

        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
    }
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
/*
 * Install the SSE2 routines for the 16-pixel-wide row (index [0]) of the put
 * and avg tables; compiled in only when HAVE_SSE2_EXTERNAL is set. The
 * full-pel put_no_rnd slot reuses ff_put_pixels16_sse2. The flags argument
 * is not used here.
 */
static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
{
#if HAVE_SSE2_EXTERNAL
    /* avg: full-pel, x2, y2, xy2 */
    c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;

    /* put: full-pel, x2, y2, xy2 */
    c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;

    /* full-pel copy shared with the no_rnd table */
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
/*
 * Install the SSSE3 xy2 (column 3) routines for both block widths; compiled
 * in only when HAVE_SSSE3_EXTERNAL is set. The flags argument is not used
 * here.
 */
static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
{
#if HAVE_SSSE3_EXTERNAL
    /* 8 pixels wide */
    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;

    /* 16 pixels wide */
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
#endif
}
/*
 * x86 entry point: query the CPU flags once and run every applicable
 * per-extension initialiser.
 *
 * The call order is significant: each later initialiser overwrites the table
 * slots it provides, so the last matching extension in this sequence wins
 * for any given slot.
 */
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    const int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags)) {
        hpeldsp_init_mmx(c, flags);
    }

    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        hpeldsp_init_3dnow(c, flags);
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        hpeldsp_init_mmxext(c, flags);
    }

    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
        hpeldsp_init_sse2_fast(c, flags);
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        hpeldsp_init_ssse3(c, flags);
    }

    /* VP3 has its own overrides; it receives the CPU flags explicitly. */
    if (CONFIG_VP3_DECODER) {
        ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
    }
}

View File

@@ -0,0 +1,202 @@
/*
* SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
// put_pixels
/*
 * Horizontal half-pel "put" for an 8-pixel-wide block: each output row is
 * PAVGBP's average of the source row and the same row shifted by one byte.
 * The function name is produced by DEF() and the averaging flavour by
 * PAVGBP, both supplied by the including file.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * FF_REG_a is loaded with 2 * line_size.
 * Four rows are handled per loop pass and the loop exits only when h reaches
 * exactly 0, so h is expected to be a positive multiple of 4.
 * NOTE(review): MOVQ_BFE(mm6) is defined elsewhere; presumably it loads the
 * byte mask that the PAVGBP implementations expect in mm6 - confirm.
 * NOTE(review): the MMX registers written by the asm are not listed as
 * clobbers.
 */
av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
/* rows 0 and 1: mm4 = avg(mm0, mm1), mm5 = avg(mm2, mm3) */
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
/* rows 2 and 3 */
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
/*
 * Horizontal half-pel "put" for a 16-pixel-wide block. Same scheme as the
 * 8-wide version, but every row is processed as two 8-byte halves (byte
 * offsets 0 and 8).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * FF_REG_a is loaded with 2 * line_size.
 * Four rows per loop pass; the loop exits only when h reaches exactly 0, so
 * h is expected to be a positive multiple of 4.
 * NOTE(review): MOVQ_BFE(mm6) is defined elsewhere; presumably it loads the
 * byte mask that the PAVGBP implementations expect in mm6 - confirm.
 */
av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
/* rows 0 and 1, left half */
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
/* rows 0 and 1, right half */
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
/* rows 2 and 3, left half */
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
/* rows 2 and 3, right half */
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
/*
 * Vertical half-pel "put" for an 8-pixel-wide block: each output row is
 * PAVGBP's average of a source row and the row below it.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * FF_REG_a is loaded with 2 * line_size.
 * The first source row is loaded into mm0 before the loop; afterwards the
 * last row read in one half of the loop body (mm2, then mm0) is reused as
 * the top row of the next half.
 * Four rows per loop pass; the loop exits only when h reaches exactly 0, so
 * h is expected to be a positive multiple of 4.
 * NOTE(review): MOVQ_BFE(mm6) is defined elsewhere; presumably it loads the
 * byte mask that the PAVGBP implementations expect in mm6 - confirm.
 */
av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
/* mm0 = row n, mm1 = row n+1, mm2 = row n+2 */
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"),%%mm2\n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
/* mm2 = row n+2, mm1 = row n+3, mm0 = row n+4 */
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"),%%mm0\n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
/*
 * Horizontal half-pel "avg" for a 16-pixel-wide block. For each 8-byte half
 * of a row, the source and the source shifted by one byte are combined with
 * the template's PAVGB, and that result is then merged with the existing
 * destination bytes using PAVGB_MMX (always that macro, regardless of which
 * PAVGB flavour the including file selected).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * Unlike the other functions in this template, one row is handled per loop
 * pass (h is decremented by 1) and FF_REG_a is not used.
 * NOTE(review): MOVQ_BFE(mm6) is defined elsewhere; mm6 is passed to both
 * averaging macros, presumably as their byte mask - confirm.
 */
av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
".p2align 3 \n\t"
"1: \n\t"
/* left half: mm2 = horizontal average, mm0 = merge with destination */
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, (%2) \n\t"
/* right half */
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, 8(%2) \n\t"
"add %3, %1 \n\t"
"add %3, %2 \n\t"
"subl $1, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:"memory");
}
/*
 * Vertical half-pel "avg" for an 8-pixel-wide block: PAVGBP combines each
 * source row with the row below it, and the result is merged with the
 * existing destination row using PAVGB_MMX before being stored.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * FF_REG_a is loaded with 2 * line_size.
 * The first source row is loaded into mm0 before the loop; the last row read
 * in each half of the loop body is reused as the top row of the next half.
 * Four rows per loop pass; the loop exits only when h reaches exactly 0, so
 * h is expected to be a positive multiple of 4.
 * NOTE(review): MOVQ_BFE(mm6) is defined elsewhere; mm6 is passed to
 * PAVGB_MMX, presumably as its byte mask - confirm.
 */
av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
/* mm0 = row n, mm1 = row n+1, mm2 = row n+2 -> vertical averages in mm4/mm5 */
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
/* merge with the two destination rows */
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
/* mm2 = row n+2, mm1 = row n+3, mm0 = row n+4 */
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm2, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}

View File

@@ -0,0 +1,111 @@
;******************************************************************************
;* SIMD-optimized halfpel functions for VP3
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Combines every 8-byte source row with the same row shifted by one byte and
; stores the result to block, four rows per loop iteration.
; Registers (cglobal argument order): r0 = block, r1 = pixels,
; r2 = line_size, r3 = h, r4 = scratch holding 3 * line_size.
; Both inputs are inverted (pxor with the all-ones m6), averaged with PAVGB
; and the result is inverted again. Assuming PAVGB rounds up like pavgb, this
; yields the rounded-down average (a + b) >> 1 -- TODO confirm against the
; PAVGB definition in x86util.asm.
; The loop runs while the remaining count is > 0 after subtracting 4, so a
; height that is not a multiple of 4 is effectively rounded up.
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
lea r4, [r2*3]
pcmpeqb m6, m6 ; m6 = all ones, used as the inversion mask
.loop:
; rows 0 and 1: m0/m2 = pixels at x, m1/m3 = pixels at x+1
mova m0, [r1]
mova m2, [r1+r2]
mova m1, [r1+1]
mova m3, [r1+r2+1]
pxor m0, m6
pxor m2, m6
pxor m1, m6
pxor m3, m6
PAVGB m0, m1
PAVGB m2, m3
pxor m0, m6
pxor m2, m6
mova [r0], m0
mova [r0+r2], m2
; rows 2 and 3: same sequence at offsets 2*line_size and 3*line_size
mova m0, [r1+r2*2]
mova m1, [r1+r2*2+1]
mova m2, [r1+r4]
mova m3, [r1+r4+1]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m1
PAVGB m2, m3
pxor m0, m6
pxor m2, m6
mova [r0+r2*2], m0
mova [r0+r4], m2
; advance source and destination by four rows
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
sub r3d, 4
jg .loop
REP_RET
%endmacro
; instantiate once per instruction set; the init code picks one at run time
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT
; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Combines every 8-byte source row with the row below it and stores the
; result to block, four rows per loop iteration.
; Registers (cglobal argument order): r0 = block, r1 = pixels,
; r2 = line_size, r3 = h, r4 = scratch holding 3 * line_size.
; Rows are kept inverted (pxor with the all-ones m6) while they are averaged
; with PAVGB and un-inverted just before the store. Assuming PAVGB rounds up
; like pavgb, this gives the rounded-down average -- TODO confirm against the
; PAVGB definition in x86util.asm.
; The bottom row of one iteration stays (inverted) in m0 and becomes the top
; row of the next iteration, so each source row is loaded only once.
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
lea r4, [r2*3]
mova m0, [r1] ; first source row
pcmpeqb m6, m6 ; m6 = all ones, used as the inversion mask
add r1, r2 ; r1 now points at the second source row
pxor m0, m6
.loop:
; output rows 0 and 1 from inverted source rows m0, m1, m2
mova m1, [r1]
mova m2, [r1+r2]
pxor m1, m6
pxor m2, m6
PAVGB m0, m1
PAVGB m1, m2
pxor m0, m6
pxor m1, m6
mova [r0], m0
mova [r0+r2], m1
; output rows 2 and 3; m2 is still inverted, m0 is reloaded as the carry row
mova m1, [r1+r2*2]
mova m0, [r1+r4]
pxor m1, m6
pxor m0, m6
PAVGB m2, m1
PAVGB m1, m0
pxor m2, m6
pxor m1, m6
mova [r0+r2*2], m2
mova [r0+r4], m1
; advance source and destination by four rows
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
sub r3d, 4
jg .loop
REP_RET
%endmacro
; instantiate once per instruction set; the init code picks one at run time
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT

View File

@@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp.h"
/* Bit-exact no-rounding half-pel routines implemented in hpeldsp_vp3.asm. */
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);

/**
 * Install the bit-exact x2/y2 no-rounding put routines for 8-pixel blocks.
 *
 * Nothing is changed unless AV_CODEC_FLAG_BITEXACT is set in flags. When
 * both 3DNow! and MMXEXT are available, the MMXEXT versions are assigned
 * last and therefore take precedence.
 *
 * @param c         DSP context whose put_no_rnd_pixels_tab[1][1..2] entries
 *                  may be overwritten
 * @param cpu_flags CPU capability flags to test
 * @param flags     codec flags; only AV_CODEC_FLAG_BITEXACT is examined
 */
av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
{
    /* Only the exact variants live here; without BITEXACT there is nothing to do. */
    if (!(flags & AV_CODEC_FLAG_BITEXACT))
        return;

    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }
}

164
externals/ffmpeg/libavcodec/x86/huffyuvdsp.asm vendored Executable file
View File

@@ -0,0 +1,164 @@
;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
%include "libavcodec/x86/huffyuvdsp_template.asm"
;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;------------------------------------------------------------------------------
; Emits add_int16 for the currently selected instruction set.
; cglobal requests 4 arguments, 4 general-purpose registers and 5 vector
; registers. "tmp" only names a fifth register; it is not reserved here, and
; INT16_LOOP saves/restores it with push/pop around its scalar loop.
; For vectors wider than 8 bytes the aligned loop is used only when both src
; and dst are mmsize-aligned; otherwise control jumps to a second copy of the
; loop that uses unaligned moves. INT16_LOOP ends in RET, so the aligned copy
; never falls through into .unaligned.
%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
%if mmsize > 8
test srcq, mmsize-1
jnz .unaligned
test dstq, mmsize-1
jnz .unaligned
%endif
INT16_LOOP a, add
%if mmsize > 8
.unaligned:
INT16_LOOP u, add
%endif
%endmacro
; the MMX version is only built for 32-bit x86
%if ARCH_X86_32
INIT_MMX mmx
ADD_INT16
%endif
INIT_XMM sse2
ADD_INT16
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif
; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
; intptr_t w, uint8_t *left)
;
; Left prediction for 4-byte pixels: each output pixel is the byte-wise sum
; (paddb, wrapping) of its residual from src and the previous output pixel;
; *left supplies the pixel preceding the row and receives the last output
; pixel on return.
; w is converted to a byte count (shl by 2), dst/src are moved to the end of
; the row and wq is negated so the loop counts up towards zero.
; The loop always loads and stores whole vectors of mmsize bytes, including
; on the final iteration.
; LSHIFT is a macro from outside this view; presumably a whole-register left
; shift by the given number of bytes -- confirm in x86util.asm.
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
shl wq, 2
movd m0, [leftq]
lea dstq, [dstq + wq]
lea srcq, [srcq + wq]
LSHIFT m0, mmsize-4 ; place the left pixel in the highest dword
neg wq
.loop:
movu m1, [srcq+wq]
mova m2, m1
%if mmsize == 8
punpckhdq m0, m0 ; replicate the highest dword (previous pixel) into both lanes
%endif
; running sum inside the vector: add the vector shifted by one pixel ...
LSHIFT m1, 4
paddb m1, m2
%if mmsize == 16
pshufd m0, m0, q3333 ; replicate the highest dword (previous pixel) into all lanes
; ... and, for four pixels per vector, by two pixels as well
mova m2, m1
LSHIFT m1, 8
paddb m1, m2
%endif
paddb m0, m1 ; add the carried-in previous pixel to every lane
movu [dstq+wq], m0
add wq, mmsize
jl .loop
; dstq points just past the row, so dstq-4 is the last output pixel
movd m0, [dstq-4]
movd [leftq], m0
REP_RET
%endmacro
; the MMX version is only built for 32-bit x86
%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif
INIT_XMM sse2
LEFT_BGR32
; void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top,
;                                           const uint16_t *diff, unsigned mask,
;                                           int w, int *left, int *left_top)
;
; Median prediction on 16-bit samples. For every sample the predictor is the
; median of l (left), t (top) and (l + t - tl) & mask; the residual from diff
; is added and the sum is masked again.
; Four samples are loaded per iteration, but each output depends on the
; previous one, so the %rep body handles them one at a time in the low word
; and shifts the t, t-tl and residual vectors down by one word per step.
; Register roles inside the loop:
;   mm0 = t - tl     mm1 = t     mm2 = residuals     mm3 = l (running output)
;   mm6 = mask broadcast to all words     mm7 = assembled output vector
; w is doubled to a byte count, the pointers are moved to the end of the row
; and wq is negated so the loop counts up towards zero.
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
add wd, wd
movd mm6, maskd
SPLATW mm6, mm6
; first vector: tl for sample 0 comes from *left_top, the rest from top << 16
movq mm0, [topq]
movq mm2, mm0
movd mm4, [left_topq]
psllq mm2, 16
movq mm1, mm0
por mm4, mm2
movd mm3, [leftq]
psubw mm0, mm4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
; following vectors: mm1 has been shifted right three times by the %rep body,
; so its low word is the last top sample of the previous vector (tl for sample 0)
movq mm4, [topq+wq]
movq mm0, mm4
psllq mm4, 16
por mm4, mm1
movq mm1, mm0 ; t
psubw mm0, mm4 ; t-tl
.skip:
movq mm2, [diffq+wq]
%assign i 0
%rep 4
movq mm4, mm0
paddw mm4, mm3 ; t-tl+l
pand mm4, mm6
; median(l, t, gradient) = max(min(l, t), min(max(l, t), gradient))
movq mm5, mm3
pmaxsw mm3, mm1
pminsw mm5, mm1
pminsw mm3, mm4
pmaxsw mm3, mm5 ; median
paddw mm3, mm2 ; +residual
pand mm3, mm6
; collect the low word of mm3 into mm7, newest sample in the top word
%if i==0
movq mm7, mm3
psllq mm7, 48
%else
movq mm4, mm3
psrlq mm7, 16
psllq mm4, 48
por mm7, mm4
%endif
; move the next sample's inputs into the low word
%if i<3
psrlq mm0, 16
psrlq mm1, 16
psrlq mm2, 16
%endif
%assign i i+1
%endrep
movq [dstq+wq], mm7
add wq, 8
jl .loop
; r2 (the diff pointer) is no longer needed and is reused as scratch:
; write the last output and the last top sample back for the next call
movzx r2d, word [dstq-2]
mov [leftq], r2d
movzx r2d, word [topq-2]
mov [left_topq], r2d
RET

View File

@@ -0,0 +1,61 @@
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
/* Assembly implementations (huffyuvdsp.asm). */
void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_avx2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);

void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
                                     intptr_t w, uint8_t *left);
void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
                                      intptr_t w, uint8_t *left);

void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top,
                                          const uint16_t *diff, unsigned mask,
                                          int w, int *left, int *left_top);

/**
 * Select x86 implementations for the HuffYUV decoder DSP context.
 *
 * Assignments are ordered from the oldest to the newest instruction set, so
 * the last matching one wins. The MMXEXT median predictor is installed only
 * when the pixel format descriptor is known and its first component is
 * narrower than 16 bits.
 *
 * @param c       context to fill in
 * @param pix_fmt pixel format used to look up the component depth
 */
av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
{
    int cpu_flags = av_get_cpu_flags();
    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
    const int depth_below_16 = pix_desc && pix_desc->comp[0].depth < 16;

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->add_int16                = ff_add_int16_mmx;
        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && depth_below_16)
        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
        c->add_int16                = ff_add_int16_sse2;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags))
        c->add_int16 = ff_add_int16_avx2;
}

View File

@@ -0,0 +1,76 @@
;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; INT16_LOOP %1, %2
;   %1 = a/u : suffix selecting aligned (mova) or unaligned (movu) vector moves
;   %2 = add/sub : selects dst = (src + dst) & mask, or dst = (src1 - src2) & mask
;
; Expects the enclosing cglobal to provide dst, mask, w, tmp and either src
; (add) or src1/src2 (sub). w is a count of 16-bit elements and is doubled to
; a byte count here.
; Structure:
;   1. While the byte count is not a multiple of 2*mmsize, process single
;      words from the END of the buffers with scalar code. tmp is not
;      reserved by the callers' cglobal, so it is saved with push/pop.
;   2. Process the remaining multiple of 2*mmsize bytes from the start with
;      two vectors per iteration. Pointers are moved to the end of that
;      region and wq is negated so the loop counts up towards zero.
; The macro ends in RET, so it must be the last thing on its code path.
; SPLATW comes from outside this view; presumably it broadcasts the low word
; of its source to every word of the destination.
%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
movd xm4, maskd
SPLATW m4, xm4
add wd, wd
test wq, 2*mmsize - 1
jz %%.tomainloop
push tmpq
%%.wordloop:
; scalar tail: one 16-bit element per iteration, walking backwards
sub wq, 2
%ifidn %2, add
mov tmpw, [srcq+wq]
add tmpw, [dstq+wq]
%else
mov tmpw, [src1q+wq]
sub tmpw, [src2q+wq]
%endif
and tmpw, maskw
mov [dstq+wq], tmpw
test wq, 2*mmsize - 1
jnz %%.wordloop
pop tmpq
%%.tomainloop:
; wq is now a multiple of 2*mmsize (possibly zero)
%ifidn %2, add
add srcq, wq
%else
add src1q, wq
add src2q, wq
%endif
add dstq, wq
neg wq
jz %%.end ; nothing left for the vector loop
%%.loop:
%ifidn %2, add
mov%1 m0, [srcq+wq]
mov%1 m1, [dstq+wq]
mov%1 m2, [srcq+wq+mmsize]
mov%1 m3, [dstq+wq+mmsize]
%else
mov%1 m0, [src1q+wq]
mov%1 m1, [src2q+wq]
mov%1 m2, [src1q+wq+mmsize]
mov%1 m3, [src2q+wq+mmsize]
%endif
; expands to paddw or psubw, followed by the mask
p%2w m0, m1
p%2w m2, m3
pand m0, m4
pand m2, m4
mov%1 [dstq+wq] , m0
mov%1 [dstq+wq+mmsize], m2
add wq, 2*mmsize
jl %%.loop
%%.end:
RET
%endmacro

View File

@@ -0,0 +1,105 @@
;************************************************************************
;* SIMD-optimized HuffYUV encoding functions
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
%include "libavcodec/x86/huffyuvdsp_template.asm"
;------------------------------------------------------------------------------
; void ff_diff_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
;                    unsigned mask, int w);
;
; dst[i] = (src1[i] - src2[i]) & mask for w 16-bit elements.
; cglobal requests 5 arguments, 5 general-purpose registers and 5 vector
; registers. "tmp" only names a sixth register; it is not reserved here, and
; INT16_LOOP saves/restores it with push/pop around its scalar loop.
; For vectors wider than 8 bytes the aligned loop is used only when src1,
; src2 and dst are all mmsize-aligned; otherwise the unaligned copy runs.
; INT16_LOOP ends in RET, so the aligned copy never falls through into
; .unaligned.
;------------------------------------------------------------------------------
%macro DIFF_INT16 0
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
%if mmsize > 8
test src1q, mmsize-1
jnz .unaligned
test src2q, mmsize-1
jnz .unaligned
test dstq, mmsize-1
jnz .unaligned
%endif
INT16_LOOP a, sub
%if mmsize > 8
.unaligned:
INT16_LOOP u, sub
%endif
%endmacro
; the MMX version is only built for 32-bit x86
%if ARCH_X86_32
INIT_MMX mmx
DIFF_INT16
%endif
INIT_XMM sse2
DIFF_INT16
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DIFF_INT16
%endif
; void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1,
;                                           const uint16_t *src2, unsigned mask,
;                                           int w, int *left, int *left_top);
;
; Encoder-side median prediction on 16-bit samples: for every position the
; predictor is the median of l, t and (l + t - tl) & mask, and
; dst = (src2 - predictor) & mask. Here t comes from src1 and l from src2
; one sample earlier, so all four lanes of a vector are independent.
; Register roles inside the loop:
;   mm0 = tl (src1 shifted by one sample)   mm1 = t (src1)
;   mm2 = l  (src2 shifted by one sample)   mm3 = current src2 samples
;   mm7 = mask broadcast to all words
; After the mask has been broadcast, maskq is reused as the byte offset.
INIT_MMX mmxext
cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
add wd, wd ; element count -> byte count
movd mm7, maskd
SPLATW mm7, mm7
; first vector: build the "one sample earlier" views by shifting in
; *left_top and *left as sample -1
movq mm0, [src1q]
movq mm2, [src2q]
psllq mm0, 16
psllq mm2, 16
movd mm6, [left_topq]
por mm0, mm6
movd mm6, [leftq]
por mm2, mm6
xor maskq, maskq ; maskq becomes the running byte offset
.loop:
movq mm1, [src1q + maskq]
movq mm3, [src2q + maskq]
movq mm4, mm2
; mm2 = (l - tl + t) & mask
psubw mm2, mm0
paddw mm2, mm1
pand mm2, mm7
; median(l, t, gradient) = max(min(l, t), min(max(l, t), gradient))
movq mm5, mm4
pmaxsw mm4, mm1
pminsw mm1, mm5
pminsw mm4, mm2
pmaxsw mm4, mm1
psubw mm3, mm4
pand mm3, mm7
movq [dstq + maskq], mm3
add maskq, 8
; preload tl and l for the next vector as loads offset by one sample;
; on the last iteration these 8-byte loads extend past the w-th element
movq mm0, [src1q + maskq - 2]
movq mm2, [src2q + maskq - 2]
cmp maskq, wq
jb .loop
; hand the last src1/src2 samples back for the next call
movzx maskd, word [src1q + wq - 2]
mov [left_topq], maskd
movzx maskd, word [src2q + wq - 2]
mov [leftq], maskd
RET

View File

@@ -0,0 +1,60 @@
/*
* SIMD-optimized HuffYUV encoding functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvencdsp.h"
/* Assembly implementations (huffyuvencdsp.asm). */
void ff_diff_int16_mmx(uint16_t *dst, const uint16_t *src1,
                       const uint16_t *src2, unsigned mask, int w);
void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1,
                        const uint16_t *src2, unsigned mask, int w);
void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1,
                        const uint16_t *src2, unsigned mask, int w);
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1,
                                          const uint16_t *src2, unsigned mask,
                                          int w, int *left, int *left_top);

/**
 * Select x86 implementations for the HuffYUV encoder DSP context.
 *
 * Assignments are ordered from the oldest to the newest instruction set, so
 * the last matching one wins. The MMXEXT median predictor is installed only
 * when the pixel format descriptor is known and its first component is
 * narrower than 16 bits.
 *
 * @param c     context to fill in
 * @param avctx codec context; only pix_fmt is read
 */
av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
{
    av_unused int cpu_flags = av_get_cpu_flags();
    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
    const int depth_below_16 = pix_desc && pix_desc->comp[0].depth < 16;

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags))
        c->diff_int16 = ff_diff_int16_mmx;

    if (EXTERNAL_MMXEXT(cpu_flags) && depth_below_16)
        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;

    if (EXTERNAL_SSE2(cpu_flags))
        c->diff_int16 = ff_diff_int16_sse2;

    if (EXTERNAL_AVX2_FAST(cpu_flags))
        c->diff_int16 = ff_diff_int16_avx2;
}

183
externals/ffmpeg/libavcodec/x86/idctdsp.asm vendored Executable file
View File

@@ -0,0 +1,183 @@
;******************************************************************************
;* SIMD-optimized IDCT-related routines
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_80
SECTION .text
;--------------------------------------------------------------------------
;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
; ptrdiff_t line_size)
;--------------------------------------------------------------------------
; PUT_SIGNED_PIXELS_CLAMPED_HALF %1
;   %1 = byte offset into block (0 or 64 as used by the caller below)
; Converts 32 int16 coefficients (four rows of eight) to bytes with signed
; saturation (packsswb), adds the byte constant held in m0 and stores four
; 8-byte rows at pixels, pixels+lsize, pixels+2*lsize and pixels+lsize3.
; Expects m0 to be preloaded by the caller and lsize3q = 3*lsizeq.
; With mmsize == 8 a row spans two registers, so four register pairs are
; used; with mmsize == 16 each packed register holds two rows, which are
; written with movq (low half) and movhps (high half).
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
mova m1, [blockq+mmsize*0+%1]
mova m2, [blockq+mmsize*2+%1]
%if mmsize == 8
mova m3, [blockq+mmsize*4+%1]
mova m4, [blockq+mmsize*6+%1]
%endif
packsswb m1, [blockq+mmsize*1+%1]
packsswb m2, [blockq+mmsize*3+%1]
%if mmsize == 8
packsswb m3, [blockq+mmsize*5+%1]
packsswb m4, [blockq+mmsize*7+%1]
%endif
; bias the signed bytes by the constant in m0
paddb m1, m0
paddb m2, m0
%if mmsize == 8
paddb m3, m0
paddb m4, m0
movq [pixelsq+lsizeq*0], m1
movq [pixelsq+lsizeq*1], m2
movq [pixelsq+lsizeq*2], m3
movq [pixelsq+lsize3q ], m4
%else
movq [pixelsq+lsizeq*0], m1
movhps [pixelsq+lsizeq*1], m1
movq [pixelsq+lsizeq*2], m2
movhps [pixelsq+lsize3q ], m2
%endif
%endmacro
; PUT_SIGNED_PIXELS_CLAMPED %1
;   %1 = vector register count passed to cglobal
; Emits put_signed_pixels_clamped(block, pixels, line_size) for the current
; instruction set: an 8x8 block of int16 coefficients is saturated to signed
; bytes, offset by the pb_80 constant and stored as eight 8-byte rows.
; pb_80 is declared cextern; judging by its name it holds 0x80 in every
; byte -- confirm where it is defined.
; The work is split into two four-row halves (block offsets 0 and 64 bytes).
%macro PUT_SIGNED_PIXELS_CLAMPED 1
cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
mova m0, [pb_80]
lea lsize3q, [lsizeq*3]
PUT_SIGNED_PIXELS_CLAMPED_HALF 0
lea pixelsq, [pixelsq+lsizeq*4] ; move down four rows
PUT_SIGNED_PIXELS_CLAMPED_HALF 64
RET
%endmacro
INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3
;--------------------------------------------------------------------------
; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
; ptrdiff_t line_size);
;--------------------------------------------------------------------------
; PUT_PIXELS_CLAMPED_HALF %1
; %1 = block offset
; Converts 32 int16 coefficients (four rows of eight) starting at block+%1
; to bytes with unsigned saturation (packuswb) and stores four 8-byte rows
; at pixels, pixels+lsize, pixels+2*lsize and pixels+lsize3.
; Expects lsize3q = 3*lsizeq.
; With mmsize == 8 a row spans two registers; with mmsize == 16 each packed
; register holds two rows, written with movq (low) and movhps (high).
%macro PUT_PIXELS_CLAMPED_HALF 1
mova m0, [blockq+mmsize*0+%1]
mova m1, [blockq+mmsize*2+%1]
%if mmsize == 8
mova m2, [blockq+mmsize*4+%1]
mova m3, [blockq+mmsize*6+%1]
%endif
packuswb m0, [blockq+mmsize*1+%1]
packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
packuswb m2, [blockq+mmsize*5+%1]
packuswb m3, [blockq+mmsize*7+%1]
movq [pixelsq], m0
movq [lsizeq+pixelsq], m1
movq [2*lsizeq+pixelsq], m2
movq [lsize3q+pixelsq], m3
%else
movq [pixelsq], m0
movhps [lsizeq+pixelsq], m0
movq [2*lsizeq+pixelsq], m1
movhps [lsize3q+pixelsq], m1
%endif
%endmacro
; Emits put_pixels_clamped(block, pixels, line_size) for the current
; instruction set: an 8x8 block of int16 coefficients is saturated to
; unsigned bytes and stored as eight 8-byte rows, in two four-row halves
; (block offsets 0 and 64 bytes).
%macro PUT_PIXELS_CLAMPED 0
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
lea lsize3q, [lsizeq*3]
PUT_PIXELS_CLAMPED_HALF 0
lea pixelsq, [pixelsq+lsizeq*4] ; move down four rows
PUT_PIXELS_CLAMPED_HALF 64
RET
%endmacro
INIT_MMX mmx
PUT_PIXELS_CLAMPED
INIT_XMM sse2
PUT_PIXELS_CLAMPED
;--------------------------------------------------------------------------
; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
; ptrdiff_t line_size);
;--------------------------------------------------------------------------
; ADD_PIXELS_CLAMPED %1   (one-parameter helper)
; %1 = block offset
; Adds two rows of int16 coefficients at block+%1 to the two 8-pixel rows at
; pixels and pixels+lsize, and writes the rows back as unsigned-saturated
; bytes. Existing pixels are widened to words by interleaving with m4, which
; the caller must have zeroed; the addition uses signed saturation (paddsw)
; and the narrowing uses unsigned saturation (packuswb).
; With mmsize == 8 each row needs a low and a high half (m7 is the scratch
; copy for the high half); with mmsize == 16 one register holds a full row.
%macro ADD_PIXELS_CLAMPED 1
mova m0, [blockq+mmsize*0+%1]
mova m1, [blockq+mmsize*1+%1]
%if mmsize == 8
mova m5, [blockq+mmsize*2+%1]
mova m6, [blockq+mmsize*3+%1]
%endif
movq m2, [pixelsq]
movq m3, [pixelsq+lsizeq]
%if mmsize == 8
; row 0: low four pixels into m2, high four into m7
mova m7, m2
punpcklbw m2, m4
punpckhbw m7, m4
paddsw m0, m2
paddsw m1, m7
; row 1: same split, accumulated into m5/m6
mova m7, m3
punpcklbw m3, m4
punpckhbw m7, m4
paddsw m5, m3
paddsw m6, m7
%else
punpcklbw m2, m4
punpcklbw m3, m4
paddsw m0, m2
paddsw m1, m3
%endif
packuswb m0, m1
%if mmsize == 8
packuswb m5, m6
movq [pixelsq], m0
movq [pixelsq+lsizeq], m5
%else
; m0 holds row 0 in its low half and row 1 in its high half
movq [pixelsq], m0
movhps [pixelsq+lsizeq], m0
%endif
%endmacro
; ADD_PIXELS_CLAMPED   (zero-parameter form)
; Emits add_pixels_clamped(block, pixels, line_size) for the current
; instruction set. This macro shares its name with the one-parameter
; ADD_PIXELS_CLAMPED helper; NASM tells the two apart by parameter count, so
; the invocations with an argument below expand the helper, not this macro.
; m4 is cleared once and serves as the zero register for the helper.
; Each helper invocation handles two rows (32 bytes of coefficients), so
; four invocations cover the 8x8 block.
%macro ADD_PIXELS_CLAMPED 0
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
pxor m4, m4
ADD_PIXELS_CLAMPED 0
lea pixelsq, [pixelsq+lsizeq*2]
ADD_PIXELS_CLAMPED 32
lea pixelsq, [pixelsq+lsizeq*2]
ADD_PIXELS_CLAMPED 64
lea pixelsq, [pixelsq+lsizeq*2]
ADD_PIXELS_CLAMPED 96
RET
%endmacro
INIT_MMX mmx
ADD_PIXELS_CLAMPED
INIT_XMM sse2
ADD_PIXELS_CLAMPED

39
externals/ffmpeg/libavcodec/x86/idctdsp.h vendored Executable file
View File

@@ -0,0 +1,39 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_IDCTDSP_H
#define AVCODEC_X86_IDCTDSP_H
#include <stddef.h>
#include <stdint.h>
/*
 * Prototypes for the assembly routines that transfer an 8x8 block of int16
 * coefficients to 8-bit pixels. Every routine exists in an MMX and an SSE2
 * variant; line_size is the distance in bytes between pixel rows.
 */
/* Add the coefficients to the existing pixels and store the saturated sum. */
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
/* Store the coefficients saturated to unsigned bytes. */
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
/* Store the coefficients saturated to signed bytes plus a constant byte bias. */
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
#endif /* AVCODEC_X86_IDCTDSP_H */

162
externals/ffmpeg/libavcodec/x86/idctdsp_init.c vendored Executable file
View File

@@ -0,0 +1,162 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idctdsp.h"
#include "simple_idct.h"
/* Input permutation for the simple_idct_mmx */
/* Copied verbatim into idct_permutation[] for FF_IDCT_PERM_SIMPLE. */
static const uint8_t simple_mmx_permutation[64] = {
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/*
 * Permutation of the low three index bits used for FF_IDCT_PERM_SSE2;
 * bits 3..5 of the index are kept unchanged by the user of this table.
 */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/**
 * Fill idct_permutation[] for the x86-specific permutation types.
 *
 * @param idct_permutation output table of 64 entries
 * @param perm_type        requested permutation type
 * @return 1 if perm_type was handled here (FF_IDCT_PERM_SIMPLE or
 *         FF_IDCT_PERM_SSE2), 0 otherwise; the table is left untouched
 *         in the latter case
 */
av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
                                              enum idct_permutation_type perm_type)
{
    int idx;

    if (perm_type == FF_IDCT_PERM_SIMPLE) {
        for (idx = 0; idx < 64; idx++)
            idct_permutation[idx] = simple_mmx_permutation[idx];
        return 1;
    }

    if (perm_type == FF_IDCT_PERM_SSE2) {
        for (idx = 0; idx < 64; idx++) {
            /* keep bits 3..5 of the index, permute bits 0..2 through the table */
            idct_permutation[idx] = (idx & 0x38) | idct_sse2_row_perm[idx & 7];
        }
        return 1;
    }

    return 0;
}
/**
 * Select x86 implementations for the IDCT DSP context.
 *
 * Blocks are ordered so that later (more capable) instruction sets override
 * earlier assignments:
 *   1. MMX:  clamped pixel helpers, plus simple_idct for 8-bit full-resolution
 *            decoding when the algorithm is AUTO, SIMPLEAUTO or SIMPLEMMX.
 *   2. SSE2: clamped pixel helpers, SSE2 idct_put/idct_add under the same
 *            condition (c->idct is left as set by earlier code), and on
 *            x86-64 the simple_idct8 functions (also for FF_IDCT_SIMPLE).
 *   3. x86-64, full resolution only: AVX simple_idct8, then the 10-bit and
 *            12-bit simple IDCTs (SSE2 first, AVX overriding). The high-depth
 *            variants provide no idct_add, which is set to NULL.
 *
 * ARCH_* and EXTERNAL_* tests are kept directly in the if conditions so the
 * compiler can discard references to functions that are not built.
 *
 * @param c              context to fill in
 * @param avctx          codec context; idct_algo, lowres,
 *                       bits_per_raw_sample and codec_id are read
 * @param high_bit_depth non-zero disables all 8-bit IDCT selections
 */
av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    const int algo      = avctx->idct_algo;
    const int full_res  = avctx->lowres == 0;
    const int algo_auto = algo == FF_IDCT_AUTO || algo == FF_IDCT_SIMPLEAUTO;
    /* 8-bit simple IDCT in its MMX coefficient order */
    const int use_simple_mmx = !high_bit_depth && full_res &&
                               (algo_auto || algo == FF_IDCT_SIMPLEMMX);
    /* 8-bit simple IDCT with transposed coefficients (x86-64 only) */
    const int use_simple8    = !high_bit_depth && full_res &&
                               (algo_auto || algo == FF_IDCT_SIMPLEMMX ||
                                algo == FF_IDCT_SIMPLE);

    if (EXTERNAL_MMX(cpu_flags)) {
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

        if (use_simple_mmx) {
            c->idct_put  = ff_simple_idct_put_mmx;
            c->idct_add  = ff_simple_idct_add_mmx;
            c->idct      = ff_simple_idct_mmx;
            c->perm_type = FF_IDCT_PERM_SIMPLE;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;

        if (use_simple_mmx) {
            c->idct_put  = ff_simple_idct_put_sse2;
            c->idct_add  = ff_simple_idct_add_sse2;
            c->perm_type = FF_IDCT_PERM_SIMPLE;
        }

        if (ARCH_X86_64 && use_simple8) {
            c->idct      = ff_simple_idct8_sse2;
            c->idct_put  = ff_simple_idct8_put_sse2;
            c->idct_add  = ff_simple_idct8_add_sse2;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }
    }

    if (ARCH_X86_64 && full_res) {
        if (EXTERNAL_AVX(cpu_flags) && use_simple8) {
            c->idct      = ff_simple_idct8_avx;
            c->idct_put  = ff_simple_idct8_put_avx;
            c->idct_add  = ff_simple_idct8_add_avx;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }

        if (avctx->bits_per_raw_sample == 10 &&
            avctx->codec_id != AV_CODEC_ID_MPEG4 &&
            (algo_auto || algo == FF_IDCT_SIMPLE)) {
            if (EXTERNAL_SSE2(cpu_flags)) {
                c->idct_put  = ff_simple_idct10_put_sse2;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct10_sse2;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
            if (EXTERNAL_AVX(cpu_flags)) {
                c->idct_put  = ff_simple_idct10_put_avx;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct10_avx;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
        }

        if (avctx->bits_per_raw_sample == 12 &&
            (algo == FF_IDCT_AUTO || algo == FF_IDCT_SIMPLEMMX)) {
            if (EXTERNAL_SSE2(cpu_flags)) {
                c->idct_put  = ff_simple_idct12_put_sse2;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct12_sse2;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
            if (EXTERNAL_AVX(cpu_flags)) {
                c->idct_put  = ff_simple_idct12_put_avx;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct12_avx;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
        }
    }
}

741
externals/ffmpeg/libavcodec/x86/imdct36.asm vendored Executable file
View File

@@ -0,0 +1,741 @@
;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Lane masks for andps: ~0 keeps a 32-bit lane, 0 clears it.
ps_mask: dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0, 0, ~0
ps_mask3: dd 0, 0, 0, ~0
ps_mask4: dd 0, ~0, 0, 0
; Per-lane multipliers applied with mulps in the "Populate tmp[]" stage of
; imdct36_float.
ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
; Sign-bit masks for xorps: 0x80000000 flips the sign of a lane, 0 leaves the
; lane unchanged.
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
; Multipliers for BUTTERF/BUTTERF2 on pre-SSE3 targets: five 16-byte rows,
; addressed as ps_cosh + 0/16/32/48/64.
ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
dd 1.0, 0.70710678118654752439, 0.0, 0.0
; Same rows with the odd lanes (1 and 3) negated. The SSE3+ path of
; BUTTERF/BUTTERF2 uses this table because it finishes with addsubps instead
; of xorps + addps.
ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
dd 1.0, -0.70710678118654752439, 0.0, 0.0
; Constants for four_imdct36_float. Each value is replicated into all four
; lanes (one 16-byte row per constant), so row k is addressed as
; costabs + 16*k, with k = 0 for the first row below.
costabs: times 4 dd 0.98480773
times 4 dd 0.93969262
times 4 dd 0.86602539
times 4 dd -0.76604444
times 4 dd -0.64278764
times 4 dd 0.50000000
times 4 dd -0.50000000
times 4 dd -0.34202015
times 4 dd -0.17364818
times 4 dd 0.50190992
times 4 dd 0.51763808
times 4 dd 0.55168896
times 4 dd 0.61038726
times 4 dd 0.70710677
times 4 dd 0.87172341
times 4 dd 1.18310082
times 4 dd 1.93185163
times 4 dd 5.73685646
; Multiplier used in imdct36_float's output addressing (outq + N*SBLIMIT).
%define SBLIMIT 32
SECTION .text
; PSHUFD dst, src, imm8
; Writes to dst the four dwords of src permuted according to imm8.
; The integer pshufd form is used only on SSE2..SSE4 targets; plain SSE (no
; pshufd available) and AVX targets use shufps with src as both sources.
%macro PSHUFD 3
%if notcpuflag(sse2) || cpuflag(avx)
    shufps  %1, %2, %2, %3
%else
    pshufd  %1, %2, %3
%endif
%endmacro
; BUILDINVHIGHLOW dst, x, y
;   x = %2 = {x1,x2,x3,x4}, y = %3 = {y1,y2,y3,y4}
;   dst = %1 = {x3,x4,y1,y2}
; AVX does this with one shufps; older targets assemble it from two 64-bit
; half moves.
%macro BUILDINVHIGHLOW 3
%if notcpuflag(avx)
    movlhps %1, %3                  ; dst.high = y.low  -> {?,?,y1,y2}
    movhlps %1, %2                  ; dst.low  = x.high -> {x3,x4,y1,y2}
%else
    shufps  %1, %2, %3, 0x4e
%endif
%endmacro
; ROTLEFT dst, x, y
;   x = %2 = {x1,x2,x3,x4}, y = %3 = {y1,y2,y3,y4}
;   dst = %1 = {x4,y1,y2,y3}
; SSSE3 and later use a single palignr; older targets build {x3,x4,y1,y2}
; first and then shuffle it against y.
%macro ROTLEFT 3
%if notcpuflag(ssse3)
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%else
    palignr %1, %3, %2, 12
%endif
%endmacro
; INVERTHL dst, src
; dst = src with its low and high 64-bit halves exchanged:
;   {s0,s1,s2,s3} -> {s2,s3,s0,s1}
%macro INVERTHL 2
%if notcpuflag(sse2)
    movhlps %1, %2                  ; dst.low  = src.high
    movlhps %1, %2                  ; dst.high = src.low
%else
    PSHUFD  %1, %2, 0x4e
%endif
%endmacro
; BUTTERF data, scratch, table_offset
;   %1  in/out register {a0,a1,a2,a3}
;   %2  scratch register (clobbered)
;   %3  byte offset of the 16-byte row to use in ps_cosh / ps_cosh_sse3
; Step 1: %1 = {a0+a2, a1+a3, a0-a2, a1-a3}
; Step 2: scale by the table row, then combine each adjacent lane pair
;         {m0,m1,m2,m3} into {m0+m1, m0-m1, m2+m3, m2-m3}.
; The SSE3 path gets the same result from addsubps because ps_cosh_sse3
; stores the odd-lane multipliers with inverted sign.
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if notcpuflag(sse3)
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%else
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%endif
%endmacro
; BUTTERF2 data, scratch, table_offset
;   %1  in/out register, %2 scratch (clobbered), %3 byte offset into
;       ps_cosh / ps_cosh_sse3
; Scales %1 by the table row, then adds to / subtracts from it a copy whose
; lanes 0 and 1 are exchanged (shuffle 0xe1 leaves lanes 2 and 3 in place).
; Unlike BUTTERF there is no initial half-swap stage.
%macro BUTTERF2 3
%if notcpuflag(sse3)
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xe1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%else
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xe1
    addsubps %1, %1, %2
%endif
%endmacro
; STORE src, scratch, base, stride
; Scatters the four floats of %1 to [%3], [%3 + %4], [%3 + 2*%4] and
; [%3 + 3*%4] (lane k goes to base + k*stride).
; The pre-SSE4 path clobbers both %1 (lane swap) and %2; the SSE4 path
; leaves both untouched.
%macro STORE 4
%if notcpuflag(sse4)
    movhlps   %2, %1                ; scratch.low = {lane2, lane3}
    movss     [%3       ], %1       ; lane 0
    movss     [%3 + 2*%4], %2       ; lane 2
    shufps    %1, %1, 0xb1          ; swap lanes inside each pair
    movss     [%3 +   %4], %1       ; lane 1
    movhlps   %2, %1
    movss     [%3 + 3*%4], %2       ; lane 3
%else
    movss     [%3       ], %1
    extractps dword [%3 +   %4], %1, 1
    extractps dword [%3 + 2*%4], %1, 2
    extractps dword [%3 + 3*%4], %1, 3
%endif
%endmacro
; LOAD dst, scratch, base, stride
; Gathers one float from each of [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4]
; into the four lanes of %1; %2 is clobbered.
; Each access is a 64-bit movlps/movhps, so 8 bytes are read at every address
; and only the first float of each pair is kept by the final shufps (0x88
; selects elements 0 and 2 of each source).
%macro LOAD 4
movlps %1, [%3 ]
movhps %1, [%3 + %4]
movlps %2, [%3 + 2*%4]
movhps %2, [%3 + 3*%4]
shufps %1, %2, 0x88
%endmacro
; LOADA64 dst, addr
; Loads 16 bytes from addr into dst. Non-AVX targets use two 64-bit loads
; (movlps + movhps); AVX targets use a single unaligned 128-bit load.
%macro LOADA64 2
%if notcpuflag(avx)
    movlps  %1, [%2]
    movhps  %1, [%2 + 8]
%else
    movu    %1, [%2]
%endif
%endmacro
; DEFINE_IMDCT
; Emits imdct36_float for the instruction set selected by the preceding
; INIT_XMM. Arguments (named by cglobal): out, buf, in, win.
;   in   : 18 input floats are read (in[0..17]); never written here
;   win  : window coefficients, read only
;   buf  : read and added into the first group of outputs, then overwritten
;          with the second group of windowed values
;   out  : results are scattered with a stride of SBLIMIT floats
; cglobal is invoked with 4 arguments, 4 general registers and 9 vector
; registers; the ninth (m8) is only referenced on x86-64.
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win
; for(i=17;i>=1;i--) in[i] += in[i-1];
; m0..m3 hold in[0..15], m4 holds in[16..17] in its low half (high half zero).
LOADA64 m0, inq
LOADA64 m1, inq + 16
ROTLEFT m5, m0, m1
PSHUFD m6, m0, 0x93
andps m6, m6, [ps_mask]
addps m0, m0, m6
LOADA64 m2, inq + 32
ROTLEFT m7, m1, m2
addps m1, m1, m5
LOADA64 m3, inq + 48
ROTLEFT m5, m2, m3
xorps m4, m4, m4
movlps m4, [inq+64]
BUILDINVHIGHLOW m6, m3, m4
shufps m6, m6, m4, 0xa9
addps m4, m4, m6
addps m2, m2, m7
addps m3, m3, m5
; for(i=17;i>=3;i-=2) in[i] += in[i-2];
; Only odd lanes are updated, hence the ps_mask2/3/4 lane masks.
movlhps m5, m5, m0
andps m5, m5, [ps_mask3]
BUILDINVHIGHLOW m7, m0, m1
andps m7, m7, [ps_mask2]
addps m0, m0, m5
BUILDINVHIGHLOW m6, m1, m2
andps m6, m6, [ps_mask2]
addps m1, m1, m7
BUILDINVHIGHLOW m7, m2, m3
andps m7, m7, [ps_mask2]
addps m2, m2, m6
movhlps m6, m6, m3
andps m6, m6, [ps_mask4]
addps m3, m3, m7
addps m4, m4, m6
; Populate tmp[]
movlhps m6, m1, m5 ; zero out high values
subps m6, m6, m4
subps m5, m0, m3
; x86-64 parks the difference m0 - m3 in the extra register m8 (SWAP only
; renames registers); x86-32 has no m8 and recomputes the difference below.
%if ARCH_X86_64
SWAP m5, m8
%endif
mulps m7, m2, [ps_val1]
%if ARCH_X86_64
mulps m5, m8, [ps_val2]
%else
mulps m5, m5, [ps_val2]
%endif
addps m7, m7, m5
mulps m5, m6, [ps_val1]
subps m7, m7, m5
%if ARCH_X86_64
SWAP m5, m8
%else
subps m5, m0, m3
%endif
subps m5, m5, m6
addps m5, m5, m2
shufps m6, m4, m3, 0xe4
subps m6, m6, m2
mulps m6, m6, [ps_val3]
addps m4, m4, m1
mulps m4, m4, [ps_val4]
shufps m1, m1, m0, 0xe4
addps m1, m1, m2
mulps m1, m1, [ps_val5]
mulps m3, m3, [ps_val6]
mulps m0, m0, [ps_val7]
addps m0, m0, m3
xorps m2, m1, [ps_p1p1m1m1]
subps m2, m2, m4
addps m2, m2, m0
addps m3, m4, m0
subps m3, m3, m6
xorps m3, m3, [ps_p1p1m1m1]
shufps m0, m0, m4, 0xe4
subps m0, m0, m1
addps m0, m0, m6
BUILDINVHIGHLOW m4, m2, m3
shufps m3, m3, m2, 0x4e
; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
; Butterfly stage: the third argument is the byte offset of the row used in
; ps_cosh / ps_cosh_sse3; the second argument is a scratch register.
BUTTERF m0, m1, 0
BUTTERF m7, m2, 16
BUTTERF m3, m6, 32
BUTTERF m4, m1, 48
BUTTERF2 m5, m1, 64
; permutation:
; m0 0 1 2 3 => 2 6 10 14 m1
; m7 4 5 6 7 => 3 7 11 15 m2
; m3 8 9 10 11 => 17 13 9 5 m3
; m4 12 13 14 15 => 16 12 8 4 m5
; m5 16 17 xx xx => 0 1 xx xx m0
unpckhps m1, m0, m7
unpckhps m6, m3, m4
movhlps m2, m6, m1
movlhps m1, m1, m6
unpcklps m5, m5, m4
unpcklps m3, m3, m7
movhlps m4, m3, m5
movlhps m5, m5, m3
SWAP m4, m3
; permutation done
; First output group: each result is (value * win) + buf, scattered to out
; with a stride of 4*SBLIMIT bytes (SBLIMIT floats). buf is gathered with a
; stride of 16 bytes by LOAD.
PSHUFD m6, m2, 0xb1
movss m4, [bufq + 4*68]
movss m7, [bufq + 4*64]
unpcklps m7, m7, m4
mulps m6, m6, [winq + 16*4]
addps m6, m6, m7
movss [outq + 64*SBLIMIT], m6
shufps m6, m6, m6, 0xb1
movss [outq + 68*SBLIMIT], m6
mulps m6, m3, [winq + 4*4]
LOAD m4, m7, bufq + 4*16, 16
addps m6, m6, m4
STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
shufps m4, m0, m3, 0xb5
mulps m4, m4, [winq + 8*4]
LOAD m7, m6, bufq + 4*32, 16
addps m4, m4, m7
STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
shufps m3, m3, m2, 0xb1
mulps m3, m3, [winq + 12*4]
LOAD m7, m6, bufq + 4*48, 16
addps m3, m3, m7
STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
mulps m2, m2, [winq]
LOAD m6, m7, bufq, 16
addps m2, m2, m6
STORE m2, m7, outq, 4*SBLIMIT
; Second group: the same buf slots that were just read are overwritten with
; value * win[20..], scattered at a 16-byte stride.
mulps m4, m1, [winq + 20*4]
STORE m4, m7, bufq, 16
mulps m3, m5, [winq + 24*4]
STORE m3, m7, bufq + 4*16, 16
shufps m0, m0, m5, 0xb0
mulps m0, m0, [winq + 28*4]
STORE m0, m7, bufq + 4*32, 16
shufps m5, m5, m1, 0xb1
mulps m5, m5, [winq + 32*4]
STORE m5, m7, bufq + 4*48, 16
shufps m1, m1, m1, 0xb1
mulps m1, m1, [winq + 36*4]
movss [bufq + 4*64], m1
shufps m1, m1, 0xb1
movss [bufq + 4*68], m1
RET
%endmacro
; Instantiate imdct36_float once per instruction set. The plain-SSE build is
; emitted only when ARCH_X86_32 is set, and the AVX build only when
; HAVE_AVX_EXTERNAL is set; SSE2, SSE3 and SSSE3 builds are always emitted.
%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
%endif
INIT_XMM sse2
DEFINE_IMDCT
INIT_XMM sse3
DEFINE_IMDCT
INIT_XMM ssse3
DEFINE_IMDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif
; Spill helpers for four_imdct36_float, which needs more live values than the
; eight vector registers available on x86-32.
;   SPILL   n, slot : save register m<n> into spill slot <slot> (8..15 in use)
;   UNSPILL n, slot : bring spill slot <slot> back into m<n>
;   SPILLED(slot)   : operand that names the saved value
; x86-64: slots are the registers m8..m15, so SPILL/UNSPILL are SWAPs
;         (register renaming, no memory access).
; x86-32: slot x is the 16-byte location tmp + (x-8)*16 + 32*4, i.e. it starts
;         128 bytes into tmp, above the tmpq+4*0 .. tmpq+4*28 scratch rows the
;         function addresses directly.
INIT_XMM sse
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
movaps m%1, SPILLED(%2)
%endmacro
%endif
; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float for the instruction set selected by the preceding
; INIT_XMM. Arguments (named by cglobal): out, buf, in, win, tmp.
; Four input blocks are processed at once: block k starts at in + k*72 bytes
; (18 floats apart) and, after the transposes below, lane k of every vector
; belongs to block k.
;   in   : read only
;   tmp  : scratch; rows tmpq+4*0 .. tmpq+4*28 are used directly and, on
;          x86-32, the SPILL slots live above them (see SPILLED)
;   win  : window coefficients, read only
;   buf  : each 16-byte row is read, added into an output row, then
;          overwritten with a new windowed value
;   out  : 16-byte rows written at multiples of 128 bytes (4*SBLIMIT)
; mova is used for the in/tmp/win/buf/out accesses except blocks 1 and 3 of
; the input, so those pointers are assumed 16-byte aligned -- TODO confirm
; against the callers. Blocks 1 and 3 sit at +72 and +216 bytes, which are
; not multiples of 16, hence movu for them.
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
; in[16] and in[17] of the four blocks: m0 = in[16], m5 = in[17] (per block).
movlps m0, [inq+64]
movhps m0, [inq+64 + 72]
movlps m3, [inq+64 + 2*72]
movhps m3, [inq+64 + 3*72]
shufps m5, m0, m3, 0xdd
shufps m0, m0, m3, 0x88
; in[12..15] of the four blocks, transposed so each register holds one
; coefficient index across the blocks (last TRANSPOSE4x4PS argument is
; presumably a scratch register number -- defined in x86util.asm).
mova m1, [inq+48]
movu m6, [inq+48 + 72]
mova m7, [inq+48 + 2*72]
movu m3, [inq+48 + 3*72]
TRANSPOSE4x4PS 1, 6, 7, 3, 4
addps m4, m6, m7
mova [tmpq+4*28], m4
addps m7, m3
addps m6, m1
addps m3, m0
addps m0, m5
addps m0, m7
addps m7, m6
mova [tmpq+4*12], m7
SPILL 3, 12
; in[8..11] of the four blocks.
mova m4, [inq+32]
movu m5, [inq+32 + 72]
mova m2, [inq+32 + 2*72]
movu m7, [inq+32 + 3*72]
TRANSPOSE4x4PS 4, 5, 2, 7, 3
addps m1, m7
SPILL 1, 11
addps m3, m5, m2
SPILL 3, 13
addps m7, m2
addps m5, m4
addps m6, m7
mova [tmpq], m6
addps m7, m5
mova [tmpq+4*16], m7
; in[4..7] of the four blocks.
mova m2, [inq+16]
movu m7, [inq+16 + 72]
mova m1, [inq+16 + 2*72]
movu m6, [inq+16 + 3*72]
TRANSPOSE4x4PS 2, 7, 1, 6, 3
addps m4, m6
addps m6, m1
addps m1, m7
addps m7, m2
addps m5, m6
SPILL 5, 15
addps m6, m7
mulps m6, [costabs + 16*2]
mova [tmpq+4*8], m6
SPILL 1, 10
SPILL 0, 14
; in[0..3] of the four blocks.
mova m1, [inq]
movu m6, [inq + 72]
mova m3, [inq + 2*72]
movu m5, [inq + 3*72]
TRANSPOSE4x4PS 1, 6, 3, 5, 0
addps m2, m5
addps m5, m3
addps m7, m5
addps m3, m6
addps m6, m1
SPILL 7, 8
addps m5, m6
SPILL 6, 9
; Intermediate terms: combinations of the sums above scaled by costabs rows.
; Results are parked in tmp rows and spill slots for the output stage.
addps m6, m4, SPILLED(12)
subps m6, m2
UNSPILL 7, 11
SPILL 5, 11
subps m5, m1, m7
mulps m7, [costabs + 16*5]
addps m7, m1
mulps m0, m6, [costabs + 16*6]
addps m0, m5
mova [tmpq+4*24], m0
addps m6, m5
mova [tmpq+4*4], m6
addps m6, m4, m2
mulps m6, [costabs + 16*1]
subps m4, SPILLED(12)
mulps m4, [costabs + 16*8]
addps m2, SPILLED(12)
mulps m2, [costabs + 16*3]
subps m5, m7, m6
subps m5, m2
addps m6, m7
addps m6, m4
addps m7, m2
subps m7, m4
mova [tmpq+4*20], m7
mova m2, [tmpq+4*28]
mova [tmpq+4*28], m5
UNSPILL 7, 13
subps m5, m7, m2
mulps m5, [costabs + 16*7]
UNSPILL 1, 10
mulps m1, [costabs + 16*2]
addps m4, m3, m2
mulps m4, [costabs + 16*4]
addps m2, m7
addps m7, m3
mulps m7, [costabs]
subps m3, m2
mulps m3, [costabs + 16*2]
addps m2, m7, m5
addps m2, m1
SPILL 2, 10
addps m7, m4
subps m7, m1
SPILL 7, 12
subps m5, m4
subps m5, m1
UNSPILL 0, 14
SPILL 5, 13
addps m1, m0, SPILLED(15)
subps m1, SPILLED(8)
mova m4, [costabs + 16*5]
mulps m4, [tmpq]
UNSPILL 2, 9
addps m4, m2
subps m2, [tmpq]
mulps m5, m1, [costabs + 16*6]
addps m5, m2
SPILL 5, 9
addps m2, m1
SPILL 2, 14
UNSPILL 5, 15
subps m7, m5, m0
addps m5, SPILLED(8)
mulps m5, [costabs + 16*1]
mulps m7, [costabs + 16*8]
addps m0, SPILLED(8)
mulps m0, [costabs + 16*3]
subps m2, m4, m5
subps m2, m0
SPILL 2, 15
addps m5, m4
addps m5, m7
addps m4, m0
subps m4, m7
SPILL 4, 8
mova m7, [tmpq+4*16]
mova m2, [tmpq+4*12]
addps m0, m7, m2
subps m0, SPILLED(11)
mulps m0, [costabs + 16*2]
addps m4, m7, SPILLED(11)
mulps m4, [costabs]
subps m7, m2
mulps m7, [costabs + 16*7]
addps m2, SPILLED(11)
mulps m2, [costabs + 16*4]
addps m1, m7, [tmpq+4*8]
addps m1, m4
addps m4, m2
subps m4, [tmpq+4*8]
SPILL 4, 11
subps m7, m2
subps m7, [tmpq+4*8]
; Output stage. Each group below forms a sum and a difference of two
; intermediate terms (one of them scaled by a costabs row 9..17), then:
;   out row  = difference * win + buf   (two rows per pair)
;   buf row  = sum * win                (the same two buf rows, overwritten)
; The out offsets are multiples of 128 bytes, i.e. of 4*SBLIMIT.
; Out offsets 1152/1024 and 2176/0:
addps m4, m6, SPILLED(10)
subps m6, SPILLED(10)
addps m2, m5, m1
mulps m2, [costabs + 16*9]
subps m5, m1
mulps m5, [costabs + 16*17]
subps m1, m4, m2
addps m4, m2
mulps m2, m1, [winq+4*36]
addps m2, [bufq+4*36]
mova [outq+1152], m2
mulps m1, [winq+4*32]
addps m1, [bufq+4*32]
mova [outq+1024], m1
mulps m1, m4, [winq+4*116]
mova [bufq+4*36], m1
mulps m4, [winq+4*112]
mova [bufq+4*32], m4
addps m2, m6, m5
subps m6, m5
mulps m1, m6, [winq+4*68]
addps m1, [bufq+4*68]
mova [outq+2176], m1
mulps m6, [winq]
addps m6, [bufq]
mova [outq], m6
mulps m1, m2, [winq+4*148]
mova [bufq+4*68], m1
mulps m2, [winq+4*80]
mova [bufq], m2
; Out offsets 1280/896 and 2048/128:
addps m5, m3, [tmpq+4*24]
mova m2, [tmpq+4*24]
subps m2, m3
mova m1, SPILLED(9)
subps m1, m0
mulps m1, [costabs + 16*10]
addps m0, SPILLED(9)
mulps m0, [costabs + 16*16]
addps m6, m5, m1
subps m5, m1
mulps m3, m5, [winq+4*40]
addps m3, [bufq+4*40]
mova [outq+1280], m3
mulps m5, [winq+4*28]
addps m5, [bufq+4*28]
mova [outq+896], m5
mulps m1, m6, [winq+4*120]
mova [bufq+4*40], m1
mulps m6, [winq+4*108]
mova [bufq+4*28], m6
addps m1, m2, m0
subps m2, m0
mulps m5, m2, [winq+4*64]
addps m5, [bufq+4*64]
mova [outq+2048], m5
mulps m2, [winq+4*4]
addps m2, [bufq+4*4]
mova [outq+128], m2
mulps m0, m1, [winq+4*144]
mova [bufq+4*64], m0
mulps m1, [winq+4*84]
mova [bufq+4*4], m1
; Out offsets 1408/768 and 1920/256:
mova m1, [tmpq+4*28]
mova m5, m1
addps m1, SPILLED(13)
subps m5, SPILLED(13)
UNSPILL 3, 15
addps m2, m7, m3
mulps m2, [costabs + 16*11]
subps m3, m7
mulps m3, [costabs + 16*15]
addps m0, m2, m1
subps m1, m2
SWAP m0, m2
mulps m6, m1, [winq+4*44]
addps m6, [bufq+4*44]
mova [outq+1408], m6
mulps m1, [winq+4*24]
addps m1, [bufq+4*24]
mova [outq+768], m1
mulps m0, m2, [winq+4*124]
mova [bufq+4*44], m0
mulps m2, [winq+4*104]
mova [bufq+4*24], m2
addps m0, m5, m3
subps m5, m3
mulps m1, m5, [winq+4*60]
addps m1, [bufq+4*60]
mova [outq+1920], m1
mulps m5, [winq+4*8]
addps m5, [bufq+4*8]
mova [outq+256], m5
mulps m1, m0, [winq+4*140]
mova [bufq+4*60], m1
mulps m0, [winq+4*88]
mova [bufq+4*8], m0
; Out offsets 1536/640 and 1792/384:
mova m1, [tmpq+4*20]
addps m1, SPILLED(12)
mova m2, [tmpq+4*20]
subps m2, SPILLED(12)
UNSPILL 7, 8
subps m0, m7, SPILLED(11)
addps m7, SPILLED(11)
mulps m4, m7, [costabs + 16*12]
mulps m0, [costabs + 16*14]
addps m5, m1, m4
subps m1, m4
mulps m7, m1, [winq+4*48]
addps m7, [bufq+4*48]
mova [outq+1536], m7
mulps m1, [winq+4*20]
addps m1, [bufq+4*20]
mova [outq+640], m1
mulps m1, m5, [winq+4*128]
mova [bufq+4*48], m1
mulps m5, [winq+4*100]
mova [bufq+4*20], m5
addps m6, m2, m0
subps m2, m0
mulps m1, m2, [winq+4*56]
addps m1, [bufq+4*56]
mova [outq+1792], m1
mulps m2, [winq+4*12]
addps m2, [bufq+4*12]
mova [outq+384], m2
mulps m0, m6, [winq+4*136]
mova [bufq+4*56], m0
mulps m6, [winq+4*92]
mova [bufq+4*12], m6
; Out offsets 1664/512 (last pair):
UNSPILL 0, 14
mulps m0, [costabs + 16*13]
mova m3, [tmpq+4*4]
addps m2, m0, m3
subps m3, m0
mulps m0, m3, [winq+4*52]
addps m0, [bufq+4*52]
mova [outq+1664], m0
mulps m3, [winq+4*16]
addps m3, [bufq+4*16]
mova [outq+512], m3
mulps m0, m2, [winq+4*132]
mova [bufq+4*52], m0
mulps m2, [winq+4*96]
mova [bufq+4*16], m2
RET
%endmacro
; Instantiate four_imdct36_float for SSE and, when HAVE_AVX_EXTERNAL is set,
; for AVX.
INIT_XMM sse
DEFINE_FOUR_IMDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif

100
externals/ffmpeg/libavcodec/x86/inline_asm.h vendored Executable file
View File

@@ -0,0 +1,100 @@
/*
* inline assembly helper macros
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_INLINE_ASM_H
#define AVCODEC_X86_INLINE_ASM_H
#include "constants.h"
// MOVQ_WONE(regd): set every 16-bit word of register regd to 1
// (all-ones from pcmpeqd, then a logical right shift of each word by 15).
// Emitted as a standalone asm statement with no operands or clobber list.
#define MOVQ_WONE(regd) \
__asm__ volatile ( \
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
"psrlw $15, %%" #regd ::)
// JUMPALIGN(): emit a ".p2align 3" directive (align to 2^3 = 8 bytes).
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
// MOVQ_ZERO(regd): clear register regd by xor-ing it with itself.
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
// MOVQ_BFE(regd): set every byte of register regd to 0xfe
// (all-ones from pcmpeqd, then each byte added to itself: 0xff + 0xff = 0xfe
// with byte wrap-around). This is the constant the PAVGB* macros expect.
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
// MOVQ_WTWO(regd): load register regd with the ff_pw_2 constant.
// Non-PIC builds read ff_pw_2 from memory; PIC builds avoid the memory
// reference and synthesise a 2 in every 16-bit word in-register.
#ifndef PIC
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
#else
// In a shared library it is better to build the constant in a register than
// to access it in memory:
// pcmpeqd -> all bits set (-1); psrlw $15 -> 1 per word; psllw $1 -> 2 per word
#define MOVQ_WTWO(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"psllw $1, %%"#regd" \n\t"::)
#endif
// Byte-wise averages built from basic MMX operations. Both macros expand to
// a sequence of string literals (AT&T operand order: source, destination)
// meant to be pasted into a larger asm template; the register arguments are
// stringified as given, so the caller supplies any '%' escaping.
//
// regr is used as a temporary and receives the result.
// The first argument (rega) is left unmodified and the second (regb) is
// trashed.
// regfe is supposed to contain 0xfefefefefefefefe; masking with it before
// the 64-bit shift keeps bits from leaking between neighbouring bytes.
//
// PAVGB_MMX_NO_RND: regr = (a & b) + (((a ^ b) & 0xfe) >> 1), i.e. the
// average rounded down.
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
// PAVGB_MMX: regr = (a | b) - (((a ^ b) & 0xfe) >> 1), i.e. the average
// rounded up.
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"
// Paired versions of the PAVGB macros: two independent averages are computed
// with their instructions interleaved.
//   regr = avg(rega, regb)   and   regp = avg(regc, regd)
// rega and regc are left unmodified; regb and regd are trashed.
// mm6 is supposed to contain 0xfefefefefefefefe (the mask register is fixed
// here instead of being a macro argument).
//
// PAVGBP_MMX_NO_RND: averages rounded down, (x & y) + (((x ^ y) & 0xfe) >> 1).
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
// PAVGBP_MMX: averages rounded up, (x | y) - (((x ^ y) & 0xfe) >> 1).
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
#endif /* AVCODEC_X86_INLINE_ASM_H */

View File

@@ -0,0 +1,164 @@
;******************************************************************************
;* SIMD-optimized JPEG2000 DSP functions
;* Copyright (c) 2014 Nicolas Bertrand
;* Copyright (c) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; Coefficients for ict_float. Each one is replicated eight times (32 bytes)
; in a 32-byte aligned section, so it can be loaded with an aligned movaps
; into either an XMM or a YMM register.
SECTION_RODATA 32
pf_ict0: times 8 dd 1.402
pf_ict1: times 8 dd 0.34413
pf_ict2: times 8 dd 0.71414
pf_ict3: times 8 dd 1.772
SECTION .text
;***********************************************************************
; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
;
; In place, for every element i in [0, csize):
;   src0[i] = s0 + 1.402   * s2
;   src1[i] = s0 - 0.34413 * s1 - 0.71414 * s2
;   src2[i] = s0 + 1.772   * s1
; where s0/s1/s2 are the values read from src0/src1/src2.
;
; Macro argument %1 is the vector-register count handed to cglobal (the SSE
; build on x86-64 also needs m9 for ICT3, the others keep ICT3 in m3).
; All three buffers are accessed with aligned movaps, mmsize bytes per
; iteration, and the loop body always runs at least once, so csize*4 is
; assumed to be a positive multiple of mmsize -- TODO confirm with callers.
;***********************************************************************
%macro ICT_FLOAT 1
cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
; Turn csize into a byte count, point each srcN at the end of its buffer and
; walk a negative offset up towards zero.
shl csized, 2
add src0q, csizeq
add src1q, csizeq
add src2q, csizeq
neg csizeq
movaps m6, [pf_ict0]
movaps m7, [pf_ict1]
%define ICT0 m6
%define ICT1 m7
; ICT2/ICT3 live in registers where enough are available and are memory
; operands otherwise.
%if ARCH_X86_64
movaps m8, [pf_ict2]
%define ICT2 m8
%if cpuflag(avx)
movaps m3, [pf_ict3]
%define ICT3 m3
%else
movaps m9, [pf_ict3]
%define ICT3 m9
%endif
%else ; ARCH_X86_32
%define ICT2 [pf_ict2]
%if cpuflag(avx)
movaps m3, [pf_ict3]
%define ICT3 m3
%else
%define ICT3 [pf_ict3]
%endif
%endif ; ARCH
align 16
.loop:
movaps m0, [src0q+csizeq]
movaps m1, [src1q+csizeq]
movaps m2, [src2q+csizeq]
; After this block: m4 = new src0, m5 = new src1, m0 = new src2.
%if cpuflag(fma4) || cpuflag(fma3)
%if cpuflag(fma4)
fnmaddps m5, m1, ICT1, m0
fmaddps m4, m2, ICT0, m0
%else ; fma3
movaps m5, m1
movaps m4, m2
fnmaddps m5, m5, ICT1, m0
fmaddps m4, m4, ICT0, m0
%endif
fmaddps m0, m1, ICT3, m0
fnmaddps m5, m2, ICT2, m5
%else ; non FMA
%if cpuflag(avx)
mulps m5, m1, ICT1
mulps m4, m2, ICT0
mulps m1, m1, ICT3
mulps m2, m2, ICT2
subps m5, m0, m5
%else ; sse
; m3 is free as a temporary here: the non-AVX build never keeps ICT3 in m3.
movaps m3, m1
movaps m4, m2
movaps m5, m0
mulps m3, ICT1
mulps m4, ICT0
mulps m1, ICT3
mulps m2, ICT2
subps m5, m3
%endif
addps m4, m4, m0
addps m0, m0, m1
subps m5, m5, m2
%endif
movaps [src0q+csizeq], m4
movaps [src2q+csizeq], m0
movaps [src1q+csizeq], m5
; The offset is negative until the end of the buffers is reached.
add csizeq, mmsize
jl .loop
REP_RET
%endmacro
; Instantiate ict_float. The argument is the vector-register count passed to
; cglobal. SSE and FMA4 builds use XMM registers, AVX and FMA3 builds use YMM
; registers; the FMA4 build is emitted only when HAVE_FMA4_EXTERNAL is set.
INIT_XMM sse
ICT_FLOAT 10
INIT_YMM avx
ICT_FLOAT 9
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
ICT_FLOAT 9
%endif
INIT_YMM fma3
ICT_FLOAT 9
;***************************************************************************
; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
;
; In place, for every element i in [0, csize), with s0/s1/s2 the values read
; from src0/src1/src2:
;   t       = s0 - ((s1 + s2) >> 2)      (arithmetic shift)
;   src1[i] = t
;   src2[i] = s1 + t
;   src0[i] = s2 + t
; Buffers are accessed with mova, mmsize bytes per iteration.
;***************************************************************************
%macro RCT_INT 0
cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
    ; byte count, end-of-buffer pointers, negative running offset
    shl     csized, 2
    add     src2q, csizeq
    add     src1q, csizeq
    add     src0q, csizeq
    neg     csizeq

align 16
.loop:
    mova    m0, [src0q+csizeq]
    mova    m1, [src1q+csizeq]
    mova    m2, [src2q+csizeq]
    paddd   m3, m1, m2              ; s1 + s2
    psrad   m3, 2
    psubd   m0, m3                  ; m0 = t
    paddd   m2, m0                  ; m2 = s2 + t
    paddd   m1, m0                  ; m1 = s1 + t
    mova    [src1q+csizeq], m0
    mova    [src2q+csizeq], m1
    mova    [src0q+csizeq], m2
    add     csizeq, mmsize
    jl      .loop                   ; offset still negative: more data left
    REP_RET
%endmacro
; Instantiate rct_int for SSE2 (XMM) and, when HAVE_AVX2_EXTERNAL is set, for
; AVX2 (YMM).
INIT_XMM sse2
RCT_INT
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
RCT_INT
%endif

View File

@@ -0,0 +1,60 @@
/*
* SIMD optimized JPEG 2000 DSP functions
* Copyright (c) 2015 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"
void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma3(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma4(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
/**
 * Install the x86 SIMD implementations into a JPEG 2000 DSP context.
 *
 * The two mct_decode slots are filled independently. Within a slot the
 * checks run from the oldest to the newest instruction set, so the last
 * check that passes for the running CPU determines the function used.
 *
 * @param c DSP context whose mct_decode[] entries are overridden
 */
av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
    const int cpu_flags = av_get_cpu_flags();

    /* FF_DWT97 slot: float ICT routines. */
    if (EXTERNAL_SSE(cpu_flags))
        c->mct_decode[FF_DWT97] = ff_ict_float_sse;
    if (EXTERNAL_AVX_FAST(cpu_flags))
        c->mct_decode[FF_DWT97] = ff_ict_float_avx;
    if (EXTERNAL_FMA4(cpu_flags))
        c->mct_decode[FF_DWT97] = ff_ict_float_fma4;
    if (EXTERNAL_FMA3_FAST(cpu_flags))
        c->mct_decode[FF_DWT97] = ff_ict_float_fma3;

    /* FF_DWT53 slot: integer RCT routines. */
    if (EXTERNAL_SSE2(cpu_flags))
        c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
    if (EXTERNAL_AVX2_FAST(cpu_flags))
        c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
}

View File

@@ -0,0 +1,190 @@
;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; SCALARPRODUCT
; Emits scalarproduct_and_madd_int16 for the register set selected by the
; preceding INIT_MMX / INIT_XMM.
%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; Returns the sum over i in [0, order) of v1[i] * v2[i], computed with the
; original v1 values, and updates v1 in place: v1[i] += mul * v3[i] (16-bit
; wrap-around arithmetic).
; v1 is accessed with aligned mova; v2 and v3 may be unaligned (movu).
; Two vectors are handled per iteration and the body always runs at least
; once, so order*2 is assumed to be a positive multiple of 2*mmsize bytes
; -- TODO confirm with callers.
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    ; order is a 32-bit int: operate on the dword register so the byte count
    ; is zero-extended and does not depend on the unspecified upper half of
    ; the 64-bit argument register.
    shl     orderd, 1
    movd    m7, mulm
    ; broadcast the low 16 bits of mul to every word of m7
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6                  ; m6 = dword accumulators
    ; point at the end of each array and walk a negative offset up to zero
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4                  ; v1 * v2, adjacent pairs summed
    pmaddwd m1, m5
    pmullw  m2, m7                  ; v3 * mul (low 16 bits)
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4                  ; v1 + v3 * mul
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl      .loop
    HADDD   m6, m0                  ; horizontal sum of the accumulators
    movd    eax, m6
    RET
%endmacro
; Instantiate scalarproduct_and_madd_int16 for MMXEXT (64-bit MMX registers)
; and SSE2 (128-bit XMM registers).
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
INIT_XMM sse4
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; Returns the sum over i in [0, order) of v1[i] * v2[i] (v1 sign-extended to
; 32 bits, low 32 bits of each product kept), computed with the original v1
; values, and updates v1 in place: v1[i] += mul * v3[i] (16-bit wrap-around).
; v1 is accessed with aligned mova; v2 and v3 may be unaligned (movu).
; Eight elements (16 bytes of v1) are handled per iteration and the body
; always runs at least once, so order is assumed to be a positive multiple
; of 8 -- TODO confirm with callers.
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    ; order is a 32-bit int: operate on the dword register so the byte count
    ; is zero-extended and does not depend on the unspecified upper half of
    ; the 64-bit argument register.
    shl     orderd, 1               ; bytes of int16 data
    movd    m7, mulm
    SPLATW  m7, m7                  ; broadcast mul to every word
    pxor    m6, m6                  ; m6 = dword accumulators
    ; point at the end of each array; v2 holds 4-byte elements, so it moves
    ; twice as far and is indexed with 2*orderq
    add     v1q, orderq
    lea     v2q, [v2q + 2*orderq]
    add     v3q, orderq
    neg     orderq
.loop:
    mova    m3, [v1q + orderq]
    movu    m0, [v2q + 2*orderq]
    pmovsxwd m4, m3                 ; low four v1 words -> dwords
    movu    m1, [v2q + 2*orderq + mmsize]
    movhlps m5, m3
    movu    m2, [v3q + orderq]
    pmovsxwd m5, m5                 ; high four v1 words -> dwords
    pmullw  m2, m7                  ; v3 * mul (low 16 bits)
    pmulld  m0, m4
    pmulld  m1, m5
    paddw   m2, m3                  ; v1 + v3 * mul
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    add     orderq, 16
    jl      .loop
    HADDD   m6, m0                  ; horizontal sum of the accumulators
    movd    eax, m6
    RET
; SCALARPRODUCT_LOOP misalign
; One loop body of the SSSE3 scalarproduct_and_madd_int16, specialised for a
; v2/v3 byte misalignment of %1. The macro relies on the enclosing function:
; v2q/v3q already rounded down to 16 bytes, m4/m5 preloaded with the aligned
; vectors at [v2q + orderq] / [v3q + orderq], m7 = broadcast mul and m6 = the
; running dword accumulators.
; The loop walks orderq downwards, 2*mmsize bytes per iteration. For %1 != 0
; the unaligned v2/v3 data is rebuilt with palignr from neighbouring aligned
; loads; the lowest aligned vector of an iteration stays in m4/m5 for reuse
; as the highest one of the next iteration.
; Every variant except %1 == 0 ends with a jump to .end, so the %1 == 0
; variant is expected to be placed last and fall through to .end.
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
; The flags set by this sub are the ones tested by the jg at the bottom; no
; instruction in between modifies the flags.
sub orderq, mmsize*2
%if %1
mova m1, m4
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
palignr m1, m0, %1
palignr m0, m4, %1
mova m3, m5
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
palignr m3, m2, %1
palignr m2, m5, %1
%else
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%endif
; t0/t1 are the two v1 vectors: memory operands on x86-32, copies in m8/m9 on
; x86-64, where the extra registers are available.
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
mova m8, t0
mova m9, t1
%define t0 m8
%define t1 m9
%endif
pmaddwd m0, t0
pmaddwd m1, t1
pmullw m2, m7
pmullw m3, m7
paddw m2, t0
paddw m3, t1
paddd m6, m0
paddd m6, m1
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
jg .loop%1
%if %1
jmp .end
%endif
%endmacro
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; SSSE3 version: v2 and v3 are only ever read with aligned loads; their
; misalignment is undone with palignr inside one of eight specialised loops
; (see SCALARPRODUCT_LOOP). A single misalignment value, taken from v2, is
; applied to both v2 and v3, so both pointers are assumed to share the same
; offset within a 16-byte line -- TODO confirm with callers.
; The preloads below read the aligned vector at offset order*2 of the
; rounded-down v2/v3, i.e. the one that holds the bytes just past the last
; element (or starts right at the end when the pointers are aligned).
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    ; order is a 32-bit int: operate on the dword register so the byte count
    ; is zero-extended and does not depend on the unspecified upper half of
    ; the 64-bit argument register.
    shl     orderd, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7               ; m7 = mul broadcast to every word
    pxor    m6, m6                  ; m6 = dword accumulators
    mov     r4d, v2d
    and     r4d, 15                 ; r4d = byte misalignment of v2
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je      .loop0
    cmp     r4d, 2
    je      .loop2
    cmp     r4d, 4
    je      .loop4
    cmp     r4d, 6
    je      .loop6
    cmp     r4d, 8
    je      .loop8
    cmp     r4d, 10
    je      .loop10
    cmp     r4d, 12
    je      .loop12
    ; any other value falls through into the misalignment-14 loop
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0                  ; horizontal sum of the accumulators
    movd    eax, m6
    RET

View File

@@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
const int16_t *v3,
int order, int mul);
/**
 * Install the x86 SIMD implementations into a lossless-audio DSP context.
 *
 * Does nothing unless the build has HAVE_X86ASM. The int16 candidates are
 * checked from oldest to newest instruction set, so the last check that
 * passes determines the function used.
 *
 * @param c DSP context whose function pointers are overridden
 */
av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_X86ASM
    const int cpu_flags = av_get_cpu_flags();
    /* The SSSE3 routine is not selected on CPUs reporting SSE4.2 or 3DNow!
     * (marked "cachesplit" in the original code). */
    const int ssse3_excluded =
        cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW);

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    }
    if (EXTERNAL_SSSE3(cpu_flags) && !ssse3_excluded) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    }

    if (EXTERNAL_SSE4(cpu_flags)) {
        c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
    }
#endif
}

View File

@@ -0,0 +1,406 @@
;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;* Copyright (c) 2017 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; Byte tables used as pshufb control masks by the functions below.
; Naming: each character stands for one control byte (or a repeated group of
; them); 'z' is -1, for which pshufb produces a zero byte, any other character
; is the hexadecimal index of the source byte that gets copied.
SECTION_RODATA
cextern pb_15
; Only 8 bytes are defined under the next label, yet it is loaded as a 128-bit
; constant (VBROADCASTI128 in add_left_pred_unaligned): the upper half is the
; pb_7 table that follows, so these two lines must stay adjacent and in order.
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
; 16-byte tables: replicate the last 16-bit word of an XMM / MMX register.
pb_ef: times 8 db 14,15
pb_67: times 8 db 6, 7
; Prefix-sum steps for bytes (add byte 3 to bytes 4..7, byte 1 to bytes 2..3,
; and the same in every following group).
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
; Prefix-sum steps for 16-bit words (entries are byte pairs).
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
SECTION .text
;------------------------------------------------------------------------------
; void ff_add_median_pred_{mmxext,sse2}(uint8_t *dst, const uint8_t *top,
;                                       const uint8_t *diff, ptrdiff_t w,
;                                       int *left, int *left_top)
;
; For every byte of the row (all arithmetic modulo 256):
;   dst[i] = median(l, t, l + t - tl) + diff[i]
; with t = top[i], tl = top[i-1] (*left_top for i == 0) and l = dst[i-1]
; (*left for i == 0).  On return *left = dst[w-1] and *left_top = top[w-1].
;
; The row is processed in whole registers of mmsize bytes, so loads and stores
; reach up to the next multiple of mmsize past w.
; NOTE(review): LSHIFT/RSHIFT come from x86util.asm and are assumed here to
; shift the whole register by a number of bytes - confirm.
; NOTE(review): *left and *left_top are loaded with movd (4 bytes) and are not
; masked; this presumably relies on them being in 0..255 - confirm.
;------------------------------------------------------------------------------
%macro MEDIAN_PRED 0
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
; first block: build t (m1), tl (m4) and t-tl (m0), load l (m3)
movu m0, [topq]
mova m2, m0
movd m4, [left_topq]
LSHIFT m2, 1
mova m1, m0
por m4, m2
movd m3, [leftq]
psubb m0, m4 ; t-tl
; point at the end of the row and count wq from -w upwards
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
; following blocks: m1 still holds the last t of the previous block in its
; lowest byte (after the shifts below), which becomes tl of the first byte
movu m4, [topq+wq]
mova m0, m4
LSHIFT m4, 1
por m4, m1
mova m1, m0 ; t
psubb m0, m4 ; t-tl
.skip:
movu m2, [diffq+wq]
; The prediction is serial (each byte needs the previous result), so the body
; is unrolled mmsize times.  Each step works on the lowest byte only: m3 ends
; up holding the new output byte, which is also l for the next step.
%assign i 0
%rep mmsize
mova m4, m0
paddb m4, m3 ; t-tl+l
mova m5, m3
pmaxub m3, m1
pminub m5, m1
pminub m3, m4
pmaxub m3, m5 ; median
paddb m3, m2 ; +residual
; collect the outputs in m7: the new byte enters at the top and everything
; collected so far moves down by one byte
%if i==0
mova m7, m3
LSHIFT m7, mmsize-1
%else
mova m6, m3
RSHIFT m7, 1
LSHIFT m6, mmsize-1
por m7, m6
%endif
; bring the next input byte of t-tl, t and diff down to the lowest position
%if i<mmsize-1
RSHIFT m0, 1
RSHIFT m1, 1
RSHIFT m2, 1
%endif
%assign i i+1
%endrep
movu [dstq+wq], m7
add wq, mmsize
jl .loop
; hand the last output byte and the last top byte back to the caller
; (dstq/topq point at the end of the row; r2 is the diff argument register,
; which is no longer needed)
movzx r2d, byte [dstq-1]
mov [leftq], r2d
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET
%endmacro
; The MMXEXT version is only assembled for 32-bit x86 builds.
%if ARCH_X86_32
INIT_MMX mmxext
MEDIAN_PRED
%endif
INIT_XMM sse2
MEDIAN_PRED
; ADD_LEFT_LOOP: loop and epilogue shared by the add_left_pred functions.
; Computes dst[i] = src[i] + dst[i-1] (modulo 256), mmsize bytes per pass, and
; returns from the function (the macro ends in RET).
; Set up by the callers in this file before the macro is expanded:
;   m0 (xm0) = running "left" value in the highest byte of the register
;   m5 (xm5) = pshufb control replicating that highest byte
;   m3, m4   = prefix-sum shuffle controls; m6 as well when mmsize >= 16
%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
; point at the end of the row and count wq from -w upwards
add srcq, wq
add dstq, wq
neg wq
%%.loop:
; replicate the last output byte so far (initially "left") into every byte
pshufb xm0, xm5
%if %2
mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
; prefix sum inside the register: pairs, then groups of 4, then groups of 8
psllw m2, m1, 8
paddb m1, m2
pshufb m2, m1, m3
paddb m1, m2
pshufb m2, m1, m4
paddb m1, m2
%if mmsize >= 16
; ... and finally the whole 16-byte lane
pshufb m2, m1, m6
paddb m1, m2
%endif
; add the carried-in left value and store the first 16 (or 8) bytes
paddb xm0, xm1
%if %1
mova [dstq+wq], xm0
%else
; unaligned destination: written as two 8-byte halves
movq [dstq+wq], xm0
movhps [dstq+wq+8], xm0
%endif
%if mmsize == 32
; the two 128-bit lanes hold independent prefix sums: the upper one still
; needs the last output byte of the lower one
vextracti128 xm2, m1, 1 ; get second lane of the ymm
pshufb xm0, xm5 ; set alls val to last val of the first lane
paddb xm0, xm2
;store val
%if %1
mova [dstq+wq+16], xm0
%else;
movq [dstq+wq+16], xm0
movhps [dstq+wq+16+8], xm0
%endif
%endif
add wq, mmsize
jl %%.loop
; Return value = last output byte of the row.
%if mmsize == 32
; dstq points at the end of the row, so this is dst[w-1]
movzx eax, byte [dstq - 1]
%else;
; wq is now the number of bytes processed beyond w, so the last valid output
; sits at byte index mmsize-1-wq of m0; use that index as a pshufb control.
; Only the low byte of eax is the result: the other control bytes are zero,
; so the upper bytes of eax receive copies of byte 0 of m0.
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
%endif
RET
%endmacro
;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, ptrdiff_t w, int left)
;
; 8-byte (MMX register) SSSE3 version; loop and return value come from
; ADD_LEFT_LOOP, expanded with aligned loads and stores.
;------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
; shuffle controls expected by ADD_LEFT_LOOP (only the first 8 bytes of the
; 16-byte tables are loaded here)
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
; place "left" in the highest byte of m0
movd m0, leftm
psllq m0, 56
ADD_LEFT_LOOP 1, 1
; ADD_LEFT_PRED_UNALIGNED: emits add_left_pred_unaligned for the current
; register width.  Same operation as add_left_pred, but the pointers are
; tested at run time and one of three copies of ADD_LEFT_LOOP is entered
; (each copy ends in its own RET).
%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
; xm5 replicates byte 15; m6/m4/m3 are the prefix-sum controls, repeated in
; every 128-bit lane
mova xm5, [pb_15]
VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
; place "left" in the highest byte of xm0
movd xm0, leftm
pslldq xm0, 15
; an unaligned src always takes the fully unaligned copy, whatever dst is
test srcq, mmsize - 1
jnz .src_unaligned
test dstq, mmsize - 1
jnz .dst_unaligned
ADD_LEFT_LOOP 1, 1
.dst_unaligned:
ADD_LEFT_LOOP 0, 1
.src_unaligned:
ADD_LEFT_LOOP 0, 0
%endmacro
INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif
;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;
; dst[i] += src[i] (modulo 256) for i in [0, w).
; NOTE(review): the vector loop uses mova and memory operands of paddb, so the
; XMM/YMM builds presumably need dst and src aligned to mmsize - confirm
; against the callers.
;------------------------------------------------------------------------------
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
; size = part of w that is a multiple of 2*mmsize; skip the vector loop if 0
mov sizeq, wq
and sizeq, -2*mmsize
jz .2
; point past the vector part and count sizeq from -size upwards
add dstq, sizeq
add srcq, sizeq
neg sizeq
.1:
; two registers (2*mmsize bytes) per pass
mova m0, [srcq + sizeq]
mova m1, [srcq + sizeq + mmsize]
paddb m0, [dstq + sizeq]
paddb m1, [dstq + sizeq + mmsize]
mova [dstq + sizeq], m0
mova [dstq + sizeq + mmsize], m1
add sizeq, 2*mmsize
jl .1
.2:
; remaining w mod (2*mmsize) bytes, one at a time
and wq, 2*mmsize-1
jz .end
add dstq, wq
add srcq, wq
neg wq
.3:
; the size register is free again and serves as the byte scratch
mov sizeb, [srcq + wq]
add [dstq + wq], sizeb
inc wq
jl .3
.end:
REP_RET
%endmacro
; The MMX version is only assembled for 32-bit x86 builds.
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES
%endif
INIT_XMM sse2
ADD_BYTES
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_BYTES
%endif
; ADD_HFYU_LEFT_LOOP_INT16: loop and epilogue shared by the
; add_left_pred_int16 functions.  Computes, on 16-bit elements,
;   dst[i] = (src[i] + dst[i-1]) & mask
; and returns from the function (the macro ends in RET).
; Set up by the callers in this file before the macro is expanded:
;   m0 = running "left" value in the highest word of the register
;   m5 = pshufb control replicating that highest word
;   m3 = prefix-sum shuffle control; m4 as well when mmsize == 16
;   m7 = mask replicated into every word
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
; element count -> byte count.  This is a 32-bit add, so on x86-64 it also
; clears the upper half of wq.
add wd, wd
add srcq, wq
add dstq, wq
neg wq
%%.loop:
mov%2 m1, [srcq+wq]
; prefix sum inside the register: pairs of words first ...
mova m2, m1
pslld m1, 16
paddw m1, m2
mova m2, m1
; ... then groups of four words
pshufb m1, m3
paddw m1, m2
; replicate the last output word so far (initially "left") into every word
pshufb m0, m5
%if mmsize == 16
; ... and finally all eight words of the XMM register
mova m2, m1
pshufb m1, m4
paddw m1, m2
%endif
paddw m0, m1
pand m0, m7
%ifidn %1, a
mova [dstq+wq], m0
%else
; unaligned destination: written as two 8-byte halves
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
%endif
add wq, mmsize
jl %%.loop
; Return value = last output word of the row.  wq is now the number of bytes
; processed beyond the end, so the high byte of that word sits at index
; idx = mmsize-1-wq.  The pshufb control built below is {idx-1, idx, 0, 0};
; only the low 16 bits of eax are the result, the upper 16 bits receive copies
; of byte 0 of m0.
mov eax, mmsize-1
sub eax, wd
mov wd, eax
shl wd, 8
lea eax, [wd+eax-1]
movd m1, eax
pshufb m0, m1
movd eax, m0
RET
%endmacro
;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned left)
;
; 8-byte (MMX register) SSSE3 version; loop and return value come from
; ADD_HFYU_LEFT_LOOP_INT16, expanded with aligned loads and stores.
;---------------------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
; shuffle controls expected by the loop macro (first 8 bytes of each table)
mova m5, [pb_67]
mova m3, [pb_zzzz2323zzzzabab]
; place "left" in the highest word of m0
movd m0, leftm
psllq m0, 48
; m7 = mask in every word
; NOTE(review): SPLATW is an x86util.asm helper, assumed to replicate the
; lowest word - confirm
movd m7, maskm
SPLATW m7 ,m7
ADD_HFYU_LEFT_LOOP_INT16 a, a
; XMM version of add_left_pred_int16 that accepts unaligned pointers: the
; pointers are tested at run time and one of three copies of
; ADD_HFYU_LEFT_LOOP_INT16 is entered (each copy ends in its own RET).
INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
; shuffle controls expected by the loop macro
mova m5, [pb_ef]
mova m4, [pb_zzzzzzzz67676767]
mova m3, [pb_zzzz2323zzzzabab]
; place "left" in the highest word of m0
movd m0, leftm
pslldq m0, 14
; m7 = mask in every word
; NOTE(review): SPLATW is an x86util.asm helper, assumed to replicate the
; lowest word - confirm
movd m7, maskm
SPLATW m7 ,m7
; an unaligned src always takes the fully unaligned copy, whatever dst is
test srcq, 15
jnz .src_unaligned
test dstq, 15
jnz .dst_unaligned
ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
ADD_HFYU_LEFT_LOOP_INT16 u, u
;---------------------------------------------------------------------------------------------
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
;
; In-place gradient reconstruction of one row (arithmetic modulo 256):
;   src[x] += src[x-1] + src[x-stride] - src[x-stride-1]
; where src[x-1] is the already reconstructed neighbour.  The byte at src[-1]
; is read as the starting left value.
; NOTE(review): the current row and the row above are accessed with mova, so
; src, stride and the processed length presumably have to keep those accesses
; aligned to mmsize - confirm against the callers.
;---------------------------------------------------------------------------------------------
%macro ADD_GRADIENT_PRED 0
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
; xm0 = pshufb control replicating byte 15
mova xm0, [pb_15]
;load src - 1 in xm1
movd xm1, [srcq-1]
; replicate that byte into every byte of xm1 (running left value, "C")
%if cpuflag(avx2)
vpbroadcastb xm1, xm1
%else
pxor xm2, xm2
pshufb xm1, xm2
%endif
; point at the end of the row and count widthq from -width upwards;
; stride is negated so that src + stride addresses the row above
add srcq, widthq
neg widthq
neg strideq
.loop:
lea tmpq, [srcq + strideq]
mova m2, [tmpq + widthq] ; A = src[x-stride]
movu m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
mova m4, [srcq + widthq] ; current val (src[x])
psubb m2, m3; A - B
; prefix sum A-B
pslldq m3, m2, 1
paddb m2, m3
pslldq m3, m2, 2
paddb m2, m3
pslldq m3, m2, 4
paddb m2, m3
pslldq m3, m2, 8
paddb m2, m3
; prefix sum current val
pslldq m3, m4, 1
paddb m4, m3
pslldq m3, m4, 2
paddb m4, m3
pslldq m3, m4, 4
paddb m4, m3
pslldq m3, m4, 8
paddb m4, m3
; last sum
paddb m2, m4 ; current + (A - B)
paddb xm1, xm2 ; += C
mova [srcq + widthq], xm1 ; store
pshufb xm1, xm0 ; put last val in all val of xm1
%if mmsize == 32
; the upper 128-bit lane carries its own prefix sum and still needs the last
; output byte of the lower lane, which xm1 now holds in every byte
vextracti128 xm2, m2, 1 ; get second lane of the ymm
paddb xm1, xm2; += C
mova [srcq + widthq + 16], xm1 ; store
pshufb xm1, xm0 ; put last val in all val of m1
%endif
add widthq, mmsize
jl .loop
RET
%endmacro
INIT_XMM ssse3
ADD_GRADIENT_PRED
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_GRADIENT_PRED
%endif

View File

@@ -0,0 +1,128 @@
/*
* Lossless video DSP utils
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/asm.h"
#include "../lossless_videodsp.h"
#include "libavutil/x86/cpu.h"
/*
 * External SIMD implementations that ff_llviddsp_init_x86() below may install
 * into LLVidDSPContext.  They are only declared here; the suffix matches the
 * CPU-feature test under which each one is selected.
 */
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top);
void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top);
/* The left-prediction routines return an int in addition to filling dst. */
int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
/* Gradient prediction works in place on src. */
void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
/**
 * Median prediction with residual, scalar cmov implementation (32-bit x86).
 *
 * For every byte i (arithmetic modulo 256):
 *     dst[i] = median(l, t, l + t - tl) + diff[i]
 * with t = top[i], tl = the previous t (initially *left_top) and l = the
 * previous output byte (initially *left).
 *
 * The loop body runs once before the counter is tested, so w is expected to
 * be at least 1.
 *
 * @param dst      output row, w bytes are written
 * @param top      row above, w bytes are read
 * @param diff     residuals, w bytes are read
 * @param w        number of bytes to process
 * @param left     in: left neighbour of the first byte (low 8 bits are used);
 *                 out: last byte written
 * @param left_top in: top-left neighbour of the first byte (low 8 bits are
 *                 used); out: last byte read from top
 */
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                 const uint8_t *diff, ptrdiff_t w,
                                 int *left, int *left_top)
{
    x86_reg w2 = -w;            /* index relative to the row end, -w .. -1  */
    x86_reg x;                  /* scratch: top pointer, then l + t - tl    */
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;

    /*
     * Operands: %0 = l, %1 = tl, %2 = t, %3 = x, %4 = w2,
     *           %5 = dst + w, %6 = diff + w, %7 = top + w.
     * The asm loads from top/diff and stores to dst through plain register
     * operands, so the "memory" clobber is required to tell the compiler
     * that memory is read and written here.
     */
    __asm__ volatile (
        "mov %7, %3 \n"             /* x = top + w                          */
        "1: \n"
        "movzbl (%3, %4), %2 \n"    /* t = top[i]                           */
        "mov %2, %k3 \n"            /* x = t                                */
        "sub %b1, %b3 \n"           /* x = t - tl          (low byte)       */
        "add %b0, %b3 \n"           /* x = t - tl + l      (low byte)       */
        "mov %2, %1 \n"             /* tl = t, kept for the next byte       */
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"           /* t = min(t, l)                        */
        "cmovg %1, %0 \n"           /* l = max(t, l)   (tl holds the old t) */
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"          /* l = min(l, x)                        */
        "mov %7, %3 \n"             /* x = top + w again for the next load  */
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"           /* l = max(l, t) -> the median          */
        "add (%6, %4), %b0 \n"      /* l += diff[i]        (low byte)       */
        "mov %b0, (%5, %4) \n"      /* dst[i] = l                           */
        "inc %4 \n"
        "jl 1b \n"                  /* loop while the index is negative     */
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        : "memory"
    );
    *left     = l;
    *left_top = tl;
}
#endif
/**
 * Install the x86 versions of the lossless-video DSP routines.
 *
 * The checks run from the oldest to the newest instruction set; a later
 * match overwrites what an earlier one installed.
 *
 * @param c DSP context whose function pointers are updated in place
 */
void ff_llviddsp_init_x86(LLVidDSPContext *c)
{
    const int flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
    /* inline-asm median predictor, 32-bit builds only */
    if (flags & AV_CPU_FLAG_CMOV) {
        c->add_median_pred = add_median_pred_cmov;
    }
#endif

    if (ARCH_X86_32 && EXTERNAL_MMX(flags))
        c->add_bytes = ff_add_bytes_mmx;

    /* The MMXEXT median predictor is slower than the cmov version on AMD,
     * so it is left out when the CPU reports 3DNow!. */
    if (ARCH_X86_32 && EXTERNAL_MMXEXT(flags) && !(flags & AV_CPU_FLAG_3DNOW))
        c->add_median_pred = ff_add_median_pred_mmxext;

    if (EXTERNAL_SSE2(flags)) {
        c->add_median_pred = ff_add_median_pred_sse2;
        c->add_bytes       = ff_add_bytes_sse2;
    }

    if (EXTERNAL_SSSE3(flags)) {
        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
        c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
        c->add_left_pred       = ff_add_left_pred_ssse3;
    }

    /* XMM versions that accept unaligned pointers */
    if (EXTERNAL_SSSE3_FAST(flags)) {
        c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
        c->add_left_pred       = ff_add_left_pred_unaligned_ssse3;
    }

    if (EXTERNAL_AVX2_FAST(flags)) {
        c->add_gradient_pred = ff_add_gradient_pred_avx2;
        c->add_left_pred     = ff_add_left_pred_unaligned_avx2;
        c->add_bytes         = ff_add_bytes_avx2;
    }
}

View File

@@ -0,0 +1,194 @@
;************************************************************************
;* SIMD-optimized lossless video encoding functions
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
cextern pb_80
SECTION .text
; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; intptr_t w);
;
; DIFF_BYTES_PROLOGUE: function header shared by all diff_bytes versions.
; After it, wq holds w and "i" names the temporary register t0q used as loop
; index.
%macro DIFF_BYTES_PROLOGUE 0
%if ARCH_X86_32
; 32-bit: only the three pointers are loaded by cglobal; w is read from its
; argument slot into r4, and r3 becomes the temporary
cglobal diff_bytes, 3,5,2, dst, src1, src2
%define wq r4q
DECLARE_REG_TMP 3
mov wq, r3mp
%else
; 64-bit: all four arguments are loaded by cglobal, r4 is the temporary
cglobal diff_bytes, 4,5,2, dst, src1, src2, w
DECLARE_REG_TMP 4
%endif ; ARCH_X86_32
%define i t0q
%endmacro
; DIFF_BYTES_LOOP_PREP: set up a loop that handles 2*regsize bytes per pass.
; First argument:  label to jump to when w is non-negative but smaller than
;                  2*regsize, i.e. there is no full pass to do.
; Second argument: label to jump to when w is negative.
; Otherwise the three pointers are advanced past the part the loop will cover
; and i is left holding minus that byte count.
%macro DIFF_BYTES_LOOP_PREP 2
mov i, wq
and i, -2 * regsize
js %2
jz %1
add dstq, i
add src1q, i
add src2q, i
neg i
%endmacro
; DIFF_BYTES_LOOP_CORE: one pass of dst = src1 - src2 over 2*regsize bytes at
; offset i.
; mov type used for src1q, dstq, first reg, second reg
%macro DIFF_BYTES_LOOP_CORE 4
%if mmsize != 16
; src2 is used directly as the memory operand of psubb
mov%1 %3, [src1q + i]
mov%1 %4, [src1q + i + regsize]
psubb %3, [src2q + i]
psubb %4, [src2q + i + regsize]
mov%2 [dstq + i], %3
mov%2 [regsize + dstq + i], %4
%else
; SSE enforces alignment of psubb operand
; so src2 is first loaded with movu into the second register, and the two
; halves are handled one after the other
mov%1 %3, [src1q + i]
movu %4, [src2q + i]
psubb %3, %4
mov%2 [dstq + i], %3
mov%1 %3, [src1q + i + regsize]
movu %4, [src2q + i + regsize]
psubb %3, %4
mov%2 [regsize + dstq + i], %3
%endif
%endmacro
; DIFF_BYTES_BODY: main loop, optional half-width loop, byte-wise tail and
; return.  All labels carry the two mov-type letters as a suffix so that
; several copies can live in one function; the caller must already have run
; DIFF_BYTES_LOOP_PREP before falling into (or jumping to) the main loop.
%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
%define regsize mmsize
.loop_%1%2:
DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
add i, 2 * regsize
jl .loop_%1%2
.skip_main_%1%2:
; bytes left over after the full-width passes
and wq, 2 * regsize - 1
jz .end_%1%2
%if mmsize > 16
; fall back to narrower xmm
%define regsize (mmsize / 2)
; NOTE(review): the two labels below are hard-coded to the "aa" copy rather
; than built from the macro arguments; they resolve because every function
; in this file that expands the body also expands it with a, a.
DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa, .end_aa
.loop2_%1%2:
DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
add i, 2 * regsize
jl .loop2_%1%2
.setup_loop_gpr_%1%2:
and wq, 2 * regsize - 1
jz .end_%1%2
%endif
; byte-wise tail: point at the end and count wq from -remaining upwards
add dstq, wq
add src1q, wq
add src2q, wq
neg wq
.loop_gpr_%1%2:
mov t0b, [src1q + wq]
sub t0b, [src2q + wq]
mov [dstq + wq], t0b
inc wq
jl .loop_gpr_%1%2
.end_%1%2:
REP_RET
%endmacro
; MMX version of diff_bytes, assembled for 32-bit x86 builds only.  A single
; copy of the body ("a, a") is used, without any pointer alignment test.
%if ARCH_X86_32
INIT_MMX mmx
DIFF_BYTES_PROLOGUE
%define regsize mmsize
DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
DIFF_BYTES_BODY a, a
%undef i
%endif
; SSE2 version of diff_bytes.  Three copies of the body are emitted and the
; main loop is chosen from the alignment of dst and src1 (src2 is always
; loaded with movu in the 16-byte core):
;   dst unaligned                -> .loop_uu
;   dst aligned, src1 unaligned  -> .loop_ua
;   both aligned                 -> falls through into .loop_aa
INIT_XMM sse2
DIFF_BYTES_PROLOGUE
%define regsize mmsize
DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
test dstq, regsize - 1
jnz .loop_uu
test src1q, regsize - 1
jnz .loop_ua
DIFF_BYTES_BODY a, a
DIFF_BYTES_BODY u, a
DIFF_BYTES_BODY u, u
%undef i
; AVX2 version of diff_bytes, assembled only when the build supports external
; AVX2 code.  Same layout as the SSE2 version: three copies of the body, main
; loop chosen from the alignment of dst and src1; each body additionally
; contains the half-width (xmm) loop for what the 32-byte loop leaves over.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DIFF_BYTES_PROLOGUE
%define regsize mmsize
; Directly using unaligned SSE2 version is marginally faster than
; branching based on arguments.
; NOTE(review): presumably this explains why the short-input labels passed
; below are the "uu" ones; the full-width loop is still selected by the
; alignment tests that follow - confirm.
DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
test dstq, regsize - 1
jnz .loop_uu
test src1q, regsize - 1
jnz .loop_ua
DIFF_BYTES_BODY a, a
DIFF_BYTES_BODY u, a
DIFF_BYTES_BODY u, u
%undef i
%endif
;--------------------------------------------------------------------------------------------------
;void sub_left_predict(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height)
;
; Writes dst[x] = src[x] - src[x-1] (modulo 256) for height rows.  The
; predecessor of the first byte of a row is the last byte of the previous row;
; for the very first byte it comes from the pb_80 constant.  src advances by
; stride per row, dst by width.  Rows are processed 32 bytes per pass with
; unaligned loads and stores, so accesses reach up to the next multiple of 32
; past width.
; NOTE(review): pb_80 is defined outside this file; the shuffle mask built
; below relies on its bytes having the top bit set (presumably 0x80) - confirm.
;--------------------------------------------------------------------------------------------------
INIT_XMM avx
cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
mova m1, [pb_80] ; prev initial
; point at the end of the first row and count widthq from -width upwards
add dstq, widthq
add srcq, widthq
; m4 = pshufb control that moves byte (width-1) & 15, the position of the
; row's last byte inside its 16-byte block, to byte 15; the other control
; bytes are the ones copied from m1
lea xd, [widthq-1]
neg widthq
and xd, 15
pinsrb m4, m1, xd, 15
; x keeps -width for the whole function
mov xq, widthq
.loop:
; m1 enters holding the previous 16 source bytes (byte 15 = predecessor)
movu m0, [srcq + widthq]
palignr m2, m0, m1, 15
movu m1, [srcq + widthq + 16]
palignr m3, m1, m0, 15
psubb m2, m0, m2
psubb m3, m1, m3
movu [dstq + widthq], m2
movu [dstq + widthq + 16], m3
add widthq, 2 * 16
jl .loop
; end of row: step to the next one
add srcq, strideq
sub dstq, xq ; dst + width
; choose the register that holds the last byte of the row: bit 4 of -width is
; set when it lies in the first half (m0) of the final 32-byte pass,
; otherwise it is in m1
test xd, 16
jz .mod32
mova m1, m0
.mod32:
; move that byte to position 15 so it becomes the predecessor of the next
; row's first byte, then restart the column counter
pshufb m1, m4
mov widthq, xq
dec heightd
jg .loop
RET

Some files were not shown because too many files have changed in this diff Show More