early-access version 1432

externals/ffmpeg/libavcodec/x86/Makefile (vendored executable file, 199 lines added)
@@ -0,0 +1,199 @@
OBJS += x86/constants.o \

# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
        x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
        x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
        x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o

# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
        x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
        x86/vp9dsp_init_10bpp.o \
        x86/vp9dsp_init_12bpp.o \
        x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o


# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o

# decoders/encoders
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o

# subsystems
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
        x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
        x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
        x86/h264_deblock_10bit.o \
        x86/h264_idct.o \
        x86/h264_idct_10bit.o \
        x86/h264_weight.o \
        x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
        x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
        x86/h264_qpel_10bit.o \
        x86/fpel.o \
        x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
        x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
        x86/fpel.o \
        x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
        x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
        x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
        x86/vp8dsp_loopfilter.o

# decoders/encoders
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
        x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
        x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
        x86/hevc_deblock.o \
        x86/hevc_idct.o \
        x86/hevc_mc.o \
        x86/hevc_sao.o \
        x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
        x86/vp9intrapred_16bpp.o \
        x86/vp9itxfm.o \
        x86/vp9itxfm_16bpp.o \
        x86/vp9lpf.o \
        x86/vp9lpf_16bpp.o \
        x86/vp9mc.o \
        x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o

externals/ffmpeg/libavcodec/x86/aacencdsp.asm (vendored executable file, 86 lines added)
@@ -0,0 +1,86 @@
;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

float_abs_mask: times 4 dd 0x7fffffff

SECTION .text

;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
    mova m2, [float_abs_mask]
    shl sizeq, 2
    add inq, sizeq
    add outq, sizeq
    neg sizeq
.loop:
    andps m0, m2, [inq+sizeq]
    sqrtps m1, m0
    mulps m0, m1
    sqrtps m0, m0
    mova [outq+sizeq], m0
    add sizeq, mmsize
    jl .loop
    RET

;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
;                           int size, int is_signed, int maxval, const float Q34,
;                           const float rounding)
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
%if UNIX64 == 0
    movss m0, Q34m
    movss m1, roundingm
    cvtsi2ss m3, dword maxvalm
%else
    cvtsi2ss m3, maxvald
%endif
    shufps m0, m0, 0
    shufps m1, m1, 0
    shufps m3, m3, 0
    shl is_signedd, 31
    movd m4, is_signedd
    shufps m4, m4, 0
    shl sized, 2
    add inq, sizeq
    add outq, sizeq
    add scaledq, sizeq
    neg sizeq
.loop:
    mulps m2, m0, [scaledq+sizeq]
    addps m2, m1
    minps m2, m3
    andps m5, m4, [inq+sizeq]
    orps m2, m5
    cvttps2dq m2, m2
    mova [outq+sizeq], m2
    add sizeq, mmsize
    jl .loop
    RET
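
For orientation, a plain-C sketch of what the two aacencdsp.asm routines above
compute; the _ref names are hypothetical stand-ins, not FFmpeg API. abs_pow34
evaluates |x|^(3/4) as sqrt(|x| * sqrt(|x|)), exactly the
andps/sqrtps/mulps/sqrtps sequence, and aac_quantize_bands scales by Q34, adds
the rounding constant, clamps to maxval (minps), re-applies the input's sign
bit (orps), and truncates (cvttps2dq):

#include <math.h>

static void abs_pow34_ref(float *out, const float *in, int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);
        out[i] = sqrtf(a * sqrtf(a));   /* |x|^(3/4) = sqrt(|x| * sqrt(|x|)) */
    }
}

static void quantize_bands_ref(int *out, const float *in, const float *scaled,
                               int size, int is_signed, int maxval,
                               float Q34, float rounding)
{
    for (int i = 0; i < size; i++) {
        float qc = scaled[i] * Q34 + rounding;
        if (qc > (float)maxval)
            qc = (float)maxval;          /* minps against the splatted maxval */
        if (is_signed && in[i] < 0.0f)
            qc = -qc;                    /* orps with the extracted sign bit */
        out[i] = (int)qc;                /* cvttps2dq truncates toward zero */
    }
}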

externals/ffmpeg/libavcodec/x86/aacencdsp_init.c (vendored executable file, 43 lines added)
@@ -0,0 +1,43 @@
/*
 * AAC encoder assembly optimizations
 * Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/float_dsp.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/aacenc.h"

void ff_abs_pow34_sse(float *out, const float *in, const int size);

void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
                                int size, int is_signed, int maxval, const float Q34,
                                const float rounding);

av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags))
        s->abs_pow34 = ff_abs_pow34_sse;

    if (EXTERNAL_SSE2(cpu_flags))
        s->quant_bands = ff_aac_quantize_bands_sse2;
}

externals/ffmpeg/libavcodec/x86/aacpsdsp.asm (vendored executable file, 487 lines added)
@@ -0,0 +1,487 @@
;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

SECTION .text

;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
    shl nd, 3
    add srcq, nq
    neg nq

align 16
.loop:
    movaps m0, [srcq+nq]
    movaps m1, [srcq+nq+mmsize]
    mulps m0, m0
    mulps m1, m1
    HADDPS m0, m1, m2
    addps m0, [dstq]
    movaps [dstq], m0
    add dstq, mmsize
    add nq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3

;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
;                               float *src1, int n);
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
    shl nd, 3
    add src1q, nq
    add dstq, nq
    neg nq

align 16
.loop:
    movu m0, [src1q+nq]
    movu m1, [src1q+nq+mmsize]
    mova m2, [src2q]
    mova m3, m2
    unpcklps m2, m2
    unpckhps m3, m3
    mulps m0, m2
    mulps m1, m3
    mova [dstq+nq], m0
    mova [dstq+nq+mmsize], m1
    add src2q, mmsize
    add nq, mmsize*2
    jl .loop
    REP_RET

;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
;                                   float h[2][4], float h_step[2][4],
;                                   int len);
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    movaps m0, [hq]
    movaps m1, [h_stepq]
    unpcklps m4, m0, m0
    unpckhps m0, m0
    unpcklps m5, m1, m1
    unpckhps m1, m1
    shl nd, 3
    add lq, nq
    add rq, nq
    neg nq

align 16
.loop:
    addps m4, m5
    addps m0, m1
    movddup m2, [lq+nq]
    movddup m3, [rq+nq]
    mulps m2, m4
    mulps m3, m0
    addps m2, m3
    movsd [lq+nq], m2
    movhps [rq+nq], m2
    add nq, 8
    jl .loop
    REP_RET

;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
;                                       float h[2][4], float h_step[2][4],
;                                       int len);
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
    movaps m0, [hq]
    movaps m1, [hq+mmsize]
%if ARCH_X86_64
    movaps m8, [h_stepq]
    movaps m9, [h_stepq+mmsize]
%define H_STEP0 m8
%define H_STEP1 m9
%else
%define H_STEP0 [h_stepq]
%define H_STEP1 [h_stepq+mmsize]
%endif
    shl nd, 3
    add lq, nq
    add rq, nq
    neg nq

align 16
.loop:
    addps m0, H_STEP0
    addps m1, H_STEP1
    movddup m2, [lq+nq]
    movddup m3, [rq+nq]
    shufps m4, m2, m2, q2301
    shufps m5, m3, m3, q2301
    unpcklps m6, m0, m0
    unpckhps m7, m0, m0
    mulps m2, m6
    mulps m3, m7
    unpcklps m6, m1, m1
    unpckhps m7, m1, m1
    mulps m4, m6
    mulps m5, m7
    addps m2, m3
    addsubps m2, m4
    addsubps m2, m5
    movsd [lq+nq], m2
    movhps [rq+nq], m2
    add nq, 8
    jl .loop
    REP_RET

;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
;                                   float (*in)[32][2],
;                                   int i, int len)
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
    movsxdifnidn iq, id
    mov lend, 32 << 3
    lea inq, [inq+iq*4]
    mov tmpd, id
    shl tmpd, 8
    add outq, tmpq
    mov tmpd, 64
    sub tmpd, id
    mov id, tmpd

    test id, 1
    jne .loop4
    test id, 2
    jne .loop8

align 16
.loop16:
    mov in0q, inq
    mov in1q, 38*64*4
    add in1q, in0q
    mov tmpd, lend

.inner_loop16:
    movaps m0, [in0q]
    movaps m1, [in1q]
    movaps m2, [in0q+lenq]
    movaps m3, [in1q+lenq]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps [outq], m0
    movaps [outq+lenq], m1
    movaps [outq+lenq*2], m2
    movaps [outq+3*32*2*4], m3
    lea in0q, [in0q+lenq*2]
    lea in1q, [in1q+lenq*2]
    add outq, mmsize
    sub tmpd, mmsize
    jg .inner_loop16
    add inq, 16
    add outq, 3*32*2*4
    sub id, 4
    jg .loop16
    RET

align 16
.loop8:
    mov in0q, inq
    mov in1q, 38*64*4
    add in1q, in0q
    mov tmpd, lend

.inner_loop8:
    movlps m0, [in0q]
    movlps m1, [in1q]
    movhps m0, [in0q+lenq]
    movhps m1, [in1q+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    movaps [outq], m0
    movaps [outq+lenq], m1
    lea in0q, [in0q+lenq*2]
    lea in1q, [in1q+lenq*2]
    add outq, mmsize
    sub tmpd, mmsize
    jg .inner_loop8
    add inq, 8
    add outq, lenq
    sub id, 2
    jg .loop16
    RET

align 16
.loop4:
    mov in0q, inq
    mov in1q, 38*64*4
    add in1q, in0q
    mov tmpd, lend

.inner_loop4:
    movss m0, [in0q]
    movss m1, [in1q]
    movss m2, [in0q+lenq]
    movss m3, [in1q+lenq]
    movlhps m0, m1
    movlhps m2, m3
    shufps m0, m2, q2020
    movaps [outq], m0
    lea in0q, [in0q+lenq*2]
    lea in1q, [in1q+lenq*2]
    add outq, mmsize
    sub tmpd, mmsize
    jg .inner_loop4
    add inq, 4
    sub id, 1
    test id, 2
    jne .loop8
    cmp id, 4
    jge .loop16
    RET

;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
;                                    float (*in)[32][2],
;                                    int i, int len)
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    movsxdifnidn iq, id
    mov lend, 32 << 3
    lea outq, [outq+iq*4]
    mov tmpd, id
    shl tmpd, 8
    add inq, tmpq
    mov tmpd, 64
    sub tmpd, id
    mov id, tmpd

    test id, 1
    jne .loop4
    test id, 2
    jne .loop8

align 16
.loop16:
    mov out0q, outq
    mov out1q, 38*64*4
    add out1q, out0q
    mov tmpd, lend

.inner_loop16:
    movaps m0, [inq]
    movaps m1, [inq+lenq]
    movaps m2, [inq+lenq*2]
    movaps m3, [inq+3*32*2*4]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps [out0q], m0
    movaps [out1q], m1
    movaps [out0q+lenq], m2
    movaps [out1q+lenq], m3
    lea out0q, [out0q+lenq*2]
    lea out1q, [out1q+lenq*2]
    add inq, mmsize
    sub tmpd, mmsize
    jg .inner_loop16
    add outq, 16
    add inq, 3*32*2*4
    sub id, 4
    jg .loop16
    RET

align 16
.loop8:
    mov out0q, outq
    mov out1q, 38*64*4
    add out1q, out0q
    mov tmpd, lend

.inner_loop8:
    movaps m0, [inq]
    movaps m1, [inq+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    MOVH [out0q], m0
    MOVH [out1q], m1
    movhps [out0q+lenq], m0
    movhps [out1q+lenq], m1
    lea out0q, [out0q+lenq*2]
    lea out1q, [out1q+lenq*2]
    add inq, mmsize
    sub tmpd, mmsize
    jg .inner_loop8
    add outq, 8
    add inq, lenq
    sub id, 2
    jg .loop16
    RET

align 16
.loop4:
    mov out0q, outq
    mov out1q, 38*64*4
    add out1q, out0q
    mov tmpd, lend

.inner_loop4:
    movaps m0, [inq]
    movss [out0q], m0
%if cpuflag(sse4)
    extractps [out1q], m0, 1
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps m1, m0
    movss [out0q+lenq], m1
    shufps m0, m0, 0xb1
    movss [out1q], m0
    movhlps m1, m0
    movss [out1q+lenq], m1
%endif
    lea out0q, [out0q+lenq*2]
    lea out1q, [out1q+lenq*2]
    add inq, mmsize
    sub tmpd, mmsize
    jg .inner_loop4
    add outq, 4
    sub id, 1
    test id, 2
    jne .loop8
    cmp id, 4
    jge .loop16
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT

;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
;                                 const float (*filter)[8][2],
;                                 ptrdiff_t stride, int n);
;*******************************************************************
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu %1, [inq+mmsize*%3]
    movu m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
    pshufd %2, %1, q2301
    pshufd m4, m1, q0123
    pshufd m1, m1, q1032
    pshufd m2, [filterq+nq+mmsize*%3], q2301
    addsubps %2, m4
    addsubps %1, m1
%else
    mova m2, [filterq+nq+mmsize*%3]
    mova %2, %1
    mova m4, m1
    shufps %2, %2, q2301
    shufps m4, m4, q0123
    shufps m1, m1, q1032
    shufps m2, m2, q2301
    xorps m4, m7
    xorps m1, m7
    subps %2, m4
    subps %1, m1
%endif
    mulps %2, m2
    mulps %1, m2
%if %3
    addps m3, %2
    addps m0, %1
%endif
%endmacro

%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    shl strideq, 3
    shl nd, 6
    add filterq, nq
    neg nq
    mova m7, [ps_p1m1p1m1]

align 16
.loop:
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

%if cpuflag(sse3)
    pshufd m3, m3, q2301
    xorps m0, m7
    hsubps m3, m0
    pshufd m1, m3, q0020
    pshufd m3, m3, q0031
    addps m1, m3
    movsd m2, [inq+6*8]
%else
    mova m1, m3
    mova m2, m0
    shufps m1, m1, q2301
    shufps m2, m2, q2301
    subps m1, m3
    addps m2, m0
    unpcklps m3, m1, m2
    unpckhps m1, m2
    addps m1, m3
    movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
    movss m3, [filterq+nq+8*6]
    SPLATD m3
    mulps m2, m3
    addps m1, m2
    MOVH [outq], m1
    add outq, strideq
    add nq, 64
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
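
As a reading aid, scalar equivalents of the first two aacpsdsp.asm routines
above (hypothetical _ref names; FFmpeg keeps its own C reference versions in
an aacpsdsp template source). add_squares accumulates the power of each
complex sample, and mul_pair_single scales each complex pair by a real gain:

static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}

static void ps_mul_pair_single_ref(float (*dst)[2], float (*src0)[2],
                                   float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];   /* real part */
        dst[i][1] = src0[i][1] * src1[i];   /* imaginary part */
    }
}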

externals/ffmpeg/libavcodec/x86/aacpsdsp_init.c (vendored executable file, 72 lines added)
@@ -0,0 +1,72 @@
/*
 * SIMD optimized MPEG-4 Parametric Stereo decoding functions
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

#include "config.h"

#include "libavutil/x86/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/aacpsdsp.h"

void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n);
void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
                                float *src1, int n);
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
                                const float (*filter)[8][2],
                                ptrdiff_t stride, int n);
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
                                const float (*filter)[8][2],
                                ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
                                   float h[2][4], float h_step[2][4],
                                   int len);
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
                                          float h[2][4], float h_step[2][4],
                                          int len);
void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
                                      int i, int len);
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
                                       int i, int len);
void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
                                      int i, int len);

av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags)) {
        s->add_squares = ff_ps_add_squares_sse;
        s->mul_pair_single = ff_ps_mul_pair_single_sse;
        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
        s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->add_squares = ff_ps_add_squares_sse3;
        s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
        s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
        s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
    }
}

externals/ffmpeg/libavcodec/x86/ac3dsp.asm (vendored executable file, 552 lines added)
@@ -0,0 +1,552 @@
;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

; used in ff_ac3_extract_exponents()
cextern pd_1
pd_151: times 4 dd 151

; used in ff_apply_window_int16()
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    shl reuse_blksq, 8
    jz .end
    LOOP_ALIGN
.nextexp:
    mov offsetq, reuse_blksq
    mova m0, [expq+offsetq]
    sub offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB m0, [expq+offsetq], m1
    sub offsetq, 256
    jae .nextblk
    mova [expq], m0
    add expq, mmsize
    sub expnq, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;    This is used for ssse3 because of the pabsw instruction.
;    It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;    This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------

; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
    movhlps %2, %1
    por %1, %2
    pshuflw %2, %1, q0032
    por %1, %2
    pshuflw %2, %1, q0001
    por %1, %2
%elif cpuflag(mmxext)
    pshufw %2, %1, q0032
    por %1, %2
    pshufw %2, %1, q0001
    por %1, %2
%else ; mmx
    movq %2, %1
    psrlq %2, 32
    por %1, %2
    movq %2, %1
    psrlq %2, 16
    por %1, %2
%endif
%endmacro

%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
    pxor m2, m2
    pxor m3, m3
.loop:
%ifidn %1, min_max
    mova m0, [srcq]
    mova m1, [srcq+mmsize]
    pminsw m2, m0
    pminsw m2, m1
    pmaxsw m3, m0
    pmaxsw m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
    mova m0, [srcq]
    mova m1, [srcq+mmsize]
    ABS2 m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw m0, [srcq]
    pabsw m1, [srcq+mmsize]
%endif
    por m2, m0
    por m2, m1
%endif
    add srcq, mmsize*2
    sub lend, mmsize
    ja .loop
%ifidn %1, min_max
    ABS2 m2, m3, m0, m1
    por m2, m3
%endif
    OR_WORDS_HORIZ m2, m0
    movd eax, m2
    and eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs

;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;-----------------------------------------------------------------------------

%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction, instruction set
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
    movd m0, shiftd
.loop:
    mova m1, [srcq         ]
    mova m2, [srcq+mmsize  ]
    mova m3, [srcq+mmsize*2]
    mova m4, [srcq+mmsize*3]
    %3 m1, m0
    %3 m2, m0
    %3 m3, m0
    %3 m4, m0
    mova [srcq         ], m1
    mova [srcq+mmsize  ], m2
    mova [srcq+mmsize*2], m3
    mova [srcq+mmsize*3], m4
    add srcq, mmsize*4
    sub lend, mmsize*32/%2
    ja .loop
.end:
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    movq m0, [pf_1_24]
.loop:
    movq m1, [srcq   ]
    movq m2, [srcq+8 ]
    movq m3, [srcq+16]
    movq m4, [srcq+24]
    pfmul m1, m0
    pfmul m2, m0
    pfmul m3, m0
    pfmul m4, m0
    pf2id m1, m1
    pf2id m2, m2
    pf2id m3, m3
    pf2id m4, m4
    movq [dstq   ], m1
    movq [dstq+8 ], m2
    movq [dstq+16], m3
    movq [dstq+24], m4
    add srcq, 32
    add dstq, 32
    sub lend, 8
    ja .loop
    femms
    RET

INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps m0, [pf_1_24]
.loop:
    movaps m1, [srcq   ]
    movaps m2, [srcq+16]
    mulps m1, m0
    mulps m2, m0
    cvtps2pi mm0, m1
    movhlps m1, m1
    cvtps2pi mm1, m1
    cvtps2pi mm2, m2
    movhlps m2, m2
    cvtps2pi mm3, m2
    movq [dstq   ], mm0
    movq [dstq+ 8], mm1
    movq [dstq+16], mm2
    movq [dstq+24], mm3
    add srcq, 32
    add dstq, 32
    sub lend, 8
    ja .loop
    emms
    RET

INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps m0, [pf_1_24]
.loop:
    movaps m1, [srcq    ]
    movaps m2, [srcq+16 ]
    movaps m3, [srcq+32 ]
    movaps m4, [srcq+48 ]
%ifdef m8
    movaps m5, [srcq+64 ]
    movaps m6, [srcq+80 ]
    movaps m7, [srcq+96 ]
    movaps m8, [srcq+112]
%endif
    mulps m1, m0
    mulps m2, m0
    mulps m3, m0
    mulps m4, m0
%ifdef m8
    mulps m5, m0
    mulps m6, m0
    mulps m7, m0
    mulps m8, m0
%endif
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    cvtps2dq m4, m4
%ifdef m8
    cvtps2dq m5, m5
    cvtps2dq m6, m6
    cvtps2dq m7, m7
    cvtps2dq m8, m8
%endif
    movdqa [dstq    ], m1
    movdqa [dstq+16 ], m2
    movdqa [dstq+32 ], m3
    movdqa [dstq+48 ], m4
%ifdef m8
    movdqa [dstq+64 ], m5
    movdqa [dstq+80 ], m6
    movdqa [dstq+96 ], m7
    movdqa [dstq+112], m8
    add srcq, 128
    add dstq, 128
    sub lenq, 32
%else
    add srcq, 64
    add dstq, 64
    sub lenq, 16
%endif
    ja .loop
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------

%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps %2, %1
    paddd %1, %2
    pshufd %2, %1, 0x1
    paddd %1, %2
%endmacro

INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    movdqa m0, [mant_cntq      ]
    movdqa m1, [mant_cntq+ 1*16]
    paddw m0, [mant_cntq+ 2*16]
    paddw m1, [mant_cntq+ 3*16]
    paddw m0, [mant_cntq+ 4*16]
    paddw m1, [mant_cntq+ 5*16]
    paddw m0, [mant_cntq+ 6*16]
    paddw m1, [mant_cntq+ 7*16]
    paddw m0, [mant_cntq+ 8*16]
    paddw m1, [mant_cntq+ 9*16]
    paddw m0, [mant_cntq+10*16]
    paddw m1, [mant_cntq+11*16]
    pmaddwd m0, [ac3_bap_bits   ]
    pmaddwd m1, [ac3_bap_bits+16]
    paddd m0, m1
    PHADDD4 m0, m1
    movd sumd, m0
    movdqa m3, [pw_bap_mul1]
    movhpd m0, [mant_cntq     +2]
    movlpd m0, [mant_cntq+1*32+2]
    movhpd m1, [mant_cntq+2*32+2]
    movlpd m1, [mant_cntq+3*32+2]
    movhpd m2, [mant_cntq+4*32+2]
    movlpd m2, [mant_cntq+5*32+2]
    pmulhuw m0, m3
    pmulhuw m1, m3
    pmulhuw m2, m3
    paddusw m0, m1
    paddusw m0, m2
    pmaddwd m0, [pw_bap_mul2]
    PHADDD4 m0, m1
    movd eax, m0
    add eax, sumd
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------

%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd %1, %1
%else ; src/dst, tmp
    pxor %2, %2
    pcmpgtd %2, %1
    pxor %1, %2
    psubd %1, %2
%endif
%endmacro

%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    add expq, lenq
    lea coefq, [coefq+4*lenq]
    neg lenq
    mova m2, [pd_1]
    mova m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova m0, [coefq+4*lenq]
    ; absolute value
    PABSD m0, m1
    ; convert to float and extract exponents
    pslld m0, 1
    por m0, m2
    cvtdq2ps m1, m0
    psrld m1, 23
    mova m0, m3
    psubd m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw m0, m0
    packuswb m0, m0
    movd [expq+lenq], m0

    add lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb %1, %2
%elif cpuflag(sse2)
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd %1, %1, 0x4E
%elif cpuflag(mmxext)
    pshufw %1, %1, 0x1B
%endif
%endmacro

%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
    ; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
    ; dst = (dst * src) >> 15
    ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
    ; in from the pmullw result.
    mova %3, %1
    pmulhw %1, %2
    pmullw %3, %2
    psrlw %3, 15
    psllw %1, 1
    por %1, %3
%endif
%endmacro

%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    lea offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova m5, [pb_revwords]
    ALIGN 16
%elif %1
    mova m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m0
%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova m3, [windowq+offset2q]
    mova m4, [ inputq+offset2q]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova m4, [ inputq+offsetq]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offsetq], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    mova m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m2
%endif
    add offsetd, mmsize
    sub offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0

INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1
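
Two of the ac3dsp.asm operations above reduce to very small scalar kernels;
the sketches below use hypothetical _ref names and follow the constants in the
file (pf_1_24 is 16777216.0f = 2^24, and the MUL16FIXED comment documents the
Q15 rounding rule). cvtps2dq rounds to nearest, which is why the truncating
3DNow! variant is flagged as not bit-identical:

#include <stdint.h>
#include <math.h>

static void float_to_fixed24_ref(int32_t *dst, const float *src,
                                 unsigned int len)
{
    for (unsigned int i = 0; i < len; i++)
        dst[i] = (int32_t)lrintf(src[i] * 16777216.0f);  /* scale by 2^24 */
}

/* Q15 fixed-point multiply with rounding, as pmulhrsw computes it
   (saturation aside): dst = ((dst * src) + (1 << 14)) >> 15 */
static int16_t mul16_q15_round_ref(int16_t a, int16_t b)
{
    return (int16_t)((a * b + (1 << 14)) >> 15);
}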

externals/ffmpeg/libavcodec/x86/ac3dsp_downmix.asm (vendored executable file, 187 lines added)
@@ -0,0 +1,187 @@
;*****************************************************************************
;* x86-optimized AC-3 downmixing
;* Copyright (c) 2012 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

;******************************************************************************
;* This is based on the channel mixing asm in libavresample, but it is
;* simplified for only float coefficients and only 3 to 6 channels.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; functions to downmix from 3 to 6 channels to mono or stereo
; void ff_ac3_downmix_*(float **samples, float **matrix, int len);
;-----------------------------------------------------------------------------

%macro AC3_DOWNMIX 2 ; %1 = in channels, %2 = out channels
; define some names to make the code clearer
%assign in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1

; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if stereo
%assign needed_mmregs 4
%else
%assign needed_mmregs 3
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
%assign matrix_elements_mm matrix_elements
%endif
%assign total_mmregs needed_mmregs+matrix_elements_mm
%if matrix_elements_mm < matrix_elements
%assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
%assign matrix_elements_stack 0
%endif

cglobal ac3_downmix_%1_to_%2, 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, len, src2, src3, src4, src5

; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov matrix1q, [matrix0q+gprsize]
%endif
    mov matrix0q, [matrix0q]

; define matrix coeff names
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
%if %%i >= matrix_elements_mm
    CAT_XDEFINE mx_stack_0_, %%i, 1
    CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
%else
    CAT_XDEFINE mx_stack_0_, %%i, 0
    CAT_XDEFINE mx_0_, %%i, m %+ %%j
%assign %%j %%j+1
%endif
%assign %%i %%i+1
%endrep
%if stereo
%assign %%i 0
%rep in_channels
%if in_channels + %%i >= matrix_elements_mm
    CAT_XDEFINE mx_stack_1_, %%i, 1
    CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
%else
    CAT_XDEFINE mx_stack_1_, %%i, 0
    CAT_XDEFINE mx_1_, %%i, m %+ %%j
%assign %%j %%j+1
%endif
%assign %%i %%i+1
%endrep
%endif

; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
%if mx_stack_0_ %+ %%i
    VBROADCASTSS m0, [matrix0q+4*%%i]
    mova mx_0_ %+ %%i, m0
%else
    VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
%endif
%if stereo
%if mx_stack_1_ %+ %%i
    VBROADCASTSS m0, [matrix1q+4*%%i]
    mova mx_1_ %+ %%i, m0
%else
    VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
%endif
%endif
%assign %%i %%i+1
%endrep

    lea lenq, [4*r2d]
; load channel pointers to registers
%assign %%i 1
%rep (in_channels - 1)
    mov src %+ %%i %+ q, [src0q+%%i*gprsize]
    add src %+ %%i %+ q, lenq
%assign %%i %%i+1
%endrep
    mov src0q, [src0q]
    add src0q, lenq
    neg lenq
.loop:
%if stereo || mx_stack_0_0
    mova m0, [src0q+lenq]
%endif
%if stereo
    mulps m1, m0, mx_1_0
%endif
%if stereo || mx_stack_0_0
    mulps m0, m0, mx_0_0
%else
    mulps m0, mx_0_0, [src0q+lenq]
%endif
%assign %%i 1
%rep (in_channels - 1)
%define src_ptr src %+ %%i %+ q
    ; avoid extra load for mono if matrix is in a mm register
%if stereo || mx_stack_0_ %+ %%i
    mova m2, [src_ptr+lenq]
%endif
%if stereo
    FMULADD_PS m1, m2, mx_1_ %+ %%i, m1, m3
%endif
%if stereo || mx_stack_0_ %+ %%i
    FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m2
%else
    FMULADD_PS m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
%endif
%assign %%i %%i+1
%endrep
    mova [src0q+lenq], m0
%if stereo
    mova [src1q+lenq], m1
%endif

    add lenq, mmsize
    jl .loop
    RET
%endmacro

%macro AC3_DOWNMIX_FUNCS 0
%assign %%i 3
%rep 4
INIT_XMM sse
    AC3_DOWNMIX %%i, 1
    AC3_DOWNMIX %%i, 2
INIT_YMM avx
    AC3_DOWNMIX %%i, 1
    AC3_DOWNMIX %%i, 2
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
    AC3_DOWNMIX %%i, 1
    AC3_DOWNMIX %%i, 2
%endif
%assign %%i %%i+1
%endrep
%endmacro

AC3_DOWNMIX_FUNCS
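
The generated downmix functions amount to a per-sample dot product of the
input channels with one (mono) or two (stereo) rows of the mixing matrix,
written back in place over the first output channel(s). A hypothetical scalar
sketch of the same contract:

static void ac3_downmix_ref(float **samples, float **matrix,
                            int in_ch, int out_ch, int len)
{
    for (int i = 0; i < len; i++) {
        float v0 = 0.0f, v1 = 0.0f;
        for (int c = 0; c < in_ch; c++) {
            v0 += samples[c][i] * matrix[0][c];   /* mono / left row */
            if (out_ch == 2)
                v1 += samples[c][i] * matrix[1][c];   /* right row */
        }
        samples[0][i] = v0;        /* in-place, like mova [src0q+lenq], m0 */
        if (out_ch == 2)
            samples[1][i] = v1;
    }
}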
164
externals/ffmpeg/libavcodec/x86/ac3dsp_init.c
vendored
Executable file
164
externals/ffmpeg/libavcodec/x86/ac3dsp_init.c
vendored
Executable file
@@ -0,0 +1,164 @@
|
||||
/*
 * x86-optimized AC-3 DSP functions
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"

void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);

void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
        c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
        c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
    }
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        if (!bit_exact) {
            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
        }
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
        if (bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_mmxext;
        } else {
            c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
        }
    }
    if (EXTERNAL_SSE(cpu_flags)) {
        c->float_to_fixed24 = ff_float_to_fixed24_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
        c->extract_exponents = ff_ac3_extract_exponents_sse2;
        if (bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_sse2;
        }
    }

    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
        c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
        c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
        if (!bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_round_sse2;
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
        if (cpu_flags & AV_CPU_FLAG_ATOM) {
            c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
        } else {
            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
            c->apply_window_int16 = ff_apply_window_int16_ssse3;
        }
    }
}

#define DOWNMIX_FUNC_OPT(ch, opt)                                       \
void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples,            \
                                            float **matrix, int len);   \
void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples,            \
                                            float **matrix, int len);

#define DOWNMIX_FUNCS(opt)   \
    DOWNMIX_FUNC_OPT(3, opt) \
    DOWNMIX_FUNC_OPT(4, opt) \
    DOWNMIX_FUNC_OPT(5, opt) \
    DOWNMIX_FUNC_OPT(6, opt)

DOWNMIX_FUNCS(sse)
DOWNMIX_FUNCS(avx)
DOWNMIX_FUNCS(fma3)

void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

#define SET_DOWNMIX(ch, suf, SUF)                                       \
    if (ch == c->in_channels) {                                         \
        if (EXTERNAL_ ## SUF (cpu_flags)) {                             \
            if (c->out_channels == 1)                                   \
                c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf;    \
            else                                                        \
                c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf;    \
        }                                                               \
    }

#define SET_DOWNMIX_ALL(suf, SUF)   \
    SET_DOWNMIX(3, suf, SUF)        \
    SET_DOWNMIX(4, suf, SUF)        \
    SET_DOWNMIX(5, suf, SUF)        \
    SET_DOWNMIX(6, suf, SUF)

    SET_DOWNMIX_ALL(sse, SSE)
    if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
        SET_DOWNMIX_ALL(avx, AVX)
        SET_DOWNMIX_ALL(fma3, FMA3)
    }
}
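Most of the hooks selected above have one-line scalar equivalents. As an example, a sketch of what the float_to_fixed24 implementations compute, assuming the usual 9.24 fixed-point convention (illustrative, not FFmpeg's exact C fallback):

#include <math.h>
#include <stdint.h>

/* Convert floats in roughly [-1.0, 1.0) to 9.24 fixed point:
 * scale by 2^24 and round to nearest (assumption labeled above). */
static void float_to_fixed24_ref(int32_t *dst, const float *src,
                                 unsigned int len)
{
    for (unsigned int i = 0; i < len; i++)
        dst[i] = (int32_t)lrintf(src[i] * (1 << 24));
}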
133
externals/ffmpeg/libavcodec/x86/alacdsp.asm
vendored
Executable file
@@ -0,0 +1,133 @@
;******************************************************************************
;* ALAC DSP SIMD optimizations
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

INIT_XMM sse4
%if ARCH_X86_64
cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
%else
cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
%define buf1q r2q
%endif
    movd    m6, shiftm
    movd    m7, weightm
    SPLATD  m7
    shl     lend, 2
    mov     buf1q, [buf0q + gprsize]
    mov     buf0q, [buf0q]
    add     buf1q, lenq
    add     buf0q, lenq
    neg     lenq

align 16
.loop:
    mova    m0, [buf0q + lenq]
    mova    m1, [buf0q + lenq + mmsize]
    mova    m2, [buf1q + lenq]
    mova    m3, [buf1q + lenq + mmsize]
    pmulld  m4, m2, m7
    pmulld  m5, m3, m7
    psrad   m4, m6
    psrad   m5, m6
    psubd   m0, m4
    psubd   m1, m5
    paddd   m2, m0
    paddd   m3, m1
    mova    [buf1q + lenq], m0
    mova    [buf1q + lenq + mmsize], m1
    mova    [buf0q + lenq], m2
    mova    [buf0q + lenq + mmsize], m3

    add     lenq, mmsize*2
    jl .loop
    RET

INIT_XMM sse2
cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
    movifnidn lend, lenm
    movd    m4, r2m ; exbits
    shl     lend, 2
    mov     buf1q, [buf0q + gprsize]
    mov     buf0q, [buf0q]
    mov     exbuf1q, [exbuf0q + gprsize]
    mov     exbuf0q, [exbuf0q]
    add     buf1q, lenq
    add     buf0q, lenq
    add     exbuf1q, lenq
    add     exbuf0q, lenq
    neg     lenq

align 16
.loop:
    mova    m0, [buf0q + lenq]
    mova    m1, [buf0q + lenq + mmsize]
    pslld   m0, m4
    pslld   m1, m4
    mova    m2, [buf1q + lenq]
    mova    m3, [buf1q + lenq + mmsize]
    pslld   m2, m4
    pslld   m3, m4
    por     m0, [exbuf0q + lenq]
    por     m1, [exbuf0q + lenq + mmsize]
    por     m2, [exbuf1q + lenq]
    por     m3, [exbuf1q + lenq + mmsize]
    mova    [buf0q + lenq         ], m0
    mova    [buf0q + lenq + mmsize], m1
    mova    [buf1q + lenq         ], m2
    mova    [buf1q + lenq + mmsize], m3

    add     lenq, mmsize*2
    jl .loop
    REP_RET

%if ARCH_X86_64
cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
%else
cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
%define exbitsm r2m
%endif
    movifnidn lend, r4m
    movd    m2, exbitsm
    shl     lend, 2
    mov     bufq, [bufq]
    mov     exbufq, [exbufq]
    add     bufq, lenq
    add     exbufq, lenq
    neg     lenq

align 16
.loop:
    mova    m0, [bufq + lenq]
    mova    m1, [bufq + lenq + mmsize]
    pslld   m0, m2
    pslld   m1, m2
    por     m0, [exbufq + lenq]
    por     m1, [exbufq + lenq + mmsize]
    mova    [bufq + lenq], m0
    mova    [bufq + lenq + mmsize], m1

    add     lenq, mmsize*2
    jl .loop
    REP_RET
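The decorrelate kernel above is easier to follow in scalar form. A sketch of the equivalent per-sample computation (assumed equivalent to the asm; not FFmpeg's exact C fallback), noting that the asm stores its two results swapped between the channel buffers:

#include <stdint.h>

/* Undo ALAC's weighted stereo prediction: a = L - ((R*weight)>>shift),
 * then R' = a goes to buffer[1] and L' = R + a goes to buffer[0]. */
static void alac_decorrelate_stereo_ref(int32_t *buffer[2], int nb_samples,
                                        int shift, int weight)
{
    for (int i = 0; i < nb_samples; i++) {
        int32_t a = buffer[0][i];
        int32_t b = buffer[1][i];

        a -= (b * weight) >> shift;  /* psubd of the scaled channel */
        b += a;                      /* paddd */
        buffer[0][i] = b;            /* asm stores m2 back to buf0 */
        buffer[1][i] = a;            /* and m0 back to buf1 */
    }
}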
44
externals/ffmpeg/libavcodec/x86/alacdsp_init.c
vendored
Executable file
@@ -0,0 +1,44 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/alacdsp.h"
#include "config.h"

void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
                                     int decorr_shift, int decorr_left_weight);
void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
                                           int extra_bits, int channels, int nb_samples);
void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
                                         int extra_bits, int channels, int nb_samples);

av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
        c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        c->decorrelate_stereo = ff_alac_decorrelate_stereo_sse4;
    }
#endif /* HAVE_X86ASM */
}
174
externals/ffmpeg/libavcodec/x86/audiodsp.asm
vendored
Executable file
@@ -0,0 +1,174 @@
;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    add      orderd, orderd
    add      v1q, orderq
    add      v2q, orderq
    neg      orderq
    pxor     m2, m2
.loop:
    movu     m0, [v1q + orderq]
    movu     m1, [v1q + orderq + mmsize]
    pmaddwd  m0, [v2q + orderq]
    pmaddwd  m1, [v2q + orderq + mmsize]
    paddd    m2, m0
    paddd    m2, m1
    add      orderq, mmsize*2
    jl .loop
    HADDD    m2, m0
    movd     eax, m2
%if mmsize == 8
    emms
%endif
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT


;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd     m4, minm
    movd     m5, maxm
%endif
    SPLATD   m4
    SPLATD   m5
.loop:
%assign %%i 0
%rep %2
    mova     m0, [srcq + mmsize * (0 + %%i)]
    mova     m1, [srcq + mmsize * (1 + %%i)]
    mova     m2, [srcq + mmsize * (2 + %%i)]
    mova     m3, [srcq + mmsize * (3 + %%i)]
%if %3
    mova     m7, [srcq + mmsize * (4 + %%i)]
    mova     m8, [srcq + mmsize * (5 + %%i)]
    mova     m9, [srcq + mmsize * (6 + %%i)]
    mova     m10, [srcq + mmsize * (7 + %%i)]
%endif
    CLIPD    m0, m4, m5, m6
    CLIPD    m1, m4, m5, m6
    CLIPD    m2, m4, m5, m6
    CLIPD    m3, m4, m5, m6
%if %3
    CLIPD    m7, m4, m5, m6
    CLIPD    m8, m4, m5, m6
    CLIPD    m9, m4, m5, m6
    CLIPD    m10, m4, m5, m6
%endif
    mova     [dstq + mmsize * (0 + %%i)], m0
    mova     [dstq + mmsize * (1 + %%i)], m1
    mova     [dstq + mmsize * (2 + %%i)], m2
    mova     [dstq + mmsize * (3 + %%i)], m3
%if %3
    mova     [dstq + mmsize * (4 + %%i)], m7
    mova     [dstq + mmsize * (5 + %%i)], m8
    mova     [dstq + mmsize * (6 + %%i)], m9
    mova     [dstq + mmsize * (7 + %%i)], m10
%endif
%assign %%i (%%i + 4 * (1 + %3))
%endrep
    add      srcq, mmsize*4*(%2+%3)
    add      dstq, mmsize*4*(%2+%3)
    sub      lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; void ff_vector_clipf_sse(float *dst, const float *src,
;                          int len, float min, float max)
INIT_XMM sse
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
%if ARCH_X86_32
    VBROADCASTSS m0, minm
    VBROADCASTSS m1, maxm
%elif WIN64
    SWAP 0, 3
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, maxm
%else ; 64bit sysv
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, m1
%endif

    movsxdifnidn lenq, lend

.loop:
    mova  m2, [srcq + 4 * lenq - 4 * mmsize]
    mova  m3, [srcq + 4 * lenq - 3 * mmsize]
    mova  m4, [srcq + 4 * lenq - 2 * mmsize]
    mova  m5, [srcq + 4 * lenq - 1 * mmsize]

    maxps m2, m0
    maxps m3, m0
    maxps m4, m0
    maxps m5, m0

    minps m2, m1
    minps m3, m1
    minps m4, m1
    minps m5, m1

    mova  [dstq + 4 * lenq - 4 * mmsize], m2
    mova  [dstq + 4 * lenq - 3 * mmsize], m3
    mova  [dstq + 4 * lenq - 2 * mmsize], m4
    mova  [dstq + 4 * lenq - 1 * mmsize], m5

    sub   lenq, mmsize
    jg .loop

    RET
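The SCALARPRODUCT macro above is the SIMD form of a plain dot product over signed 16-bit samples; pmaddwd does the multiply-accumulate in pairs and HADDD collapses the vector accumulator. A scalar reference (a sketch; FFmpeg's generic C version lives elsewhere in audiodsp):

#include <stdint.h>

/* Sum of v1[i]*v2[i] for i in [0, order). */
static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order)
{
    int32_t res = 0;
    while (order--)
        res += *v1++ * *v2++;
    return res;
}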
66
externals/ffmpeg/libavcodec/x86/audiodsp_init.c
vendored
Executable file
@@ -0,0 +1,66 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/audiodsp.h"

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);

void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                              int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
void ff_vector_clipf_sse(float *dst, const float *src,
                         int len, float min, float max);

av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags))
        c->vector_clip_int32 = ff_vector_clip_int32_mmx;

    if (EXTERNAL_MMXEXT(cpu_flags))
        c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;

    if (EXTERNAL_SSE(cpu_flags))
        c->vector_clipf = ff_vector_clipf_sse;

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
        if (cpu_flags & AV_CPU_FLAG_ATOM)
            c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
        else
            c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }

    if (EXTERNAL_SSE4(cpu_flags))
        c->vector_clip_int32 = ff_vector_clip_int32_sse4;
}
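All of the vector_clip_int32 variants selected above implement the same contract. A scalar sketch of it (assumed equivalent; the helper name is illustrative):

#include <stdint.h>

/* Clamp each 32-bit sample into [min, max] while copying src to dst. */
static void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
                                  int32_t min, int32_t max, unsigned int len)
{
    for (unsigned int i = 0; i < len; i++) {
        int32_t v = src[i];
        dst[i] = v < min ? min : v > max ? max : v;
    }
}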
88
externals/ffmpeg/libavcodec/x86/blockdsp.asm
vendored
Executable file
@@ -0,0 +1,88 @@
;******************************************************************************
;* SIMD-optimized clear block functions
;* Copyright (c) 2002 Michael Niedermayer
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2009 Fiona Glaser
;*
;* AVX version by Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;----------------------------------------
; void ff_clear_block(int16_t *blocks);
;----------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline store loops
%macro CLEAR_BLOCK 2
cglobal clear_block, 1, 1, %1, blocks
    ZERO m0, m0, m0
%assign %%i 0
%rep %2
    mova [blocksq+mmsize*(0+%%i)], m0
    mova [blocksq+mmsize*(1+%%i)], m0
    mova [blocksq+mmsize*(2+%%i)], m0
    mova [blocksq+mmsize*(3+%%i)], m0
%assign %%i %%i+4
%endrep
    RET
%endmacro

INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCK 0, 4
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCK 1, 2
INIT_YMM avx
CLEAR_BLOCK 1, 1

;-----------------------------------------
; void ff_clear_blocks(int16_t *blocks);
;-----------------------------------------
; %1 = number of xmm registers used
%macro CLEAR_BLOCKS 1
cglobal clear_blocks, 1, 2, %1, blocks, len
    add  blocksq, 768
    mov  lenq, -768
    ZERO m0, m0, m0
.loop:
    mova [blocksq+lenq+mmsize*0], m0
    mova [blocksq+lenq+mmsize*1], m0
    mova [blocksq+lenq+mmsize*2], m0
    mova [blocksq+lenq+mmsize*3], m0
    mova [blocksq+lenq+mmsize*4], m0
    mova [blocksq+lenq+mmsize*5], m0
    mova [blocksq+lenq+mmsize*6], m0
    mova [blocksq+lenq+mmsize*7], m0
    add  lenq, mmsize*8
    js .loop
    RET
%endmacro

INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCKS 0
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1
INIT_YMM avx
CLEAR_BLOCKS 1
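Functionally, both macros above are just wide zero-fills; the constant 768 is six 64-coefficient int16_t blocks (6 * 64 * 2 bytes). A scalar sketch of what they compute (illustrative helper names, not FFmpeg's C fallback):

#include <stdint.h>
#include <string.h>

/* Zero one 8x8 block of 16-bit coefficients. */
static void clear_block_ref(int16_t *block)
{
    memset(block, 0, 64 * sizeof(int16_t));
}

/* Zero six consecutive blocks (768 bytes), matching the asm loop bounds. */
static void clear_blocks_ref(int16_t *blocks)
{
    memset(blocks, 0, 768);
}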
60
externals/ffmpeg/libavcodec/x86/blockdsp_init.c
vendored
Executable file
@@ -0,0 +1,60 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/blockdsp.h"
#include "libavcodec/version.h"

void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_block_avx(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_clear_blocks_avx(int16_t *blocks);

av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
                                  AVCodecContext *avctx)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->clear_block  = ff_clear_block_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
    }

    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
        return;

    if (EXTERNAL_SSE(cpu_flags)) {
        c->clear_block  = ff_clear_block_sse;
        c->clear_blocks = ff_clear_blocks_sse;
    }
    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        c->clear_block  = ff_clear_block_avx;
        c->clear_blocks = ff_clear_blocks_avx;
    }
#endif /* HAVE_X86ASM */
}
159
externals/ffmpeg/libavcodec/x86/bswapdsp.asm
vendored
Executable file
@@ -0,0 +1,159 @@
;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

cextern pb_80

SECTION .text

; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov      r3d, r2d
    sar      r2d, 3
    jz       .left4_%1
%if cpuflag(avx2)
    sar      r2d, 1
    jz       .left8_%1
%endif
.loop8_%1:
    mov%1    m0, [r1 +      0]
    mov%1    m1, [r1 + mmsize]
%if cpuflag(ssse3)||cpuflag(avx2)
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +      0], m0
    mov%1    [r0 + mmsize], m1
%else
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, mmsize*2
    add      r1, mmsize*2
    dec      r2d
    jnz      .loop8_%1
%if cpuflag(avx2)
.left8_%1:
    mov      r2d, r3d
    test     r3d, 8
    jz       .left4_%1
    mov%1    m0, [r1]
    pshufb   m0, m2
    mov%1    [r0 + 0], m0
    add      r1, mmsize
    add      r0, mmsize
%endif
.left4_%1:
    mov      r2d, r3d
    test     r3d, 4
    jz       .left
    mov%1    xm0, [r1]
%if cpuflag(ssse3)
    pshufb   xm0, xm2
    mov%1    [r0], xm0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    VBROADCASTI128 m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    or       r3, r0
    test     r3, mmsize - 1
    jz       .start_align
    BSWAP_LOOPS u
    jmp      .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    test     r2d, 2
    jz       .left1
    movq     xm0, [r1]
    pshufb   xm0, xm2
    movq     [r0], xm0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BSWAP32_BUF
%endif
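The SSSE3/AVX2 paths above byte-swap sixteen (or thirty-two) bytes at a time with a single pshufb against the pb_bswap32 shuffle mask; the SSE2 path builds the same result out of word shuffles and shift-or pairs. A scalar sketch of the contract (illustrative helper name):

#include <stdint.h>

/* Byte-swap w 32-bit words from src into dst. */
static void bswap_buf_ref(uint32_t *dst, const uint32_t *src, int w)
{
    for (int i = 0; i < w; i++) {
        uint32_t v = src[i];
        dst[i] = (v >> 24) | ((v >> 8) & 0x0000ff00) |
                 ((v & 0x0000ff00) << 8) | (v << 24);
    }
}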
40
externals/ffmpeg/libavcodec/x86/bswapdsp_init.c
vendored
Executable file
@@ -0,0 +1,40 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/bswapdsp.h"

void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);

av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags))
        c->bswap_buf = ff_bswap32_buf_sse2;
    if (EXTERNAL_SSSE3(cpu_flags))
        c->bswap_buf = ff_bswap32_buf_ssse3;
    if (EXTERNAL_AVX2_FAST(cpu_flags))
        c->bswap_buf = ff_bswap32_buf_avx2;
}
301
externals/ffmpeg/libavcodec/x86/cabac.h
vendored
Executable file
@@ -0,0 +1,301 @@
/*
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_CABAC_H
#define AVCODEC_X86_CABAC_H

#include "libavcodec/cabac.h"
#include "libavutil/attributes.h"
#include "libavutil/macros.h"
#include "libavutil/x86/asm.h"
#include "config.h"

#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
    || ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
    || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
#   define BROKEN_COMPILER 1
#else
#   define BROKEN_COMPILER 0
#endif

#if HAVE_INLINE_ASM

#ifndef UNCHECKED_BITSTREAM_READER
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
#endif

#if UNCHECKED_BITSTREAM_READER
#define END_CHECK(end) ""
#else
#define END_CHECK(end) \
    "cmp "end" , %%"FF_REG_c"       \n\t"\
    "jge 1f                         \n\t"
#endif

#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)

#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
    "cmp    "low"  , "tmp"          \n\t"\
    "cmova  %%ecx  , "range"        \n\t"\
    "sbb    %%rcx  , %%rcx          \n\t"\
    "and    %%ecx  , "tmp"          \n\t"\
    "xor    %%rcx  , "retq"         \n\t"\
    "sub    "tmp"  , "low"          \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
    /* P4 Prescott has crappy cmov,sbb,64-bit shift so avoid them */ \
    "sub    "low"  , "tmp"          \n\t"\
    "sar    $31    , "tmp"          \n\t"\
    "sub    %%ecx  , "range"        \n\t"\
    "and    "tmp"  , "range"        \n\t"\
    "add    %%ecx  , "range"        \n\t"\
    "shl    $17    , %%ecx          \n\t"\
    "and    "tmp"  , %%ecx          \n\t"\
    "sub    %%ecx  , "low"          \n\t"\
    "xor    "tmp"  , "ret"          \n\t"\
    "movslq "ret"  , "retq"         \n\t"
#endif /* HAVE_FAST_CMOV */

#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
    "movzbl "statep"    , "ret"                     \n\t"\
    "mov    "range"     , "tmp"                     \n\t"\
    "and    $0xC0       , "range"                   \n\t"\
    "lea    ("ret", "range", 2), %%ecx              \n\t"\
    "movzbl "lps_off"("tables", %%rcx), "range"     \n\t"\
    "sub    "range"     , "tmp"                     \n\t"\
    "mov    "tmp"       , %%ecx                     \n\t"\
    "shl    $17         , "tmp"                     \n\t"\
    BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
    "movzbl "norm_off"("tables", "rangeq"), %%ecx   \n\t"\
    "shl    %%cl        , "range"                   \n\t"\
    "movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
    "shl    %%cl        , "low"                     \n\t"\
    "mov    "tmpbyte"   , "statep"                  \n\t"\
    "test   "lowword"   , "lowword"                 \n\t"\
    "jnz    2f                                      \n\t"\
    "mov    "byte"      , %%"FF_REG_c"              \n\t"\
    END_CHECK(end)\
    "add"FF_OPSIZE" $2  , "byte"                    \n\t"\
    "1:                                             \n\t"\
    "movzwl (%%"FF_REG_c") , "tmp"                  \n\t"\
    "lea    -1("low")   , %%ecx                     \n\t"\
    "xor    "low"       , %%ecx                     \n\t"\
    "shr    $15         , %%ecx                     \n\t"\
    "bswap  "tmp"                                   \n\t"\
    "shr    $15         , "tmp"                     \n\t"\
    "movzbl "norm_off"("tables", %%rcx), %%ecx      \n\t"\
    "sub    $0xFFFF     , "tmp"                     \n\t"\
    "neg    %%ecx                                   \n\t"\
    "add    $7          , %%ecx                     \n\t"\
    "shl    %%cl        , "tmp"                     \n\t"\
    "add    "tmp"       , "low"                     \n\t"\
    "2:                                             \n\t"

#else /* BROKEN_RELOCATIONS */
#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
#define RIP_ARG

#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
    "mov    "tmp"  , %%ecx          \n\t"\
    "shl    $17    , "tmp"          \n\t"\
    "cmp    "low"  , "tmp"          \n\t"\
    "cmova  %%ecx  , "range"        \n\t"\
    "sbb    %%ecx  , %%ecx          \n\t"\
    "and    %%ecx  , "tmp"          \n\t"\
    "xor    %%ecx  , "ret"          \n\t"\
    "sub    "tmp"  , "low"          \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
    "mov    "tmp"  , %%ecx          \n\t"\
    "shl    $17    , "tmp"          \n\t"\
    "sub    "low"  , "tmp"          \n\t"\
    "sar    $31    , "tmp"          \n\t" /*lps_mask*/\
    "sub    %%ecx  , "range"        \n\t" /*RangeLPS - range*/\
    "and    "tmp"  , "range"        \n\t" /*(RangeLPS - range)&lps_mask*/\
    "add    %%ecx  , "range"        \n\t" /*new range*/\
    "shl    $17    , %%ecx          \n\t"\
    "and    "tmp"  , %%ecx          \n\t"\
    "sub    %%ecx  , "low"          \n\t"\
    "xor    "tmp"  , "ret"          \n\t"
#endif /* HAVE_FAST_CMOV */

#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
    "movzbl "statep"    , "ret"                                                  \n\t"\
    "mov    "range"     , "tmp"                                                  \n\t"\
    "and    $0xC0       , "range"                                                \n\t"\
    "movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
    "sub    "range"     , "tmp"                                                  \n\t"\
    BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
    "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx            \n\t"\
    "shl    %%cl        , "range"                                                \n\t"\
    "movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp"          \n\t"\
    "shl    %%cl        , "low"                                                  \n\t"\
    "mov    "tmpbyte"   , "statep"                                               \n\t"\
    "test   "lowword"   , "lowword"                                              \n\t"\
    " jnz   2f                                                                   \n\t"\
    "mov    "byte"      , %%"FF_REG_c"                                           \n\t"\
    END_CHECK(end)\
    "add"FF_OPSIZE" $2  , "byte"                                                 \n\t"\
    "1:                                                                          \n\t"\
    "movzwl (%%"FF_REG_c") , "tmp"                                               \n\t"\
    "lea    -1("low")   , %%ecx                                                  \n\t"\
    "xor    "low"       , %%ecx                                                  \n\t"\
    "shr    $15         , %%ecx                                                  \n\t"\
    "bswap  "tmp"                                                                \n\t"\
    "shr    $15         , "tmp"                                                  \n\t"\
    "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx              \n\t"\
    "sub    $0xFFFF     , "tmp"                                                  \n\t"\
    "neg    %%ecx                                                                \n\t"\
    "add    $7          , %%ecx                                                  \n\t"\
    "shl    %%cl        , "tmp"                                                  \n\t"\
    "add    "tmp"       , "low"                                                  \n\t"\
    "2:                                                                          \n\t"

#endif /* BROKEN_RELOCATIONS */

#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                 uint8_t *const state)
{
    int bit, tmp;
#ifdef BROKEN_RELOCATIONS
    void *tables;

    __asm__ volatile(
        "lea "MANGLE(ff_h264_cabac_tables)", %0  \n\t"
        : "=&r"(tables)
        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
    );
#endif

    __asm__ volatile(
        BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
                             "%2", "%q2", "%3", "%b3",
                             "%c6(%5)", "%c7(%5)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%8")
        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
        : "r"(state), "r"(c),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end))
          TABLES_ARG
          ,"1"(c->low), "2"(c->range)
        : "%"FF_REG_c, "memory"
    );
    return bit & 1;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */

#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
    x86_reg tmp;
    __asm__ volatile(
        "movl %c6(%2), %k1                      \n\t"
        "movl %c3(%2), %%eax                    \n\t"
        "shl $17, %k1                           \n\t"
        "add %%eax, %%eax                       \n\t"
        "sub %k1, %%eax                         \n\t"
        "cdq                                    \n\t"
        "and %%edx, %k1                         \n\t"
        "add %k1, %%eax                         \n\t"
        "xor %%edx, %%ecx                       \n\t"
        "sub %%edx, %%ecx                       \n\t"
        "test %%ax, %%ax                        \n\t"
        "jnz 1f                                 \n\t"
        "mov  %c4(%2), %1                       \n\t"
        "subl $0xFFFF, %%eax                    \n\t"
        "movzwl (%1), %%edx                     \n\t"
        "bswap  %%edx                           \n\t"
        "shrl $15, %%edx                        \n\t"
#if UNCHECKED_BITSTREAM_READER
        "add  $2, %1                            \n\t"
        "addl %%edx, %%eax                      \n\t"
        "mov  %1, %c4(%2)                       \n\t"
#else
        "addl %%edx, %%eax                      \n\t"
        "cmp %c5(%2), %1                        \n\t"
        "jge 1f                                 \n\t"
        "add"FF_OPSIZE" $2, %c4(%2)             \n\t"
#endif
        "1:                                     \n\t"
        "movl %%eax, %c3(%2)                    \n\t"

        : "+c"(val), "=&r"(tmp)
        : "r"(c),
          "i"(offsetof(CABACContext, low)),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end)),
          "i"(offsetof(CABACContext, range))
        : "%eax", "%edx", "memory"
    );
    return val;
}

#define get_cabac_bypass get_cabac_bypass_x86
static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
{
    x86_reg tmp;
    int res;
    __asm__ volatile(
        "movl %c6(%2), %k1                      \n\t"
        "movl %c3(%2), %%eax                    \n\t"
        "shl $17, %k1                           \n\t"
        "add %%eax, %%eax                       \n\t"
        "sub %k1, %%eax                         \n\t"
        "cdq                                    \n\t"
        "and %%edx, %k1                         \n\t"
        "add %k1, %%eax                         \n\t"
        "inc %%edx                              \n\t"
        "test %%ax, %%ax                        \n\t"
        "jnz 1f                                 \n\t"
        "mov  %c4(%2), %1                       \n\t"
        "subl $0xFFFF, %%eax                    \n\t"
        "movzwl (%1), %%ecx                     \n\t"
        "bswap  %%ecx                           \n\t"
        "shrl $15, %%ecx                        \n\t"
        "addl %%ecx, %%eax                      \n\t"
        "cmp %c5(%2), %1                        \n\t"
        "jge 1f                                 \n\t"
        "add"FF_OPSIZE" $2, %c4(%2)             \n\t"
        "1:                                     \n\t"
        "movl %%eax, %c3(%2)                    \n\t"

        : "=&d"(res), "=&r"(tmp)
        : "r"(c),
          "i"(offsetof(CABACContext, low)),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end)),
          "i"(offsetof(CABACContext, range))
        : "%eax", "%ecx", "memory"
    );
    return res;
}
#endif /* !BROKEN_COMPILER */

#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
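The branchless trick in BRANCHLESS_GET_CABAC_UPDATE is easier to see in C. The following sketch is modeled on FFmpeg's generic get_cabac_inline (table offsets assumed, bytestream refill elided): instead of branching on MPS vs LPS, a sign mask derived from the comparison selects both the new range and the state transition.

/* Simplified sketch of one branchless CABAC bit decode; not a drop-in
 * replacement (refill and exact table layout come from libavcodec/cabac). */
static int get_cabac_sketch(CABACContext *c, uint8_t *state)
{
    int s         = *state;
    int range_lps = ff_h264_cabac_tables[H264_LPS_RANGE_OFFSET +
                                         2 * (c->range & 0xC0) + s];
    int lps_mask, shift;

    c->range -= range_lps;
    /* lps_mask is -1 when the LPS path is taken, 0 otherwise */
    lps_mask  = ((c->range << 17) - c->low) >> 31;
    c->low   -= (c->range << 17) & lps_mask;
    c->range += (range_lps - c->range) & lps_mask;

    s ^= lps_mask;
    *state = ff_h264_cabac_tables[H264_MLPS_STATE_OFFSET + 128 + s];

    /* renormalize via table lookup; the asm then refills c->low from the
     * bytestream when its low 16 bits hit zero (elided here) */
    shift     = ff_h264_cabac_tables[H264_NORM_SHIFT_OFFSET + c->range];
    c->range <<= shift;
    c->low   <<= shift;
    return s & 1;
}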
463
externals/ffmpeg/libavcodec/x86/cavsdsp.c
vendored
Executable file
@@ -0,0 +1,463 @@
/*
|
||||
* Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
|
||||
* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
|
||||
*
|
||||
* MMX-optimized DSP functions, based on H.264 optimizations by
|
||||
* Michael Niedermayer and Loren Merritt
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/common.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/asm.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/cavsdsp.h"
|
||||
#include "libavcodec/idctdsp.h"
|
||||
#include "constants.h"
|
||||
#include "fpel.h"
|
||||
#include "idctdsp.h"
|
||||
#include "config.h"
|
||||
|
||||
|
||||
#if HAVE_MMX_EXTERNAL
|
||||
|
||||
void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);
|
||||
|
||||
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
|
||||
{
|
||||
LOCAL_ALIGNED(16, int16_t, b2, [64]);
|
||||
ff_cavs_idct8_mmx(b2, block);
|
||||
ff_add_pixels_clamped_mmx(b2, dst, stride);
|
||||
}
|
||||
|
||||
void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
|
||||
|
||||
static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
|
||||
{
|
||||
LOCAL_ALIGNED(16, int16_t, b2, [64]);
|
||||
ff_cavs_idct8_sse2(b2, block);
|
||||
ff_add_pixels_clamped_sse2(b2, dst, stride);
|
||||
}
|
||||
|
||||
#endif /* HAVE_MMX_EXTERNAL */
|
||||
|
||||
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* motion compensation
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
/* vertical filter [-1 -2 96 42 -7 0] */
|
||||
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
|
||||
"movd (%0), "#F" \n\t"\
|
||||
"movq "#C", %%mm6 \n\t"\
|
||||
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
|
||||
"movq "#D", %%mm7 \n\t"\
|
||||
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
|
||||
"psllw $3, "#E" \n\t"\
|
||||
"psubw "#E", %%mm6 \n\t"\
|
||||
"psraw $3, "#E" \n\t"\
|
||||
"paddw %%mm7, %%mm6 \n\t"\
|
||||
"paddw "#E", %%mm6 \n\t"\
|
||||
"paddw "#B", "#B" \n\t"\
|
||||
"pxor %%mm7, %%mm7 \n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"punpcklbw %%mm7, "#F" \n\t"\
|
||||
"psubw "#B", %%mm6 \n\t"\
|
||||
"psraw $1, "#B" \n\t"\
|
||||
"psubw "#A", %%mm6 \n\t"\
|
||||
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
|
||||
"psraw $7, %%mm6 \n\t"\
|
||||
"packuswb %%mm6, %%mm6 \n\t"\
|
||||
OP(%%mm6, (%1), A, d) \
|
||||
"add %3, %1 \n\t"
|
||||
|
||||
/* vertical filter [ 0 -1 5 5 -1 0] */
|
||||
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
|
||||
"movd (%0), "#F" \n\t"\
|
||||
"movq "#C", %%mm6 \n\t"\
|
||||
"paddw "#D", %%mm6 \n\t"\
|
||||
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"punpcklbw %%mm7, "#F" \n\t"\
|
||||
"psubw "#B", %%mm6 \n\t"\
|
||||
"psubw "#E", %%mm6 \n\t"\
|
||||
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
|
||||
"psraw $3, %%mm6 \n\t"\
|
||||
"packuswb %%mm6, %%mm6 \n\t"\
|
||||
OP(%%mm6, (%1), A, d) \
|
||||
"add %3, %1 \n\t"
|
||||
|
||||
/* vertical filter [ 0 -7 42 96 -2 -1] */
|
||||
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
|
||||
"movd (%0), "#F" \n\t"\
|
||||
"movq "#C", %%mm6 \n\t"\
|
||||
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
|
||||
"movq "#D", %%mm7 \n\t"\
|
||||
"pmullw "MANGLE(MUL1)", %%mm7\n\t"\
|
||||
"psllw $3, "#B" \n\t"\
|
||||
"psubw "#B", %%mm6 \n\t"\
|
||||
"psraw $3, "#B" \n\t"\
|
||||
"paddw %%mm7, %%mm6 \n\t"\
|
||||
"paddw "#B", %%mm6 \n\t"\
|
||||
"paddw "#E", "#E" \n\t"\
|
||||
"pxor %%mm7, %%mm7 \n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"punpcklbw %%mm7, "#F" \n\t"\
|
||||
"psubw "#E", %%mm6 \n\t"\
|
||||
"psraw $1, "#E" \n\t"\
|
||||
"psubw "#F", %%mm6 \n\t"\
|
||||
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
|
||||
"psraw $7, %%mm6 \n\t"\
|
||||
"packuswb %%mm6, %%mm6 \n\t"\
|
||||
OP(%%mm6, (%1), A, d) \
|
||||
"add %3, %1 \n\t"
|
||||
|
||||
|
||||
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
|
||||
int w= 2;\
|
||||
src -= 2*srcStride;\
|
||||
\
|
||||
while(w--){\
|
||||
__asm__ volatile(\
|
||||
"pxor %%mm7, %%mm7 \n\t"\
|
||||
"movd (%0), %%mm0 \n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"movd (%0), %%mm1 \n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"movd (%0), %%mm2 \n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"movd (%0), %%mm3 \n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"movd (%0), %%mm4 \n\t"\
|
||||
"add %2, %0 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm0 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm1 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm3 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm4 \n\t"\
|
||||
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
|
||||
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
|
||||
: "memory"\
|
||||
);\
|
||||
if(h==16){\
|
||||
__asm__ volatile(\
|
||||
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
|
||||
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
|
||||
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
src += 4-(h+5)*srcStride;\
|
||||
dst += 4-h*dstStride;\
|
||||
}
|
||||
|
||||
#define QPEL_CAVS(OPNAME, OP, MMX)\
|
||||
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
|
||||
{\
|
||||
int h=8;\
|
||||
__asm__ volatile(\
|
||||
"pxor %%mm7, %%mm7 \n\t"\
|
||||
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
|
||||
"1: \n\t"\
|
||||
"movq (%0), %%mm0 \n\t"\
|
||||
"movq 1(%0), %%mm2 \n\t"\
|
||||
"movq %%mm0, %%mm1 \n\t"\
|
||||
"movq %%mm2, %%mm3 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm0 \n\t"\
|
||||
"punpckhbw %%mm7, %%mm1 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"\
|
||||
"punpckhbw %%mm7, %%mm3 \n\t"\
|
||||
"paddw %%mm2, %%mm0 \n\t"\
|
||||
"paddw %%mm3, %%mm1 \n\t"\
|
||||
"pmullw %%mm6, %%mm0 \n\t"\
|
||||
"pmullw %%mm6, %%mm1 \n\t"\
|
||||
"movq -1(%0), %%mm2 \n\t"\
|
||||
"movq 2(%0), %%mm4 \n\t"\
|
||||
"movq %%mm2, %%mm3 \n\t"\
|
||||
"movq %%mm4, %%mm5 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"\
|
||||
"punpckhbw %%mm7, %%mm3 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm4 \n\t"\
|
||||
"punpckhbw %%mm7, %%mm5 \n\t"\
|
||||
"paddw %%mm4, %%mm2 \n\t"\
|
||||
"paddw %%mm3, %%mm5 \n\t"\
|
||||
"psubw %%mm2, %%mm0 \n\t"\
|
||||
"psubw %%mm5, %%mm1 \n\t"\
|
||||
"movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
|
||||
"paddw %%mm5, %%mm0 \n\t"\
|
||||
"paddw %%mm5, %%mm1 \n\t"\
|
||||
"psraw $3, %%mm0 \n\t"\
|
||||
"psraw $3, %%mm1 \n\t"\
|
||||
"packuswb %%mm1, %%mm0 \n\t"\
|
||||
OP(%%mm0, (%1),%%mm5, q) \
|
||||
"add %3, %0 \n\t"\
|
||||
"add %4, %1 \n\t"\
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+m"(h)\
|
||||
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
|
||||
NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
\
|
||||
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
|
||||
{ \
|
||||
QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
|
||||
}\
|
||||
\
|
||||
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
|
||||
{ \
|
||||
QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \
|
||||
}\
|
||||
\
|
||||
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
|
||||
{ \
|
||||
QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
|
||||
{ \
|
||||
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
|
||||
}\
|
||||
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
|
||||
{ \
|
||||
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
|
||||
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
|
||||
{ \
|
||||
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
|
||||
}\
|
||||
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
|
||||
{ \
|
||||
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
|
||||
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
|
||||
{ \
|
||||
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
|
||||
}\
|
||||
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a, b, temp, size)        "mov" #size " " #a ", " #b "    \n\t"
#define AVG_3DNOW_OP(a, b, temp, size) \
    "mov" #size " " #b ", " #temp "     \n\t"\
    "pavgusb " #temp ", " #a "          \n\t"\
    "mov" #size " " #a ", " #b "        \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
    "mov" #size " " #b ", " #temp "     \n\t"\
    "pavgb " #temp ", " #a "            \n\t"\
    "mov" #size " " #a ", " #b "        \n\t"

#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */

#if HAVE_MMX_EXTERNAL
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}

static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}

static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_avg_pixels8_mmxext(dst, src, stride, 8);
}

static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}

static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}

static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride)
{
    ff_avg_pixels16_mmxext(dst, src, stride, 16);
}

static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
{
    ff_put_pixels16_sse2(dst, src, stride, 16);
}

static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
{
    ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#endif

static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                     AVCodecContext *avctx)
{
#if HAVE_MMX_EXTERNAL
    c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
    c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;

    c->cavs_idct8_add = cavs_idct8_add_mmx;
    c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
#endif /* HAVE_MMX_EXTERNAL */
}

#define DSPFUNC(PFX, IDX, NUM, EXT)                                                       \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \

#if HAVE_MMXEXT_INLINE
QPEL_CAVS(put_,        PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)

CAVS_MC(put_,  8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_,  8, mmxext)
CAVS_MC(avg_, 16, mmxext)
#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_AMD3DNOW_INLINE
QPEL_CAVS(put_,       PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)

CAVS_MC(put_,  8, 3dnow)
CAVS_MC(put_, 16, 3dnow)
CAVS_MC(avg_,  8, 3dnow)
CAVS_MC(avg_, 16, 3dnow)

static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
                                       AVCodecContext *avctx)
{
    DSPFUNC(put, 0, 16, 3dnow);
    DSPFUNC(put, 1,  8, 3dnow);
    DSPFUNC(avg, 0, 16, 3dnow);
    DSPFUNC(avg, 1,  8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */

av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
    av_unused int cpu_flags = av_get_cpu_flags();

    if (X86_MMX(cpu_flags))
        cavsdsp_init_mmx(c, avctx);

#if HAVE_AMD3DNOW_INLINE
    if (INLINE_AMD3DNOW(cpu_flags))
        cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
#if HAVE_MMXEXT_INLINE
    if (INLINE_MMXEXT(cpu_flags)) {
        DSPFUNC(put, 0, 16, mmxext);
        DSPFUNC(put, 1,  8, mmxext);
        DSPFUNC(avg, 0, 16, mmxext);
        DSPFUNC(avg, 1,  8, mmxext);
    }
#endif
#if HAVE_MMX_EXTERNAL
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
    }
#endif
#if HAVE_SSE2_EXTERNAL
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;

        c->cavs_idct8_add = cavs_idct8_add_sse2;
        c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
    }
#endif
}
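The DSPFUNC assignments above follow the usual quarter-pel convention: the table slot for a motion offset (x, y), each component in quarter pels 0..3, is x + 4*y, which is why mc20 lands in slot 2, mc01 in 4, mc02 in 8 and mc03 in 12. A minimal stand-alone sketch of that mapping (the helper name is illustrative, not part of the file above):

#include <stdio.h>

/* Hypothetical helper: map a quarter-pel offset (x, y) to its slot in a
 * 16-entry pixels_tab, matching the DSPFUNC assignments above. */
static int qpel_tab_index(int x, int y)
{
    return x + 4 * y;
}

int main(void)
{
    printf("mc20 -> %d\n", qpel_tab_index(2, 0)); /* 2  */
    printf("mc01 -> %d\n", qpel_tab_index(0, 1)); /* 4  */
    printf("mc02 -> %d\n", qpel_tab_index(0, 2)); /* 8  */
    printf("mc03 -> %d\n", qpel_tab_index(0, 3)); /* 12 */
    return 0;
}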
211
externals/ffmpeg/libavcodec/x86/cavsidct.asm
vendored
Executable file
@@ -0,0 +1,211 @@
; Chinese AVS video (AVS1-P2, JiZhun profile) decoder
; Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
;
; MMX-optimized DSP functions, based on H.264 optimizations by
; Michael Niedermayer and Loren Merritt
; Conversion from gcc syntax to x264asm syntax with modifications
; by Ronald S. Bultje <rsbultje@gmail.com>
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with FFmpeg; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

%include "libavutil/x86/x86util.asm"

cextern pw_4
cextern pw_64

SECTION .text

%macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
%if %3 == 1
    mova        m4, [%1+7*16]       ; m4 = src7
    mova        m5, [%1+1*16]       ; m5 = src1
    mova        m2, [%1+5*16]       ; m2 = src5
    mova        m7, [%1+3*16]       ; m7 = src3
%else
    SWAP         1, 7
    SWAP         4, 6
%endif
    mova        m0, m4
    mova        m3, m5
    mova        m6, m2
    mova        m1, m7

    paddw       m4, m4              ; m4 = 2*src7
    paddw       m3, m3              ; m3 = 2*src1
    paddw       m6, m6              ; m6 = 2*src5
    paddw       m1, m1              ; m1 = 2*src3
    paddw       m0, m4              ; m0 = 3*src7
    paddw       m5, m3              ; m5 = 3*src1
    paddw       m2, m6              ; m2 = 3*src5
    paddw       m7, m1              ; m7 = 3*src3
    psubw       m5, m4              ; m5 = 3*src1 - 2*src7 = a0
    paddw       m7, m6              ; m7 = 3*src3 + 2*src5 = a1
    psubw       m1, m2              ; m1 = 2*src3 - 3*src5 = a2
    paddw       m3, m0              ; m3 = 2*src1 + 3*src7 = a3

    mova        m4, m5
    mova        m6, m7
    mova        m0, m3
    mova        m2, m1
    SUMSUB_BA    w, 7, 5            ; m7 = a0 + a1, m5 = a0 - a1
    paddw       m7, m3              ; m7 = a0 + a1 + a3
    paddw       m5, m1              ; m5 = a0 - a1 + a2
    paddw       m7, m7
    paddw       m5, m5
    paddw       m7, m6              ; m7 = b4
    paddw       m5, m4              ; m5 = b5

    SUMSUB_BA    w, 1, 3            ; m1 = a3 + a2, m3 = a3 - a2
    psubw       m4, m1              ; m4 = a0 - a2 - a3
    mova        m1, m4              ; m1 = a0 - a2 - a3
    psubw       m3, m6              ; m3 = a3 - a2 - a1
    paddw       m1, m1
    paddw       m3, m3
    psubw       m1, m2              ; m1 = b7
    paddw       m3, m0              ; m3 = b6

    mova        m2, [%1+2*16]       ; m2 = src2
    mova        m6, [%1+6*16]       ; m6 = src6
    mova        m4, m2
    mova        m0, m6
    psllw       m4, 2               ; m4 = 4*src2
    psllw       m6, 2               ; m6 = 4*src6
    paddw       m2, m4              ; m2 = 5*src2
    paddw       m0, m6              ; m0 = 5*src6
    paddw       m2, m2
    paddw       m0, m0
    psubw       m4, m0              ; m4 = 4*src2 - 10*src6 = a7
    paddw       m6, m2              ; m6 = 4*src6 + 10*src2 = a6

    mova        m2, [%1+0*16]       ; m2 = src0
    mova        m0, [%1+4*16]       ; m0 = src4
    SUMSUB_BA    w, 0, 2            ; m0 = src0 + src4, m2 = src0 - src4
    psllw       m0, 3
    psllw       m2, 3
    paddw       m0, %2              ; add rounding bias
    paddw       m2, %2              ; add rounding bias

    SUMSUB_BA    w, 6, 0            ; m6 = a4 + a6, m0 = a4 - a6
    SUMSUB_BA    w, 4, 2            ; m4 = a5 + a7, m2 = a5 - a7
    SUMSUB_BA    w, 7, 6            ; m7 = dst0, m6 = dst7
    SUMSUB_BA    w, 5, 4            ; m5 = dst1, m4 = dst6
    SUMSUB_BA    w, 3, 2            ; m3 = dst2, m2 = dst5
    SUMSUB_BA    w, 1, 0            ; m1 = dst3, m0 = dst4
%endmacro

INIT_MMX mmx
cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
    mov       cntd, 2
    mov       tmpq, rsp

.loop_1:
    CAVS_IDCT8_1D inq, [pw_4]
    psraw       m7, 3
    psraw       m6, 3
    psraw       m5, 3
    psraw       m4, 3
    psraw       m3, 3
    psraw       m2, 3
    psraw       m1, 3
    psraw       m0, 3
    mova    [tmpq], m7
    TRANSPOSE4x4W 0, 2, 4, 6, 7
    mova [tmpq+1*8], m0
    mova [tmpq+3*8], m2
    mova [tmpq+5*8], m4
    mova [tmpq+7*8], m6
    mova        m7, [tmpq]
    TRANSPOSE4x4W 7, 5, 3, 1, 0
    mova [tmpq+0*8], m7
    mova [tmpq+2*8], m5
    mova [tmpq+4*8], m3
    mova [tmpq+6*8], m1

    add        inq, mmsize
    add       tmpq, 64
    dec       cntd
    jg .loop_1

    mov       cntd, 2
    mov       tmpq, rsp
.loop_2:
    CAVS_IDCT8_1D tmpq, [pw_64]
    psraw       m7, 7
    psraw       m6, 7
    psraw       m5, 7
    psraw       m4, 7
    psraw       m3, 7
    psraw       m2, 7
    psraw       m1, 7
    psraw       m0, 7

    mova [outq+0*16], m7
    mova [outq+1*16], m5
    mova [outq+2*16], m3
    mova [outq+3*16], m1
    mova [outq+4*16], m0
    mova [outq+5*16], m2
    mova [outq+6*16], m4
    mova [outq+7*16], m6

    add       outq, mmsize
    add       tmpq, mmsize
    dec       cntd
    jg .loop_2

    RET

INIT_XMM sse2
cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in
    CAVS_IDCT8_1D inq, [pw_4]
    psraw       m7, 3
    psraw       m6, 3
    psraw       m5, 3
    psraw       m4, 3
    psraw       m3, 3
    psraw       m2, 3
    psraw       m1, 3
    psraw       m0, 3
%if ARCH_X86_64
    TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, 8
    mova [rsp+4*16], m0
%else
    mova [rsp+0*16], m4
    TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1
%endif
    mova [rsp+0*16], m7
    mova [rsp+2*16], m3
    mova [rsp+6*16], m4
    CAVS_IDCT8_1D rsp, [pw_64], 0
    psraw       m7, 7
    psraw       m6, 7
    psraw       m5, 7
    psraw       m4, 7
    psraw       m3, 7
    psraw       m2, 7
    psraw       m1, 7
    psraw       m0, 7

    mova [outq+0*16], m7
    mova [outq+1*16], m5
    mova [outq+2*16], m3
    mova [outq+3*16], m1
    mova [outq+4*16], m0
    mova [outq+5*16], m2
    mova [outq+6*16], m4
    mova [outq+7*16], m6
    RET
43
externals/ffmpeg/libavcodec/x86/celt_pvq_init.c
vendored
Executable file
@@ -0,0 +1,43 @@
/*
 * Opus encoder assembly optimizations
 * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/x86/cpu.h"
#include "libavcodec/opus_pvq.h"

extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
extern float ff_pvq_search_exact_avx  (float *X, int *y, int K, int N);

av_cold void ff_celt_pvq_init_x86(CeltPVQ *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags))
        s->pvq_search = ff_pvq_search_approx_sse2;

    if (EXTERNAL_SSE4(cpu_flags))
        s->pvq_search = ff_pvq_search_approx_sse4;

    if (EXTERNAL_AVX_FAST(cpu_flags))
        s->pvq_search = ff_pvq_search_exact_avx;
}
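Note how ff_celt_pvq_init_x86 orders its checks from weakest to strongest ISA, so the last passing test wins. A stand-alone sketch of the same function-pointer dispatch pattern, with hypothetical kernels and CPU checks (not FFmpeg API):

#include <stdio.h>

typedef int (*dot_fn)(const int *a, const int *b, int n);

/* Baseline kernel: plain scalar dot product. */
static int dot_scalar(const int *a, const int *b, int n)
{
    int s = 0;
    for (int i = 0; i < n; i++)
        s += a[i] * b[i];
    return s;
}

/* Stand-in for an ISA-specific kernel; here just a 2x unrolled loop. */
static int dot_unrolled(const int *a, const int *b, int n)
{
    int s = 0, i;
    for (i = 0; i + 1 < n; i += 2)
        s += a[i] * b[i] + a[i + 1] * b[i + 1];
    if (i < n)
        s += a[i] * b[i];
    return s;
}

/* Hypothetical stand-ins for av_get_cpu_flags()/EXTERNAL_*() checks. */
static int have_baseline(void) { return 1; }
static int have_fast_isa(void) { return 1; }

int main(void)
{
    dot_fn dot = dot_scalar;   /* safe default, weakest first... */
    if (have_baseline())
        dot = dot_scalar;
    if (have_fast_isa())
        dot = dot_unrolled;    /* ...strongest passing check wins */

    int a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
    printf("%d\n", dot(a, b, 4)); /* 70 */
    return 0;
}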
385
externals/ffmpeg/libavcodec/x86/celt_pvq_search.asm
vendored
Executable file
@@ -0,0 +1,385 @@
;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "config.asm"
%include "libavutil/x86/x86util.asm"

%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

const_float_abs_mask:   times 8 dd 0x7fffffff
const_align_abs_edge:   times 8 dd 0

const_float_0_5:        times 8 dd 0.5
const_float_1:          times 8 dd 1.0
const_float_sign_mask:  times 8 dd 0x80000000

const_int32_offsets:
%rep 8
        dd $-const_int32_offsets
%endrep
SECTION .text

;
; Setup High Register to be used
; for holding memory constants
;
; %1 - the register to be used, assumes it is >= mm8
; %2 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define  mm_%3  %2
    %{1}     %2, [%3]       ; movaps m8, [const_name]
%else
    %define  mm_%3  [%3]
%endif
%endmacro

;
; Set Position Independent Code
;     Base address of a constant
; %1 - the register to be used, if PIC is set
; %2 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if text relocations are used
%macro SET_PIC_BASE 3 ; movop, reg, const_label
%ifdef PIC
    %{1}     %2, [%3]       ; lea r5, [rip+const]
    %define  pic_base_%3 %2
%else
    %define  pic_base_%3 %3
%endif
%endmacro

%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps       m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
    pxor        m1, m1                 ; max_idx
    xorps       m3, m3                 ; p_max
    xor        r4d, r4d
align 16
%%distortion_search:
    movd       xm2, dword r4d       ; movd zero extends
%ifidn %1,add
    movaps      m4, [tmpY + r4]     ; y[i]
    movaps      m5, [tmpX + r4]     ; X[i]

  %if USE_APPROXIMATION == 1
    xorps       m0, m0
    cmpps       m0, m0, m5, 4       ; m0 = (X[i] != 0.0)
  %endif

    addps       m4, m6              ; m4 = Syy_new = y[i] + Syy_norm
    addps       m5, m7              ; m5 = Sxy_new = X[i] + Sxy_norm

  %if USE_APPROXIMATION == 1
    andps       m5, m0              ; if (X[i] == 0) Sxy_new = 0; prevent approximation error from setting pulses in array padding.
  %endif

%else
    movaps      m5, [tmpY + r4]     ; m5 = y[i]

    xorps       m0, m0              ; m0 = 0;
    cmpps       m0, m0, m5, 1       ; m0 = (0<y)

    subps       m4, m6, m5          ; m4 = Syy_new = Syy_norm - y[i]
    subps       m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
    andps       m5, m0              ; (0<y)?m5:0
%endif

%if USE_APPROXIMATION == 1
    rsqrtps     m4, m4
    mulps       m5, m4              ; m5 = p = Sxy_new*approx(1/sqrt(Syy))
%else
    mulps       m5, m5
    divps       m5, m4              ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
    VPBROADCASTD m2, xm2            ; m2=i (all lanes get same values, we add the offset-per-lane, later)

    cmpps       m0, m3, m5, 1       ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
    maxps       m3, m5              ; m3 = max(p_max, p)
                                    ; maxps here is faster than blendvps, despite blend having lower latency.

    pand        m2, m0              ; This version seems faster than sse41 pblendvb
    pmaxsw      m1, m2              ; SSE2 signed word, so it would work for N < 32768/4

    add        r4d, mmsize
    cmp        r4d, Nd
    jb %%distortion_search

    por         m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
    movdqa      m4, m1              ; needed for the aligned y[max_idx]+=1; processing

%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)

    vextractf128 xm5, ym3, 1        ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps       xm0, xm3, xm5, 1    ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128 xm2, ym1, 1        ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS    xm3, xm5, xm0       ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
    PBLENDVB    xm1, xm2, xm0       ; p       = m0 ? p[1x128]       : p[0x128]
%endif

; Merge parallel maximums round 4 (2 vs 2)
                                    ; m3=p[3210]
    movhlps     xm5, xm3            ; m5=p[xx32]
    cmpps       xm0, xm3, xm5, 1    ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd      xm2, xm1, q3232
    BLENDVPS    xm3, xm5, xm0       ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
    PBLENDVB    xm1, xm2, xm0       ; p       = m0 ? p[3,2]       : p[1,0]

; Merge parallel maximums final round (1 vs 1)
    shufps      xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
    cmpss       xm0, xm3, 5         ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )

    pshufd      xm2, xm1, q1111
    PBLENDVB    xm1, xm2, xm0

    movd  dword r4d, xm1            ; zero extends to the rest of r4q

    VBROADCASTSS m3, [tmpX + r4]
    %{1}ps       m7, m3             ; Sxy += X[max_idx]

    VBROADCASTSS m5, [tmpY + r4]
    %{1}ps       m6, m5             ; Syy += Y[max_idx]

; We have to update a single element in Y[i]
; However writing 4 bytes and then doing 16 byte load in the inner loop
; could cause a stall due to breaking write forwarding.
    VPBROADCASTD m1, xm1
    pcmpeqd      m1, m1, m4         ; exactly 1 element matches max_idx and this finds it

    and         r4d, ~(mmsize-1)    ; align address down, so the value pointed by max_idx is inside a mmsize load
    movaps       m5, [tmpY + r4]    ; m5 = Y[y3...ym...y0]
    andps        m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
    %{1}ps       m5, m1             ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps [tmpY + r4], m5          ; Y[max_idx] +-= 1.0;
%endmacro

;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
;
%ifdef PIC
  %define num_pic_regs 1
%else
  %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
; float * inX   - Unaligned (SIMD) access, it will be overread,
;                 but extra data is masked away.
; int32 * outY  - Should be aligned and padded buffer.
;                 It is used as temp buffer.
; uint32 K      - Number of pulses to have after quantizations.
; uint32 N      - Number of vector elements. Must be 0 < N < 256
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

    movaps      m0, [const_float_abs_mask]
    shl         Nd, 2               ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
    mov        r4d, Nd

    neg        r4d
    and        r4d, mmsize-1

    SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
    movups      m2, [pic_base_const_align_abs_edge + r4 - mmsize]

    add         Nd, r4d             ; N = align(N, mmsize)

    lea        r4d, [Nd - mmsize]   ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups      m1, [inXq + r4]
    andps       m1, m2
    movaps  [tmpX + r4], m1         ; Sx = abs( X[N-1] )

align 16
%%loop_abs_sum:
    sub        r4d, mmsize
    jc %%end_loop_abs_sum

    movups      m2, [inXq + r4]
    andps       m2, m0

    movaps  [tmpX + r4], m2         ; tmpX[i] = abs(X[i])
    addps       m1, m2              ; Sx += abs(X[i])
    jmp %%loop_abs_sum

align 16
%%end_loop_abs_sum:

    HSUMPS      m1, m2              ; m1 = Sx

    xorps       m0, m0
    comiss     xm0, xm1             ;
    jz %%zero_input                 ; if (Sx==0) goto zero_input

    cvtsi2ss   xm0, dword Kd        ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss      xm1, xm1             ; m1 = approx(1/Sx)
    mulss      xm0, xm1             ; m0 = K*(1/Sx)
%else
    divss      xm0, xm1             ; b = K/Sx
                                    ; b = K/max_x
%endif

    VBROADCASTSS m0, xm0

    lea        r4d, [Nd - mmsize]
    pxor        m5, m5              ; Sy    ( Sum of abs( y[i] ) )
    xorps       m6, m6              ; Syy   ( Sum of y[i]*y[i] )
    xorps       m7, m7              ; Sxy   ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
    movaps      m1, [tmpX + r4]     ; m1 = X[i]
    mulps       m2, m0, m1          ; m2 = res*X[i]
    cvtps2dq    m2, m2              ; yt = (int)lrintf( res*X[i] )
    paddd       m5, m2              ; Sy += yt
    cvtdq2ps    m2, m2              ; yt = (float)yt
    mulps       m1, m2              ; m1 = X[i]*yt
    movaps  [tmpY + r4], m2         ; y[i] = m2
    addps       m7, m1              ; Sxy += m1;
    mulps       m2, m2              ; m2 = yt*yt
    addps       m6, m2              ; Syy += m2

    sub        r4d, mmsize
    jnc %%loop_guess

    HSUMPS      m6, m1              ; Syy_norm
    HADDD       m5, m4              ; pulses

    movd  dword r4d, xm5            ; zero extends to the rest of r4q

    sub         Kd, r4d             ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
    jz %%finish                     ; K - pulses == 0

    SET_HI_REG_MM_CONSTANT movaps, m8,  const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps, m9,  const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in distortion parameter calculations.
; Saves pre and post-calculation to correct Y[] values.
; Same precision, since float mantissa is normalized.
; The SQRT approximation does differ.
    HSUMPS      m7, m0              ; Sxy_norm
    mulps       m6, mm_const_float_0_5

    jc %%remove_pulses_loop         ; K - pulses < 0

align 16                            ; K - pulses > 0
%%add_pulses_loop:

    PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm

    sub         Kd, 1
    jnz %%add_pulses_loop

    addps       m6, m6              ; Syy *= 2

    jmp %%finish

align 16
%%remove_pulses_loop:

    PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm

    add         Kd, 1
    jnz %%remove_pulses_loop

    addps       m6, m6              ; Syy *= 2

align 16
%%finish:
    lea        r4d, [Nd - mmsize]
    movaps      m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps      m0, [tmpY + r4]     ; m0 = Y[i]
    movups      m1, [inXq + r4]     ; m1 = X[i]
    andps       m1, m2              ; m1 = sign(X[i])
    orps        m0, m1              ; m0 = Y[i]*sign
    cvtps2dq    m3, m0              ; m3 = (int)m0
    movaps  [outYq + r4], m3

    sub        r4d, mmsize
    jnc %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0    ; sbrdsp
    movss      r0m, xm6             ; return (float)Syy_norm
    fld  dword r0m
%else
    movaps      m0, m6              ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    lea        r4d, [Nd - mmsize]
    xorps       m0, m0
%%zero_loop:
    movaps  [outYq + r4], m0
    sub        r4d, mmsize
    jnc %%zero_loop

    movaps      m6, [const_float_1]
    jmp %%return
%endmacro

; if 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
%define USE_APPROXIMATION 1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact
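For readers untangling the pulse loops above, a scalar reference of the underlying idea may help: greedily place K unit pulses in y so as to maximize Sxy^2/Syy, i.e. the correlation of y with X. This is a conceptual sketch only; it skips the asm's initial K/Sx projection guess, its zero-input special case, and its SIMD-friendly bookkeeping:

#include <math.h>
#include <stdio.h>

/* Conceptual scalar sketch of a PVQ search.  Adding one pulse at index i
 * changes the running sums as Sxy += |X[i]| and Syy += 2*y[i] + 1, so each
 * pulse goes wherever Sxy^2/Syy improves the most.  O(K*N). */
static float pvq_search_ref(const float *X, int *y, int K, int N)
{
    float ax[256]; /* |X[i]|; the asm above likewise requires N < 256 */
    float Sxy = 0.0f, Syy = 0.0f;

    for (int i = 0; i < N; i++) {
        ax[i] = fabsf(X[i]);
        y[i]  = 0;
    }
    for (int k = 0; k < K; k++) {
        int   best   = 0;
        float best_p = -1.0f;
        for (int i = 0; i < N; i++) {
            float sxy = Sxy + ax[i];
            float syy = Syy + 2.0f * y[i] + 1.0f;
            float p   = sxy * sxy / syy;
            if (p > best_p) {
                best_p = p;
                best   = i;
            }
        }
        Sxy += ax[best];
        Syy += 2.0f * y[best] + 1.0f;
        y[best]++;
    }
    for (int i = 0; i < N; i++)   /* restore the sign of X[i], as the asm does */
        if (X[i] < 0.0f)
            y[i] = -y[i];
    return Syy;                   /* the asm similarly hands back its final Syy sum */
}

int main(void)
{
    float X[4] = { 0.9f, -0.1f, 0.4f, 0.1f };
    int   y[4];
    pvq_search_ref(X, y, 4, 4);
    printf("%d %d %d %d\n", y[0], y[1], y[2], y[3]);
    return 0;
}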
94
externals/ffmpeg/libavcodec/x86/constants.c
vendored
Executable file
@@ -0,0 +1,94 @@
/*
 * MMX/SSE/AVX constants used across x86 dsp optimizations.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"

DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL,
                                                        0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL,
                                                        0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ASM_ALIGNED(32, const ymm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL,
                                                        0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,      const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_20)   = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_255)  = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
                                                        0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
                                                        0x0100010001000100ULL, 0x0100010001000100ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL,
                                                        0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
                                                        0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
                                                        0x0400040004000400ULL, 0x0400040004000400ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
                                                        0x0800080008000800ULL, 0x0800080008000800ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
                                                        0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
                                                        0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
                                                        0x2000200020002000ULL, 0x2000200020002000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
                                                        0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };

DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL,
                                                        0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL,
                                                        0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_2)    = { 0x0202020202020202ULL, 0x0202020202020202ULL,
                                                        0x0202020202020202ULL, 0x0202020202020202ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL,
                                                        0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(32,     const xmm_reg,  ff_pb_15)   = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL,
                                                        0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
                                                        0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8,      const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED(16,     const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
                                                        0x0000000100000001ULL, 0x0000000100000001ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
                                                        0x0000001000000010ULL, 0x0000001000000010ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
                                                        0x0000002000000020ULL, 0x0000002000000020ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
                                                        0x0000200000002000ULL, 0x0000200000002000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
                                                        0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
72
externals/ffmpeg/libavcodec/x86/constants.h
vendored
Executable file
@@ -0,0 +1,72 @@
/*
 * MMX/SSE constants used across x86 dsp optimizations.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H

#include <stdint.h>

#include "libavutil/x86/asm.h"

extern const ymm_reg  ff_pw_1;
extern const ymm_reg  ff_pw_2;
extern const xmm_reg  ff_pw_3;
extern const ymm_reg  ff_pw_4;
extern const xmm_reg  ff_pw_5;
extern const xmm_reg  ff_pw_8;
extern const xmm_reg  ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg  ff_pw_16;
extern const xmm_reg  ff_pw_18;
extern const xmm_reg  ff_pw_20;
extern const xmm_reg  ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg  ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const ymm_reg  ff_pw_255;
extern const ymm_reg  ff_pw_256;
extern const ymm_reg  ff_pw_512;
extern const ymm_reg  ff_pw_1023;
extern const ymm_reg  ff_pw_1024;
extern const ymm_reg  ff_pw_2048;
extern const ymm_reg  ff_pw_4095;
extern const ymm_reg  ff_pw_4096;
extern const ymm_reg  ff_pw_8192;
extern const ymm_reg  ff_pw_m1;

extern const ymm_reg  ff_pb_0;
extern const ymm_reg  ff_pb_1;
extern const ymm_reg  ff_pb_2;
extern const ymm_reg  ff_pb_3;
extern const ymm_reg  ff_pb_80;
extern const ymm_reg  ff_pb_FE;
extern const uint64_t ff_pb_FC;

extern const xmm_reg  ff_ps_neg;

extern const ymm_reg  ff_pd_1;
extern const ymm_reg  ff_pd_16;
extern const ymm_reg  ff_pd_32;
extern const ymm_reg  ff_pd_8192;
extern const ymm_reg  ff_pd_65535;

#endif /* AVCODEC_X86_CONSTANTS_H */
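Each ff_pw_*/ff_pb_*/ff_pd_* constant above is a single scalar splatted across 16-bit words, bytes or dwords so that one mova fills every SIMD lane with the same value. A quick self-contained check of the encoding (plain C, no FFmpeg headers needed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* ff_pw_4 packs the word 4 into each 16-bit lane of a 64-bit chunk. */
    uint64_t pw_4 = 0x0004000400040004ULL;
    uint16_t w[4];

    for (int i = 0; i < 4; i++)
        w[i] = (uint16_t)(pw_4 >> (16 * i));
    printf("%u %u %u %u\n", w[0], w[1], w[2], w[3]); /* 4 4 4 4 */

    /* ff_pb_80 does the same with the byte 0x80, the sign-flip constant. */
    uint64_t pb_80 = 0x8080808080808080ULL;
    printf("0x%02x\n", (unsigned)(pb_80 & 0xff));    /* 0x80 */
    return 0;
}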
301
externals/ffmpeg/libavcodec/x86/dcadsp.asm
vendored
Executable file
@@ -0,0 +1,301 @@
;******************************************************************************
;* SIMD-optimized functions for the DCA decoder
;* Copyright (C) 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%define sizeof_float 4
%define FMA3_OFFSET (8 * cpuflag(fma3))

%macro LFE_FIR0_FLOAT 0
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr  nblocksd, 1
    sub      lfeq, 7*sizeof_float
    mov     cnt1d, 32*sizeof_float
    mov     cnt2d, 32*sizeof_float-8-FMA3_OFFSET
    lea    coeffq, [coeffq+cnt1q*8]
    add  samplesq, cnt1q
    neg     cnt1q

.loop:
%if cpuflag(avx)
    cvtdq2ps   m4, [lfeq+16]
    cvtdq2ps   m5, [lfeq   ]
    shufps     m7, m4, m4, q0123
    shufps     m6, m5, m5, q0123
%elif cpuflag(sse2)
    movu       m4, [lfeq+16]
    movu       m5, [lfeq   ]
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    pshufd     m7, m4, q0123
    pshufd     m6, m5, q0123
%else
    cvtpi2ps   m4, [lfeq+16]
    cvtpi2ps   m0, [lfeq+24]
    cvtpi2ps   m5, [lfeq   ]
    cvtpi2ps   m1, [lfeq+8 ]
    shufps     m4, m0, q1010
    shufps     m5, m1, q1010
    shufps     m7, m4, m4, q0123
    shufps     m6, m5, m5, q0123
%endif

.inner_loop:
%if ARCH_X86_64
    movaps     m8, [coeffq+cnt1q*8   ]
    movaps     m9, [coeffq+cnt1q*8+16]
    movaps    m10, [coeffq+cnt1q*8+32]
    movaps    m11, [coeffq+cnt1q*8+48]
%if cpuflag(fma3)
    movaps    m12, [coeffq+cnt1q*8+64]
    movaps    m13, [coeffq+cnt1q*8+80]
    movaps    m14, [coeffq+cnt1q*8+96]
    movaps    m15, [coeffq+cnt1q*8+112]
    mulps      m0, m7, m8
    mulps      m1, m7, m10
    mulps      m2, m7, m12
    mulps      m3, m7, m14
    fmaddps    m0, m6, m9, m0
    fmaddps    m1, m6, m11, m1
    fmaddps    m2, m6, m13, m2
    fmaddps    m3, m6, m15, m3

    haddps     m0, m1
    haddps     m2, m3
    haddps     m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps      m0, m7, m8
    mulps      m1, m6, m9
    mulps      m2, m7, m10
    mulps      m3, m6, m11
    addps      m0, m1
    addps      m2, m3

    unpckhps   m3, m0, m2
    unpcklps   m0, m2
    addps      m3, m0
    movhlps    m2, m3
    addps      m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps      m0, m7, [coeffq+cnt1q*8    ]
    mulps      m1, m7, [coeffq+cnt1q*8+32 ]
    mulps      m2, m7, [coeffq+cnt1q*8+64 ]
    mulps      m3, m7, [coeffq+cnt1q*8+96 ]
    fmaddps    m0, m6, [coeffq+cnt1q*8+16 ], m0
    fmaddps    m1, m6, [coeffq+cnt1q*8+48 ], m1
    fmaddps    m2, m6, [coeffq+cnt1q*8+80 ], m2
    fmaddps    m3, m6, [coeffq+cnt1q*8+112], m3

    haddps     m0, m1
    haddps     m2, m3
    haddps     m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps      m0, m7, [coeffq+cnt1q*8   ]
    mulps      m1, m6, [coeffq+cnt1q*8+16]
    mulps      m2, m7, [coeffq+cnt1q*8+32]
    mulps      m3, m6, [coeffq+cnt1q*8+48]
    addps      m0, m1
    addps      m2, m3

    unpckhps   m3, m0, m2
    unpcklps   m0, m2
    addps      m3, m0
    movhlps    m2, m3
    addps      m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%endif ; ARCH

%if ARCH_X86_64
%if cpuflag(fma3)
    mulps      m8, m5
    mulps     m10, m5
    mulps     m12, m5
    mulps     m14, m5
    fmaddps    m8, m4, m9, m8
    fmaddps   m10, m4, m11, m10
    fmaddps   m12, m4, m13, m12
    fmaddps   m14, m4, m15, m14

    haddps    m10, m8
    haddps    m14, m12
    haddps    m14, m10
    movaps [samplesq+cnt2q], m14
%else
    mulps      m8, m5
    mulps      m9, m4
    mulps     m10, m5
    mulps     m11, m4
    addps      m8, m9
    addps     m10, m11

    unpckhps  m11, m10, m8
    unpcklps  m10, m8
    addps     m11, m10
    movhlps    m8, m11
    addps      m8, m11
    movlps [samplesq+cnt2q], m8
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps      m0, m5, [coeffq+cnt1q*8    ]
    mulps      m1, m5, [coeffq+cnt1q*8+32 ]
    mulps      m2, m5, [coeffq+cnt1q*8+64 ]
    mulps      m3, m5, [coeffq+cnt1q*8+96 ]
    fmaddps    m0, m4, [coeffq+cnt1q*8+16 ], m0
    fmaddps    m1, m4, [coeffq+cnt1q*8+48 ], m1
    fmaddps    m2, m4, [coeffq+cnt1q*8+80 ], m2
    fmaddps    m3, m4, [coeffq+cnt1q*8+112], m3

    haddps     m1, m0
    haddps     m3, m2
    haddps     m3, m1
    movaps [samplesq+cnt2q], m3
%else
    mulps      m0, m5, [coeffq+cnt1q*8   ]
    mulps      m1, m4, [coeffq+cnt1q*8+16]
    mulps      m2, m5, [coeffq+cnt1q*8+32]
    mulps      m3, m4, [coeffq+cnt1q*8+48]
    addps      m0, m1
    addps      m2, m3

    unpckhps   m3, m2, m0
    unpcklps   m2, m0
    addps      m3, m2
    movhlps    m0, m3
    addps      m0, m3
    movlps [samplesq+cnt2q], m0
%endif
%endif ; ARCH

    sub     cnt2d, 8 + FMA3_OFFSET
    add     cnt1q, 8 + FMA3_OFFSET
    jl .inner_loop

    add      lfeq, 4
    add  samplesq, 64*sizeof_float
    mov     cnt1q, -32*sizeof_float
    mov     cnt2d, 32*sizeof_float-8-FMA3_OFFSET
    sub  nblocksd, 1
    jg .loop
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
LFE_FIR0_FLOAT
%endif
INIT_XMM sse2
LFE_FIR0_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR0_FLOAT
%endif
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif

%macro LFE_FIR1_FLOAT 0
cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr  nblocksd, 2
    sub      lfeq, 3*sizeof_float
    mov     cnt1d, 64*sizeof_float
    mov     cnt2d, 64*sizeof_float-16
    lea    coeffq, [coeffq+cnt1q*4]
    add  samplesq, cnt1q
    neg     cnt1q

.loop:
%if cpuflag(avx)
    cvtdq2ps   m4, [lfeq]
    shufps     m5, m4, m4, q0123
%elif cpuflag(sse2)
    movu       m4, [lfeq]
    cvtdq2ps   m4, m4
    pshufd     m5, m4, q0123
%endif

.inner_loop:
    movaps     m6, [coeffq+cnt1q*4   ]
    movaps     m7, [coeffq+cnt1q*4+16]
    mulps      m0, m5, m6
    mulps      m1, m5, m7
%if ARCH_X86_64
    movaps     m8, [coeffq+cnt1q*4+32]
    movaps     m9, [coeffq+cnt1q*4+48]
    mulps      m2, m5, m8
    mulps      m3, m5, m9
%else
    mulps      m2, m5, [coeffq+cnt1q*4+32]
    mulps      m3, m5, [coeffq+cnt1q*4+48]
%endif

    haddps     m0, m1
    haddps     m2, m3
    haddps     m0, m2
    movaps [samplesq+cnt1q], m0

    mulps      m6, m4
    mulps      m7, m4
%if ARCH_X86_64
    mulps      m8, m4
    mulps      m9, m4

    haddps     m6, m7
    haddps     m8, m9
    haddps     m6, m8
%else
    mulps      m2, m4, [coeffq+cnt1q*4+32]
    mulps      m3, m4, [coeffq+cnt1q*4+48]

    haddps     m6, m7
    haddps     m2, m3
    haddps     m6, m2
%endif
    movaps [samplesq+cnt2q], m6

    sub     cnt2d, 16
    add     cnt1q, 16
    jl .inner_loop

    add      lfeq, sizeof_float
    add  samplesq, 128*sizeof_float
    mov     cnt1q, -64*sizeof_float
    mov     cnt2d, 64*sizeof_float-16
    sub  nblocksd, 1
    jg .loop
    RET
%endmacro

INIT_XMM sse3
LFE_FIR1_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR1_FLOAT
%endif
52
externals/ffmpeg/libavcodec/x86/dcadsp_init.c
vendored
Executable file
@@ -0,0 +1,52 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"

#define LFE_FIR_FLOAT_FUNC(opt) \
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
                             const float *filter_coeff, ptrdiff_t npcmblocks); \
void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
                             const float *filter_coeff, ptrdiff_t npcmblocks);

LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)

av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
    if (EXTERNAL_SSE2(cpu_flags))
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
    if (EXTERNAL_SSE3(cpu_flags))
        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
    if (EXTERNAL_AVX(cpu_flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
    }
    if (EXTERNAL_FMA3(cpu_flags))
        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
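For reference, LFE_FIR_FLOAT_FUNC(sse) above expands to nothing more than the pair of prototypes below; the macro only avoids repeating the signature once per ISA suffix:

#include <stddef.h>   /* ptrdiff_t */
#include <stdint.h>   /* int32_t   */

/* Expansion of LFE_FIR_FLOAT_FUNC(sse) from the file above. */
void ff_lfe_fir0_float_sse(float *pcm_samples, int32_t *lfe_samples,
                           const float *filter_coeff, ptrdiff_t npcmblocks);
void ff_lfe_fir1_float_sse(float *pcm_samples, int32_t *lfe_samples,
                           const float *filter_coeff, ptrdiff_t npcmblocks);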
491
externals/ffmpeg/libavcodec/x86/dct32.asm
vendored
Executable file
@@ -0,0 +1,491 @@
;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000

ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
            dd   0.553104,  0.582935,  0.622504,  0.674808
            dd -10.190008, -3.407609, -2.057781, -1.484165
            dd  -1.169440, -0.972568, -0.839350, -0.744536
            dd   0.502419,  0.522499,  0.566944,  0.646822
            dd   0.788155,  1.060678,  1.722447,  5.101149
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   0.707107,  0.707107,  0.707107,  0.707107

%macro BUTTERFLY 4
    subps  %4, %1, %2
    addps  %2, %2, %1
    mulps  %1, %4, %3
%endmacro

%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %4, %1, %5
    xorps  %1, %2
    addps  %1, %4
    mulps  %1, %3
%else
    shufps %4, %1, %1, %5
    xorps  %1, %1, %2
    addps  %4, %4, %1
    mulps  %1, %4, %3
%endif
%endmacro

%macro BUTTERFLY2 4
    BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro

%macro BUTTERFLY3 4
    BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro

%macro BUTTERFLY3V 5
    movaps m%5, m%1
    addps  m%1, m%2
    subps  m%5, m%2
    SWAP %2, %5
    mulps  m%2, [ps_cos_vec+192]
    movaps m%5, m%3
    addps  m%3, m%4
    subps  m%4, m%5
    mulps  m%4, [ps_cos_vec+192]
%endmacro

%macro PASS6_AND_PERMUTE 0
    mov    tmpd, [outq+4]
    movss    m7, [outq+72]
    addss    m7, [outq+76]
    movss    m3, [outq+56]
    addss    m3, [outq+60]
    addss    m4, m3
    movss    m2, [outq+52]
    addss    m2, m3
    movss    m3, [outq+104]
    addss    m3, [outq+108]
    addss    m1, m3
    addss    m5, m4
    movss  [outq+ 16], m1
    movss    m1, [outq+100]
    addss    m1, m3
    movss    m3, [outq+40]
    movss  [outq+ 48], m1
    addss    m3, [outq+44]
    movss    m1, [outq+100]
    addss    m4, m3
    addss    m3, m2
    addss    m1, [outq+108]
    movss  [outq+ 40], m3
    addss    m2, [outq+36]
    movss    m3, [outq+8]
    movss  [outq+ 56], m2
    addss    m3, [outq+12]
    movss  [outq+ 32], m3
    movss    m3, [outq+80]
    movss  [outq+  8], m5
    movss  [outq+ 80], m1
    movss    m2, [outq+52]
    movss    m5, [outq+120]
    addss    m5, [outq+124]
    movss    m1, [outq+64]
    addss    m2, [outq+60]
    addss    m0, m5
    addss    m5, [outq+116]
    mov    [outq+64], tmpd
    addss    m6, m0
    addss    m1, m6
    mov    tmpd, [outq+12]
    mov    [outq+ 96], tmpd
    movss  [outq+  4], m1
    movss    m1, [outq+24]
    movss  [outq+ 24], m4
    movss    m4, [outq+88]
    addss    m4, [outq+92]
    addss    m3, m4
    addss    m4, [outq+84]
    mov    tmpd, [outq+108]
    addss    m1, [outq+28]
    addss    m0, m1
    addss    m1, m5
    addss    m6, m3
    addss    m3, m0
    addss    m0, m7
    addss    m5, [outq+20]
    addss    m7, m1
    movss  [outq+ 12], m6
    mov    [outq+112], tmpd
    movss    m6, [outq+28]
    movss  [outq+ 28], m0
    movss    m0, [outq+36]
    movss  [outq+ 36], m7
    addss    m1, m4
    movss    m7, [outq+116]
    addss    m0, m2
    addss    m7, [outq+124]
    movss  [outq+ 72], m0
    movss    m0, [outq+44]
    addss    m2, m0
    movss  [outq+ 44], m1
    movss  [outq+ 88], m2
    addss    m0, [outq+60]
    mov    tmpd, [outq+60]
    mov    [outq+120], tmpd
    movss  [outq+104], m0
    addss    m4, m5
    addss    m5, [outq+68]
    movss  [outq+52], m4
    movss  [outq+60], m5
    movss    m4, [outq+68]
    movss    m5, [outq+20]
    movss  [outq+ 20], m3
    addss    m5, m7
    addss    m7, m6
    addss    m4, m5
    movss    m2, [outq+84]
    addss    m2, [outq+92]
    addss    m5, m2
    movss  [outq+ 68], m4
    addss    m2, m7
    movss    m4, [outq+76]
    movss  [outq+ 84], m2
    movss  [outq+ 76], m5
    addss    m7, m4
    addss    m6, [outq+124]
    addss    m4, m6
    addss    m6, [outq+92]
    movss  [outq+100], m4
    movss  [outq+108], m6
    movss    m6, [outq+92]
    movss  [outq+92], m7
    addss    m6, [outq+124]
    movss  [outq+116], m6
%endmacro

INIT_YMM avx
SECTION .text
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
    ; pass 1
    vmovaps     m4, [inq+0]
    vinsertf128 m5, m5, [inq+96], 1
    vinsertf128 m5, m5, [inq+112], 0
    vshufps     m5, m5, m5, 0x1b
    BUTTERFLY   m4, m5, [ps_cos_vec], m6

    vmovaps     m2, [inq+64]
    vinsertf128 m6, m6, [inq+32], 1
    vinsertf128 m6, m6, [inq+48], 0
    vshufps     m6, m6, m6, 0x1b
    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0

    ; pass 2

    BUTTERFLY   m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY   m4, m2, [ps_cos_vec+64], m7


    ; pass 3
    vperm2f128  m3, m6, m4, 0x31
    vperm2f128  m1, m6, m4, 0x20
    vshufps     m3, m3, m3, 0x1b

    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6


    vperm2f128  m4, m5, m2, 0x20
    vperm2f128  m5, m5, m2, 0x31
    vshufps     m5, m5, m5, 0x1b

    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps     m6, [ps_p1p1m1m1+0]
    vmovaps     m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m6, m2, m7
    BUTTERFLY2  m4, m6, m2, m7
    BUTTERFLY2  m1, m6, m2, m7
    BUTTERFLY2  m3, m6, m2, m7


    ; pass 5
    vshufps     m6, m6, m6, 0xcc
    vmovaps     m2, [ps_cos_vec+160]

    BUTTERFLY3  m5, m6, m2, m7
    BUTTERFLY3  m4, m6, m2, m7
    BUTTERFLY3  m1, m6, m2, m7
    BUTTERFLY3  m3, m6, m2, m7

    vperm2f128  m6, m3, m3, 0x31
    vmovaps [outq], m3

    vextractf128 [outq+64], m5, 1
    vextractf128 [outq+32], m5, 0

    vextractf128 [outq+80], m4, 1
    vextractf128 [outq+48], m4, 0

    vperm2f128  m0, m1, m1, 0x31
    vmovaps [outq+96], m1

    vzeroupper

    ; pass 6, no SIMD...
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif

%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP

%macro PASS5 0
    nop ; FIXME code alignment
    SWAP 5, 8
    SWAP 4, 12
    SWAP 6, 14
    SWAP 7, 13
    SWAP 0, 15
    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    TRANSPOSE4x4PS 8, 9, 10, 11, 0
    BUTTERFLY3V    8, 9, 10, 11, 0
    addps   m10, m11
    TRANSPOSE4x4PS 12, 13, 14, 15, 0
    BUTTERFLY3V    12, 13, 14, 15, 0
    addps   m14, m15
    addps   m12, m14
    addps   m14, m13
    addps   m13, m15
%endmacro

%macro PASS6 0
    SWAP 9, 12
    SWAP 11, 14
    movss [outq+0x00], m8
    pshuflw m0, m8, 0xe
    movss [outq+0x10], m9
    pshuflw m1, m9, 0xe
    movss [outq+0x20], m10
    pshuflw m2, m10, 0xe
    movss [outq+0x30], m11
    pshuflw m3, m11, 0xe
    movss [outq+0x40], m12
    pshuflw m4, m12, 0xe
    movss [outq+0x50], m13
    pshuflw m5, m13, 0xe
    movss [outq+0x60], m14
    pshuflw m6, m14, 0xe
    movaps [outq+0x70], m15
    pshuflw m7, m15, 0xe
    addss   m0, m1
    addss   m1, m2
    movss [outq+0x08], m0
    addss   m2, m3
    movss [outq+0x18], m1
    addss   m3, m4
    movss [outq+0x28], m2
    addss   m4, m5
    movss [outq+0x38], m3
    addss   m5, m6
    movss [outq+0x48], m4
    addss   m6, m7
    movss [outq+0x58], m5
    movss [outq+0x68], m6
    movss [outq+0x78], m7

    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    movhlps m0, m1
    pshufd  m1, m1, 3
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
    movhlps m0, m1
    pshufd  m1, m1, 3
    addss   m15, m1
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
    addss m0, m1
    movss [outq+i], m0
    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
%macro SPILL 2 ; xmm#, mempos
    movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, [outq+(%2-8)*16]
%endmacro

%define PASS6 PASS6_AND_PERMUTE
%macro PASS5 0
    movaps      m2, [ps_cos_vec+160]
    shufps      m3, m3, 0xcc

    BUTTERFLY3  m5, m3, m2, m1
    SPILL 5, 8

    UNSPILL 1, 9
    BUTTERFLY3  m1, m3, m2, m5
    SPILL 1, 14

    BUTTERFLY3  m4, m3, m2, m5
    SPILL 4, 12

    BUTTERFLY3  m7, m3, m2, m5
    SPILL 7, 13

    UNSPILL 5, 10
    BUTTERFLY3  m5, m3, m2, m7
    SPILL 5, 10

    UNSPILL 4, 11
    BUTTERFLY3  m4, m3, m2, m7
    SPILL 4, 11

    BUTTERFLY3  m6, m3, m2, m7
    SPILL 6, 9

    BUTTERFLY3  m0, m3, m2, m7
    SPILL 0, 15
%endmacro
%endif


; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
    ; pass 1

    movaps      m0, [inq+0]
    LOAD_INV    m1, [inq+112]
    BUTTERFLY   m0, m1, [ps_cos_vec], m3

    movaps      m7, [inq+64]
    LOAD_INV    m4, [inq+48]
    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps      m2, [ps_cos_vec+64]
    BUTTERFLY   m1, m4, m2, m3
    SPILL 1, 11
    SPILL 4, 8

    ; pass 1
    movaps      m1, [inq+16]
    LOAD_INV    m6, [inq+96]
    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3

    movaps      m4, [inq+80]
    LOAD_INV    m5, [inq+32]
    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3

    ; pass 2
    BUTTERFLY   m0, m7, m2, m3

    movaps      m2, [ps_cos_vec+80]
    BUTTERFLY   m6, m5, m2, m3

    BUTTERFLY   m1, m4, m2, m3

    ; pass 3
    movaps      m2, [ps_cos_vec+96]
    shufps      m1, m1, 0x1b
    BUTTERFLY   m0, m1, m2, m3
    SPILL 0, 15
    SPILL 1, 14

    UNSPILL 0, 8
    shufps      m5, m5, 0x1b
    BUTTERFLY   m0, m5, m2, m3

    UNSPILL 1, 11
    shufps      m6, m6, 0x1b
    BUTTERFLY   m1, m6, m2, m3
    SPILL 1, 11

    shufps      m4, m4, 0x1b
    BUTTERFLY   m7, m4, m2, m3

    ; pass 4
    movaps      m3, [ps_p1p1m1m1+0]
    movaps      m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m3, m2, m1

    BUTTERFLY2  m0, m3, m2, m1
    SPILL 0, 9

    BUTTERFLY2  m6, m3, m2, m1
    SPILL 6, 10

    UNSPILL 0, 11
    BUTTERFLY2  m0, m3, m2, m1
    SPILL 0, 11

    BUTTERFLY2  m4, m3, m2, m1

    BUTTERFLY2  m7, m3, m2, m1

    UNSPILL 6, 14
    BUTTERFLY2  m6, m3, m2, m1

    UNSPILL 0, 15
    BUTTERFLY2  m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro

%macro LOAD_INV 2
%if cpuflag(sse2)
    pshufd      %1, %2, 0x1b
%elif cpuflag(sse)
    movaps      %1, %2
    shufps      %1, %1, 0x1b
%endif
%endmacro

%if ARCH_X86_32
INIT_XMM sse
DCT32_FUNC
%endif

INIT_XMM sse2
DCT32_FUNC
41
externals/ffmpeg/libavcodec/x86/dct_init.c
vendored
Executable file
@@ -0,0 +1,41 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"

void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);

av_cold void ff_dct_init_x86(DCTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags))
        s->dct32 = ff_dct32_float_sse;
#endif
    if (EXTERNAL_SSE2(cpu_flags))
        s->dct32 = ff_dct32_float_sse2;
    if (EXTERNAL_AVX_FAST(cpu_flags))
        s->dct32 = ff_dct32_float_avx;
}
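
/* Usage sketch (illustrative, not part of the file): once ff_dct_init_x86(s)
 * has picked a kernel, callers go through the context pointer rather than
 * naming a variant:
 *
 *     ff_dct_init_x86(s);
 *     s->dct32(out, in);   // 32-point float DCT on suitably aligned buffers
 */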
307
externals/ffmpeg/libavcodec/x86/dirac_dwt.asm
vendored
Executable file
@@ -0,0 +1,307 @@
;******************************************************************************
;* x86 optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_1991: times 4 dw 9,-1

cextern pw_1
cextern pw_2
cextern pw_8
cextern pw_16

SECTION .text

; %1 -= (%2 + %3 + 2)>>2   %4 is pw_2
%macro COMPOSE_53iL0 4
    paddw %2, %3
    paddw %2, %4
    psraw %2, 2
    psubw %1, %2
%endm
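
; A scalar sketch of the lifting step COMPOSE_53iL0 vectorizes (operand
; names are illustrative; the C tail code in dirac_dwt_init.c performs the
; same arithmetic per sample):
;     b1[i] -= (b0[i] + b2[i] + 2) >> 2;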

; m1 = %1 + (-m0 + 9*m1 + 9*%2 - %3 + 8)>>4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered   m3: pw_8   m4: pw_1991
%macro COMPOSE_DD97iH0 3-4
    paddw m0, %3
    paddw m1, %2
    psubw m0, m3
    mova m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
%if %0 > 3
    movu %1, %4
%endif
    psrad m1, 4
    psrad m2, 4
    packssdw m1, m2
    paddw m1, %1
%endm
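
; Scalar sketch of the Deslauriers-Dubuc (9,7) high-pass update this macro
; implements, as applied by vertical_compose_dd97iH0 below (names
; illustrative):
;     b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 8) >> 4;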

%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                            int width)
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
    mova m2, [pw_2]
%if ARCH_X86_64
    mov widthd, widthd
%endif
.loop:
    sub widthq, mmsize/2
    mova m1, [b0q+2*widthq]
    mova m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova [b1q+2*widthq], m0
    jg .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
    mova m1, [pw_1]
%if ARCH_X86_64
    mov widthd, widthd
%endif
.loop:
    sub widthq, mmsize/2
    mova m0, [b0q+2*widthq]
    paddw m0, [b2q+2*widthq]
    paddw m0, m1
    psraw m0, 1
    paddw m0, [b1q+2*widthq]
    mova [b1q+2*widthq], m0
    jg .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova m3, [pw_8]
    mova m4, [pw_1991]
%if ARCH_X86_64
    mov widthd, widthd
%endif
.loop:
    sub widthq, mmsize/2
    mova m0, [b0q+2*widthq]
    mova m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova [b2q+2*widthq], m1
    jg .loop
    REP_RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova m3, [pw_16]
    mova m4, [pw_1991]
%if ARCH_X86_64
    mov widthd, widthd
%endif
.loop:
    sub widthq, mmsize/2
    mova m0, [b0q+2*widthq]
    mova m1, [b1q+2*widthq]
    mova m5, [b2q+2*widthq]
    paddw m0, [b4q+2*widthq]
    paddw m1, [b3q+2*widthq]
    psubw m0, m3
    mova m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
    psrad m1, 5
    psrad m2, 5
    packssdw m1, m2
    psubw m5, m1
    mova [b2q+2*widthq], m5
    jg .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
    mova m3, [pw_1]
%if ARCH_X86_64
    mov widthd, widthd
%endif
.loop:
    sub widthq, mmsize/2
    mova m1, [b1q+2*widthq]
    mova m0, [b0q+2*widthq]
    mova m2, m1
    paddw m1, m3
    psraw m1, 1
    psubw m0, m1
    mova [b0q+2*widthq], m0
    paddw m2, m0
    mova [b1q+2*widthq], m2
    jg .loop
    REP_RET
%endmacro

; extend the left and right edges of the tmp array by %1 and %2 respectively
%macro EDGE_EXTENSION 3
    mov %3, [tmpq]
%assign %%i 1
%rep %1
    mov [tmpq-2*%%i], %3
%assign %%i %%i+1
%endrep
    mov %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov [tmpq+2*w2q+2*%%i], %3
%assign %%i %%i+1
%endrep
%endmacro
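
; C sketch of EDGE_EXTENSION (replicate the edge samples of tmp so the
; filter below may read past both ends; %3 is a scratch register):
;     for (i = 1; i <= left;  i++) tmp[-i]     = tmp[0];
;     for (i = 0; i <  right; i++) tmp[w2 + i] = tmp[w2 - 1];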


%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov w2d, wd
    xor xq, xq
    shr w2d, 1
    lea b_w2q, [bq+wq]
    mova m3, [pw_1]
.lowpass_loop:
    movu m1, [b_w2q + 2*xq]
    mova m0, [bq + 2*xq]
    paddw m1, m3
    psraw m1, 1
    psubw m0, m1
    mova [tmpq + 2*xq], m0
    add xq, mmsize/2
    cmp xq, w2q
    jl .lowpass_loop

    xor xq, xq
    and w2q, ~(mmsize/2 - 1)
    cmp w2q, mmsize/2
    jl .end

.highpass_loop:
    movu m1, [b_w2q + 2*xq]
    mova m0, [tmpq + 2*xq]
    paddw m1, m0

    ; shift and interleave
%if %2 == 1
    paddw m0, m3
    paddw m1, m3
    psraw m0, 1
    psraw m1, 1
%endif
    mova m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova [bq+4*xq], m0
    mova [bq+4*xq+mmsize], m2

    add xq, mmsize/2
    cmp xq, w2q
    jl .highpass_loop
.end:
    REP_RET
%endmacro


INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov w2d, wd
    xor xd, xd
    shr w2d, 1
    lea b_w2q, [bq+wq]
    movu m4, [bq+wq]
    mova m7, [pw_2]
    pslldq m4, 14
.lowpass_loop:
    movu m1, [b_w2q + 2*xq]
    mova m0, [bq + 2*xq]
    mova m2, m1
    palignr m1, m4, 14
    mova m4, m2
    COMPOSE_53iL0 m0, m1, m2, m7
    mova [tmpq + 2*xq], m0
    add xd, mmsize/2
    cmp xd, w2d
    jl .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor xd, xd
    and w2d, ~(mmsize/2 - 1)
    cmp w2d, mmsize/2
    jl .end

    mova m7, [tmpq-mmsize]
    mova m0, [tmpq]
    mova m5, [pw_1]
    mova m3, [pw_8]
    mova m4, [pw_1991]
.highpass_loop:
    mova m6, m0
    palignr m0, m7, 14
    mova m7, [tmpq + 2*xq + 16]
    mova m1, m7
    mova m2, m7
    palignr m1, m6, 2
    palignr m2, m6, 4
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova m0, m7
    mova m7, m6

    ; shift and interleave
    paddw m6, m5
    paddw m1, m5
    psraw m6, 1
    psraw m1, 1
    mova m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova [bq+4*xq], m6
    mova [bq+4*xq+mmsize], m2

    add xd, mmsize/2
    cmp xd, w2d
    jl .highpass_loop
.end:
    REP_RET


%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif

;;INIT_XMM
INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1
229
externals/ffmpeg/libavcodec/x86/dirac_dwt_init.c
vendored
Executable file
@@ -0,0 +1,229 @@
/*
 * x86 optimized discrete wavelet transform
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dirac_dwt.h"

#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
\
static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                           uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                          uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
\
    for(i=width_align; i<width; i++) { \
        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
    } \
\
    ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
    int w2= w>>1;\
    int x= w2 - (w2&(align-1));\
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
    for (; x < w2; x++) {\
        b[2*x  ] = tmp[x];\
        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
    }\
}\
static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
    int w2= w>>1;\
    int x= w2 - (w2&(align-1));\
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
    for (; x < w2; x++) {\
        b[2*x  ] = (tmp[x] + 1)>>1;\
        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
    }\
}\
\

#if HAVE_X86ASM
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)


void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);

static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
{
    int w2= w>>1;
    int x= w2 - (w2&7);
    int16_t *b   = (int16_t *)_b;
    int16_t *tmp = (int16_t *)_tmp;

    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);

    for (; x < w2; x++) {
        b[2*x  ] = (tmp[x] + 1)>>1;
        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
    }
}
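
/* All wrappers above follow one split: the SIMD routine consumes the width
 * rounded down to the vector alignment, while plain C covers the remaining
 * samples. A sketch of the pattern (names illustrative):
 *
 *     int aligned = width & ~(align - 1);
 *     for (i = aligned; i < width; i++)
 *         b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]);   // scalar tail
 *     ff_vertical_compose53iL0_sse2(b0, b1, b2, aligned); // SIMD bulk
 */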
#endif

void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
    int mm_flags = av_get_cpu_flags();

#if !ARCH_X86_64
    if (!(mm_flags & AV_CPU_FLAG_MMX))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
        d->horizontal_compose = horizontal_compose_haar0i_mmx;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
        d->horizontal_compose = horizontal_compose_haar1i_mmx;
        break;
    }
#endif

    if (!(mm_flags & AV_CPU_FLAG_SSE2))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
        d->horizontal_compose = horizontal_compose_haar0i_sse2;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
        d->horizontal_compose = horizontal_compose_haar1i_sse2;
        break;
    }

    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
        break;
    }
#endif // HAVE_X86ASM
}
348
externals/ffmpeg/libavcodec/x86/diracdsp.asm
vendored
Executable file
@@ -0,0 +1,348 @@
;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_7: times 8 dw 7
convert_to_unsigned_10bit: times 4 dd 0x200
clip_10bit: times 8 dw 0x3ff

cextern pw_3
cextern pw_16
cextern pw_32
cextern pb_80

SECTION .text

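; UNPACK_ADD loads two rows of bytes with mov%5/mov%6, zero-extends them to
; words against m7 (which the callers keep zeroed), and sums the pair: the
; low halves of the widened sum land in %1 and the high halves in %2;
; m4 and m5 are clobbered.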
%macro UNPACK_ADD 6
    mov%5 %1, %3
    mov%6 m5, %4
    mova m4, %1
    mova %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw %1, m5
    paddw %2, m4
%endmacro

%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov src0q, srcq
    lea stridex3q, [3*strideq]
    sub src0q, stridex3q
    pxor m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw m0, [pw_7]
    pmullw m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw m0, m2
    paddw m1, m3
    pmullw m0, [pw_3]
    pmullw m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw m2, [pw_7]
    pmullw m3, [pw_7]
    psubw m0, m2
    psubw m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw m0, m2
    psubw m1, m3

    paddw m0, [pw_16]
    paddw m1, [pw_16]
    psraw m0, 5
    psraw m1, 5
    packuswb m0, m1
    mova [dstq], m0
    add dstq, mmsize
    add srcq, mmsize
    add src0q, mmsize
    sub widthd, mmsize
    jg .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec widthd
    pxor m7, m7
    and widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw m0, [pw_7]
    pmullw m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw m0, m2
    paddw m1, m3
    pmullw m0, [pw_3]
    pmullw m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw m2, [pw_7]
    pmullw m3, [pw_7]
    psubw m0, m2
    psubw m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw m0, m2
    psubw m1, m3

    paddw m0, [pw_16]
    paddw m1, [pw_16]
    psraw m0, 5
    psraw m1, 5
    packuswb m0, m1
    mova [dstq + widthq], m0
    sub widthd, mmsize
    jge .loop
    RET
%endmacro
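
; Scalar sketch of the 8-tap halfpel filter both loops implement (the taps
; follow from the inline comments above; packuswb provides the final clamp):
;     dst[x] = av_clip_uint8((-(s[-3] + s[4]) + 3*(s[-2] + s[3])
;                             - 7*(s[-1] + s[2]) + 21*(s[0] + s[1]) + 16) >> 5);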

%macro PUT_RECT 1
; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova m0, [pb_80]
    add wd, (mmsize-1)
    and wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd dst_strideq, dst_strided
    movsxd src_strideq, src_strided
    mov r7d, r5m
    mov r8d, wd
%define wspill r8d
%define hd r7d
%else
    mov r4m, wd
%define wspill r4m
%define hd r5mp
%endif

.loopy:
    lea src2q, [srcq+src_strideq]
    lea dst2q, [dstq+dst_strideq]
.loopx:
    sub wd, mmsize
    mova m1, [srcq +2*wq]
    mova m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb m1, m0
    paddb m2, m0
    mova [dstq +wq], m1
    mova [dst2q+wq], m2
    jg .loopx

    lea srcq, [srcq+src_strideq*2]
    lea dstq, [dstq+dst_strideq*2]
    sub hd, 2
    mov wd, wspill
    jg .loopy
    RET
%endm

%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova m0, [pw_32]
    add wd, (mmsize-1)
    and wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd strideq, strided
    movsxd idwt_strideq, idwt_strided
    mov r8d, wd
%define wspill r8d
%else
    mov r5m, wd
%define wspill r5m
%endif

.loop:
    sub wd, mmsize
    movu m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw m1, m0
    psraw m1, 6
    movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw m2, m0
    psraw m2, 6
    paddw m1, [idwtq+2*wq]
    paddw m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova [dstq +wq], m1
    jg .loop

    lea srcq, [srcq + 2*strideq]
    add dstq, strideq
    lea idwtq, [idwtq+ 2*idwt_strideq]
    sub hd, 1
    mov wd, wspill
    jg .loop
    RET
%endm

%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova m0, [srcq+i]
    mova m1, m0
    punpcklbw m0, m4
    punpckhbw m1, m4
    mova m2, [obmcq+i]
    mova m3, m2
    punpcklbw m2, m4
    punpckhbw m3, m4
    pmullw m0, m2
    pmullw m1, m3
    movu m2, [dstq+2*i]
    movu m3, [dstq+2*i+mmsize]
    paddw m0, m2
    paddw m1, m3
    movu [dstq+2*i], m0
    movu [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea srcq, [srcq+strideq]
    lea dstq, [dstq+2*strideq]
    add obmcq, 32
    sub yblend, 1
    jg .loop
    RET
%endm

INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2

INIT_XMM sse4

; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
    movd m2, qfd
    movd m3, qsd
    SPLATD m2
    SPLATD m3
    mov r4d, tot_hd
    mov r3, dstq

.loop_v:
    mov tot_hq, r4
    mov dstq, r3

.loop_h:
    movu m0, [srcq]

    pabsd m1, m0
    pmulld m1, m2
    paddd m1, m3
    psrld m1, 2
    psignd m1, m0

    movu [dstq], m1

    add srcq, mmsize
    add dstq, mmsize
    sub tot_hq, 4
    jg .loop_h
    lea srcq, [srcq + 4*tot_hq]

    add r3, strideq
    dec tot_vd
    jg .loop_v

    RET
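
; Scalar sketch of dequant_subband_32 (sign-preserving dequantization of
; 32-bit coefficients; psignd also zeroes outputs where the input is zero):
;     out[i] = sign(in[i]) * ((abs(in[i]) * qf + qs) >> 2);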

INIT_XMM sse4
; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
%if ARCH_X86_64
cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
%else
cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
%define hd r5mp
%endif
    shl wd, 2
    add srcq, wq
    neg wq
    mov t2q, dstq
    mov t1q, wq
    pxor m2, m2
    mova m3, [clip_10bit]
    mova m4, [convert_to_unsigned_10bit]

.loop_h:
    mov dstq, t2q
    mov wq, t1q

.loop_w:
    movu m0, [srcq+wq+0*mmsize]
    movu m1, [srcq+wq+1*mmsize]

    paddd m0, m4
    paddd m1, m4
    packusdw m0, m0, m1
    CLIPW m0, m2, m3 ; packusdw saturates so it's fine

    movu [dstq], m0

    add dstq, 1*mmsize
    add wq, 2*mmsize
    jl .loop_w

    add srcq, src_strideq
    add t2q, dst_strideq
    sub hd, 1
    jg .loop_h

    RET
195
externals/ffmpeg/libavcodec/x86/diracdsp_init.c
vendored
Executable file
@@ -0,0 +1,195 @@
/*
 * Copyright (C) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/cpu.h"
#include "libavcodec/diracdsp.h"
#include "fpel.h"

DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);

void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);

void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);

void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);

void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);

void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);

#if HAVE_X86ASM

#define HPEL_FILTER(MMSIZE, EXT) \
void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
\
static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
                                      const uint8_t *src, int stride, int width, int height) \
{ \
    while( height-- ) \
    { \
        ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
        ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
        ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
\
        dsth += stride; \
        dstv += stride; \
        dstc += stride; \
        src  += stride; \
    } \
}

#define PIXFUNC(PFX, IDX, EXT) \
    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT

#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h&3)\
        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
    else\
        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h&3)\
        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
    else\
        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h&3) {\
        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
    } else {\
        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
    }\
}

DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3)
        ff_put_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3)
        ff_avg_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3) {
        ff_put_dirac_pixels32_c(dst, src, stride, h);
    } else {
        ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3) {
        ff_avg_dirac_pixels32_c(dst, src, stride, h);
    } else {
        ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}

#else // HAVE_X86ASM

#define HPEL_FILTER(MMSIZE, EXT) \
void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
                               const uint8_t *src, int stride, int width, int height);

#define PIXFUNC(PFX, IDX, EXT) do {} while (0)

#endif // HAVE_X86ASM

#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)

void ff_diracdsp_init_x86(DiracDSPContext* c)
{
    int mm_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(mm_flags)) {
        c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
#if !ARCH_X86_64
        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
        c->dirac_hpel_filter = dirac_hpel_filter_mmx;
        c->add_rect_clamped = ff_add_rect_clamped_mmx;
        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
#endif
        PIXFUNC(put, 0, mmx);
        PIXFUNC(avg, 0, mmx);
    }

    if (EXTERNAL_MMXEXT(mm_flags)) {
        PIXFUNC(avg, 0, mmxext);
    }

    if (EXTERNAL_SSE2(mm_flags)) {
        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
        c->add_rect_clamped = ff_add_rect_clamped_sse2;
        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;

        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;

        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
    }

    if (EXTERNAL_SSE4(mm_flags)) {
        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
    }
}
49
externals/ffmpeg/libavcodec/x86/dnxhdenc.asm
vendored
Executable file
@@ -0,0 +1,49 @@
;******************************************************************************
;* VC3/DNxHD SIMD functions
;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
;                              ptrdiff_t line_size)
INIT_XMM sse2
cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
    pxor m4, m4
    movq m0, [pixelsq]
    add pixelsq, linesizeq
    movq m1, [pixelsq]
    movq m2, [pixelsq+linesizeq]
    movq m3, [pixelsq+linesizeq*2]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    mova [blockq    ], m0
    mova [blockq+16 ], m1
    mova [blockq+32 ], m2
    mova [blockq+48 ], m3
    mova [blockq+64 ], m3
    mova [blockq+80 ], m2
    mova [blockq+96 ], m1
    mova [blockq+112], m0
    RET
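
; Note: the eight mova stores above write rows 0..3 twice, mirrored as
; 0,1,2,3,3,2,1,0 - this vertical symmetry is what the "sym" in the
; function name refers to.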
37
externals/ffmpeg/libavcodec/x86/dnxhdenc_init.c
vendored
Executable file
@@ -0,0 +1,37 @@
/*
 * VC3/DNxHD SIMD functions
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dnxhdenc.h"

void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size);

av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
    if (EXTERNAL_SSE2(av_get_cpu_flags())) {
        if (ctx->cid_table->bit_depth == 8)
            ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
    }
}
118
externals/ffmpeg/libavcodec/x86/exrdsp.asm
vendored
Executable file
@@ -0,0 +1,118 @@
;******************************************************************************
;* X86 Optimized functions for Open Exr Decoder
;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
;*
;* reorder_pixels, predictor based on patch by John Loy
;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
;*
;* predictor AVX/AVX2 by Henrik Gramner
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pb_15
cextern pb_80

SECTION .text

;------------------------------------------------------------------------------
; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
;------------------------------------------------------------------------------

%macro REORDER_PIXELS 0
cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
    lea src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
    add dstq, sizeq          ; dst offset by size
    shr sizeq, 1             ; half_size
    add src1q, sizeq         ; offset src by half_size
    neg sizeq                ; size = offset for dst, src1, src2
.loop:

    mova m0, [src1q+sizeq]   ; load first part
    movu m1, [src2q+sizeq]   ; load second part
    SBUTTERFLY bw, 0, 1, 2   ; interleaved
    mova [dstq+2*sizeq   ], xm0 ; copy to dst
    mova [dstq+2*sizeq+16], xm1
%if cpuflag(avx2)
    vperm2i128 m0, m0, m1, q0301
    mova [dstq+2*sizeq+32], m0
%endif
    add sizeq, mmsize
    jl .loop
    RET
%endmacro
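
; C sketch of reorder_pixels (interleave the two half-size planes back into
; pixel order; the asm assumes the even, vector-aligned case):
;     for (i = 0; i < size / 2; i++) {
;         dst[2*i]   = src[i];
;         dst[2*i+1] = src[size/2 + i];
;     }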

INIT_XMM sse2
REORDER_PIXELS

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
REORDER_PIXELS
%endif


;------------------------------------------------------------------------------
; void ff_predictor(uint8_t *src, ptrdiff_t size);
;------------------------------------------------------------------------------

%macro PREDICTOR 0
cglobal predictor, 2,2,5, src, size
    mova m0, [pb_80]
    mova xm1, [pb_15]
    mova xm2, xm0
    add srcq, sizeq
    neg sizeq
.loop:
    pxor m3, m0, [srcq + sizeq]
    pslldq m4, m3, 1
    paddb m3, m4
    pslldq m4, m3, 2
    paddb m3, m4
    pslldq m4, m3, 4
    paddb m3, m4
    pslldq m4, m3, 8
%if mmsize == 32
    paddb m3, m4
    paddb xm2, xm3
    vextracti128 xm4, m3, 1
    mova [srcq + sizeq], xm2
    pshufb xm2, xm1
    paddb xm2, xm4
    mova [srcq + sizeq + 16], xm2
%else
    paddb m2, m3
    paddb m2, m4
    mova [srcq + sizeq], m2
%endif
    pshufb xm2, xm1
    add sizeq, mmsize
    jl .loop
    RET
%endmacro
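
; Scalar form of the predictor this byte-wise prefix sum vectorizes (the
; first byte seeds the running sum; the xor with 0x80 above is the same
; "- 128" modulo 256):
;     for (i = 1; i < size; i++)
;         src[i] += src[i-1] - 128;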

INIT_XMM ssse3
PREDICTOR

INIT_XMM avx
PREDICTOR

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
PREDICTOR
%endif
52
externals/ffmpeg/libavcodec/x86/exrdsp_init.c
vendored
Executable file
@@ -0,0 +1,52 @@
/*
 * OpenEXR (.exr) image decoder
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/exrdsp.h"

void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);

void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);

void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);

void ff_predictor_avx(uint8_t *src, ptrdiff_t size);

void ff_predictor_avx2(uint8_t *src, ptrdiff_t size);

av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags)) {
        dsp->reorder_pixels = ff_reorder_pixels_sse2;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        dsp->predictor = ff_predictor_ssse3;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
        dsp->predictor = ff_predictor_avx;
    }
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        dsp->reorder_pixels = ff_reorder_pixels_avx2;
        dsp->predictor = ff_predictor_avx2;
    }
}
594
externals/ffmpeg/libavcodec/x86/fdct.c
vendored
Executable file
@@ -0,0 +1,594 @@
/*
 * SIMD-optimized forward DCT
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
 *
 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 *
 * Intel Application Note AP-922 - fast, precise implementation of DCT
 * http://developer.intel.com/vtune/cbts/appnotes.htm
 *
 * Also of inspiration:
 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "fdct.h"

#if HAVE_MMX_INLINE

//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries! Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////

#define BITS_FRW_ACC   3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL  BITS_FRW_ACC
#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL  (1 << (SHIFT_FRW_COL-1))

#define X8(x) x,x,x,x,x,x,x,x

//concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
    X8(13036),  // tg * (2<<16) + 0.5
    X8(27146),  // tg * (2<<16) + 0.5
    X8(-21746)  // tg * (2<<16) + 0.5
};
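
// (For reference: the three rows are tan(pi/16), tan(2*pi/16) and
// tan(3*pi/16) scaled by 2^16 and rounded; the third exceeds INT16_MAX
// and is stored wrapped, hence the negative constant.)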
|
||||
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
|
||||
X8(23170) //cos * (2<<15) + 0.5
|
||||
};
|
||||
|
||||
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
|
||||
|
||||
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
|
||||
|
||||
static const struct
|
||||
{
|
||||
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
|
||||
} fdct_r_row_sse2 =
|
||||
{{
|
||||
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
|
||||
}};
|
||||
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
|
||||
|
||||
DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
|
||||
16384, 16384, 22725, 19266,
|
||||
16384, 16384, 12873, 4520,
|
||||
21407, 8867, 19266, -4520,
|
||||
-8867, -21407, -22725, -12873,
|
||||
16384, -16384, 12873, -22725,
|
||||
-16384, 16384, 4520, 19266,
|
||||
8867, -21407, 4520, -12873,
|
||||
21407, -8867, 19266, -22725,
|
||||
|
||||
22725, 22725, 31521, 26722,
|
||||
22725, 22725, 17855, 6270,
|
||||
29692, 12299, 26722, -6270,
|
||||
-12299, -29692, -31521, -17855,
|
||||
22725, -22725, 17855, -31521,
|
||||
-22725, 22725, 6270, 26722,
|
||||
12299, -29692, 6270, -17855,
|
||||
29692, -12299, 26722, -31521,
|
||||
|
||||
21407, 21407, 29692, 25172,
|
||||
21407, 21407, 16819, 5906,
|
||||
27969, 11585, 25172, -5906,
|
||||
-11585, -27969, -29692, -16819,
|
||||
21407, -21407, 16819, -29692,
|
||||
-21407, 21407, 5906, 25172,
|
||||
11585, -27969, 5906, -16819,
|
||||
27969, -11585, 25172, -29692,
|
||||
|
||||
19266, 19266, 26722, 22654,
|
||||
19266, 19266, 15137, 5315,
|
||||
25172, 10426, 22654, -5315,
|
||||
-10426, -25172, -26722, -15137,
|
||||
19266, -19266, 15137, -26722,
|
||||
-19266, 19266, 5315, 22654,
|
||||
10426, -25172, 5315, -15137,
|
||||
25172, -10426, 22654, -26722,
|
||||
|
||||
16384, 16384, 22725, 19266,
|
||||
16384, 16384, 12873, 4520,
|
||||
21407, 8867, 19266, -4520,
|
||||
-8867, -21407, -22725, -12873,
|
||||
16384, -16384, 12873, -22725,
|
||||
-16384, 16384, 4520, 19266,
|
||||
8867, -21407, 4520, -12873,
|
||||
21407, -8867, 19266, -22725,
|
||||
|
||||
19266, 19266, 26722, 22654,
|
||||
19266, 19266, 15137, 5315,
|
||||
25172, 10426, 22654, -5315,
|
||||
-10426, -25172, -26722, -15137,
|
||||
19266, -19266, 15137, -26722,
|
||||
-19266, 19266, 5315, 22654,
|
||||
10426, -25172, 5315, -15137,
|
||||
25172, -10426, 22654, -26722,
|
||||
|
||||
21407, 21407, 29692, 25172,
|
||||
21407, 21407, 16819, 5906,
|
||||
27969, 11585, 25172, -5906,
|
||||
-11585, -27969, -29692, -16819,
|
||||
21407, -21407, 16819, -29692,
|
||||
-21407, 21407, 5906, 25172,
|
||||
11585, -27969, 5906, -16819,
|
||||
27969, -11585, 25172, -29692,
|
||||
|
||||
22725, 22725, 31521, 26722,
|
||||
22725, 22725, 17855, 6270,
|
||||
29692, 12299, 26722, -6270,
|
||||
-12299, -29692, -31521, -17855,
|
||||
22725, -22725, 17855, -31521,
|
||||
-22725, 22725, 6270, 26722,
|
||||
12299, -29692, 6270, -17855,
|
||||
29692, -12299, 26722, -31521,
|
||||
};
|
||||
|
static const struct
{
    DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = {  // forward_dct coeff table
#define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};

#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long

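/* Column pass of the 8x8 forward DCT, instantiated once per instruction set.
 * The mm/mov parameters select the register file and move instruction: the
 * MMX build handles four 16-bit columns per call (it is invoked twice, with
 * offsets 0 and 4), while the SSE2 build covers all eight columns at once. */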
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
    __asm__ volatile (\
        #mov"      16(%0),  %%"#mm"0 \n\t" \
        #mov"      96(%0),  %%"#mm"1 \n\t" \
        #mov"      %%"#mm"0, %%"#mm"2 \n\t" \
        #mov"      32(%0),  %%"#mm"3 \n\t" \
        "paddsw    %%"#mm"1, %%"#mm"0 \n\t" \
        #mov"      80(%0),  %%"#mm"4 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
        #mov"      (%0),    %%"#mm"5 \n\t" \
        "paddsw    %%"#mm"3, %%"#mm"4 \n\t" \
        "paddsw    112(%0), %%"#mm"5 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
        #mov"      %%"#mm"0, %%"#mm"6 \n\t" \
        "psubsw    %%"#mm"1, %%"#mm"2 \n\t" \
        #mov"      16(%1),  %%"#mm"1 \n\t" \
        "psubsw    %%"#mm"4, %%"#mm"0 \n\t" \
        #mov"      48(%0),  %%"#mm"7 \n\t" \
        "pmulhw    %%"#mm"0, %%"#mm"1 \n\t" \
        "paddsw    64(%0),  %%"#mm"7 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
        "paddsw    %%"#mm"4, %%"#mm"6 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
        #mov"      %%"#mm"5, %%"#mm"4 \n\t" \
        "psubsw    %%"#mm"7, %%"#mm"5 \n\t" \
        "paddsw    %%"#mm"5, %%"#mm"1 \n\t" \
        "paddsw    %%"#mm"7, %%"#mm"4 \n\t" \
        "por       (%2),    %%"#mm"1 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
        "pmulhw    16(%1),  %%"#mm"5 \n\t" \
        #mov"      %%"#mm"4, %%"#mm"7 \n\t" \
        "psubsw    80(%0),  %%"#mm"3 \n\t" \
        "psubsw    %%"#mm"6, %%"#mm"4 \n\t" \
        #mov"      %%"#mm"1, 32(%3)  \n\t" \
        "paddsw    %%"#mm"6, %%"#mm"7 \n\t" \
        #mov"      48(%0),  %%"#mm"1 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
        "psubsw    64(%0),  %%"#mm"1 \n\t" \
        #mov"      %%"#mm"2, %%"#mm"6 \n\t" \
        #mov"      %%"#mm"4, 64(%3)  \n\t" \
        "paddsw    %%"#mm"3, %%"#mm"2 \n\t" \
        "pmulhw    (%4),    %%"#mm"2 \n\t" \
        "psubsw    %%"#mm"3, %%"#mm"6 \n\t" \
        "pmulhw    (%4),    %%"#mm"6 \n\t" \
        "psubsw    %%"#mm"0, %%"#mm"5 \n\t" \
        "por       (%2),    %%"#mm"5 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
        "por       (%2),    %%"#mm"2 \n\t" \
        #mov"      %%"#mm"1, %%"#mm"4 \n\t" \
        #mov"      (%0),    %%"#mm"3 \n\t" \
        "paddsw    %%"#mm"6, %%"#mm"1 \n\t" \
        "psubsw    112(%0), %%"#mm"3 \n\t" \
        "psubsw    %%"#mm"6, %%"#mm"4 \n\t" \
        #mov"      (%1),    %%"#mm"0 \n\t" \
        "psllw     $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
        #mov"      32(%1),  %%"#mm"6 \n\t" \
        "pmulhw    %%"#mm"1, %%"#mm"0 \n\t" \
        #mov"      %%"#mm"7, (%3)    \n\t" \
        "pmulhw    %%"#mm"4, %%"#mm"6 \n\t" \
        #mov"      %%"#mm"5, 96(%3)  \n\t" \
        #mov"      %%"#mm"3, %%"#mm"7 \n\t" \
        #mov"      32(%1),  %%"#mm"5 \n\t" \
        "psubsw    %%"#mm"2, %%"#mm"7 \n\t" \
        "paddsw    %%"#mm"2, %%"#mm"3 \n\t" \
        "pmulhw    %%"#mm"7, %%"#mm"5 \n\t" \
        "paddsw    %%"#mm"3, %%"#mm"0 \n\t" \
        "paddsw    %%"#mm"4, %%"#mm"6 \n\t" \
        "pmulhw    (%1),    %%"#mm"3 \n\t" \
        "por       (%2),    %%"#mm"0 \n\t" \
        "paddsw    %%"#mm"7, %%"#mm"5 \n\t" \
        "psubsw    %%"#mm"6, %%"#mm"7 \n\t" \
        #mov"      %%"#mm"0, 16(%3)  \n\t" \
        "paddsw    %%"#mm"4, %%"#mm"5 \n\t" \
        #mov"      %%"#mm"7, 48(%3)  \n\t" \
        "psubsw    %%"#mm"1, %%"#mm"3 \n\t" \
        #mov"      %%"#mm"5, 80(%3)  \n\t" \
        #mov"      %%"#mm"3, 112(%3) \n\t" \
        : \
        : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
          "r" (out + offset), "r" (ocos_4_16)); \
}

FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)

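/* Row pass. Each FDCT_ROW_SSE2_H1/FDCT_ROW_SSE2 pair transforms one row; the
 * _H2 variant reloads only the two coefficient registers the previous row
 * clobbered, so pairs of rows share one 64-byte block of
 * tab_frw_01234567_sse2. Results are rounded via fdct_r_row_sse2 and shifted
 * right by SHIFT_FRW_ROW. */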
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
    __asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t)                    \
        "movq      " #i "(%0), %%xmm2     \n\t"  \
        "movq      " #i "+8(%0), %%xmm0   \n\t"  \
        "movdqa    " #t "+32(%1), %%xmm3  \n\t"  \
        "movdqa    " #t "+48(%1), %%xmm7  \n\t"  \
        "movdqa    " #t "(%1), %%xmm4     \n\t"  \
        "movdqa    " #t "+16(%1), %%xmm5  \n\t"

#define FDCT_ROW_SSE2_H2(i,t)                    \
        "movq      " #i "(%0), %%xmm2     \n\t"  \
        "movq      " #i "+8(%0), %%xmm0   \n\t"  \
        "movdqa    " #t "+32(%1), %%xmm3  \n\t"  \
        "movdqa    " #t "+48(%1), %%xmm7  \n\t"

#define FDCT_ROW_SSE2(i)                         \
        "movq      %%xmm2, %%xmm1         \n\t"  \
        "pshuflw   $27, %%xmm0, %%xmm0    \n\t"  \
        "paddsw    %%xmm0, %%xmm1         \n\t"  \
        "psubsw    %%xmm0, %%xmm2         \n\t"  \
        "punpckldq %%xmm2, %%xmm1         \n\t"  \
        "pshufd    $78, %%xmm1, %%xmm2    \n\t"  \
        "pmaddwd   %%xmm2, %%xmm3         \n\t"  \
        "pmaddwd   %%xmm1, %%xmm7         \n\t"  \
        "pmaddwd   %%xmm5, %%xmm2         \n\t"  \
        "pmaddwd   %%xmm4, %%xmm1         \n\t"  \
        "paddd     %%xmm7, %%xmm3         \n\t"  \
        "paddd     %%xmm2, %%xmm1         \n\t"  \
        "paddd     %%xmm6, %%xmm3         \n\t"  \
        "paddd     %%xmm6, %%xmm1         \n\t"  \
        "psrad     %3, %%xmm3             \n\t"  \
        "psrad     %3, %%xmm1             \n\t"  \
        "packssdw  %%xmm3, %%xmm1         \n\t"  \
        "movdqa    %%xmm1, " #i "(%4)     \n\t"

        "movdqa    (%2), %%xmm6           \n\t"
        FDCT_ROW_SSE2_H1(0,0)
        FDCT_ROW_SSE2(0)
        FDCT_ROW_SSE2_H2(64,0)
        FDCT_ROW_SSE2(64)

        FDCT_ROW_SSE2_H1(16,64)
        FDCT_ROW_SSE2(16)
        FDCT_ROW_SSE2_H2(112,64)
        FDCT_ROW_SSE2(112)

        FDCT_ROW_SSE2_H1(32,128)
        FDCT_ROW_SSE2(32)
        FDCT_ROW_SSE2_H2(96,128)
        FDCT_ROW_SSE2(96)

        FDCT_ROW_SSE2_H1(48,192)
        FDCT_ROW_SSE2(48)
        FDCT_ROW_SSE2_H2(80,192)
        FDCT_ROW_SSE2(80)
        :
        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
          "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
        );
}

static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
                                             const int16_t *table)
{
    __asm__ volatile (
        "pshufw    $0x1B, 8(%0), %%mm5 \n\t"
        "movq      (%0), %%mm0         \n\t"
        "movq      %%mm0, %%mm1        \n\t"
        "paddsw    %%mm5, %%mm0        \n\t"
        "psubsw    %%mm5, %%mm1        \n\t"
        "movq      %%mm0, %%mm2        \n\t"
        "punpckldq %%mm1, %%mm0        \n\t"
        "punpckhdq %%mm1, %%mm2        \n\t"
        "movq      (%1), %%mm1         \n\t"
        "movq      8(%1), %%mm3        \n\t"
        "movq      16(%1), %%mm4       \n\t"
        "movq      24(%1), %%mm5       \n\t"
        "movq      32(%1), %%mm6       \n\t"
        "movq      40(%1), %%mm7       \n\t"
        "pmaddwd   %%mm0, %%mm1        \n\t"
        "pmaddwd   %%mm2, %%mm3        \n\t"
        "pmaddwd   %%mm0, %%mm4        \n\t"
        "pmaddwd   %%mm2, %%mm5        \n\t"
        "pmaddwd   %%mm0, %%mm6        \n\t"
        "pmaddwd   %%mm2, %%mm7        \n\t"
        "pmaddwd   48(%1), %%mm0       \n\t"
        "pmaddwd   56(%1), %%mm2       \n\t"
        "paddd     %%mm1, %%mm3        \n\t"
        "paddd     %%mm4, %%mm5        \n\t"
        "paddd     %%mm6, %%mm7        \n\t"
        "paddd     %%mm0, %%mm2        \n\t"
        "movq      (%2), %%mm0         \n\t"
        "paddd     %%mm0, %%mm3        \n\t"
        "paddd     %%mm0, %%mm5        \n\t"
        "paddd     %%mm0, %%mm7        \n\t"
        "paddd     %%mm0, %%mm2        \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
        "packssdw  %%mm5, %%mm3        \n\t"
        "packssdw  %%mm2, %%mm7        \n\t"
        "movq      %%mm3, (%3)         \n\t"
        "movq      %%mm7, 8(%3)        \n\t"
        :
        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}

static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
    //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
    __asm__ volatile(
        "movd      12(%0), %%mm1       \n\t"
        "punpcklwd 8(%0), %%mm1        \n\t"
        "movq      %%mm1, %%mm2        \n\t"
        "psrlq     $0x20, %%mm1        \n\t"
        "movq      0(%0), %%mm0        \n\t"
        "punpcklwd %%mm2, %%mm1        \n\t"
        "movq      %%mm0, %%mm5        \n\t"
        "paddsw    %%mm1, %%mm0        \n\t"
        "psubsw    %%mm1, %%mm5        \n\t"
        "movq      %%mm0, %%mm2        \n\t"
        "punpckldq %%mm5, %%mm0        \n\t"
        "punpckhdq %%mm5, %%mm2        \n\t"
        "movq      0(%1), %%mm1        \n\t"
        "movq      8(%1), %%mm3        \n\t"
        "movq      16(%1), %%mm4       \n\t"
        "movq      24(%1), %%mm5       \n\t"
        "movq      32(%1), %%mm6       \n\t"
        "movq      40(%1), %%mm7       \n\t"
        "pmaddwd   %%mm0, %%mm1        \n\t"
        "pmaddwd   %%mm2, %%mm3        \n\t"
        "pmaddwd   %%mm0, %%mm4        \n\t"
        "pmaddwd   %%mm2, %%mm5        \n\t"
        "pmaddwd   %%mm0, %%mm6        \n\t"
        "pmaddwd   %%mm2, %%mm7        \n\t"
        "pmaddwd   48(%1), %%mm0       \n\t"
        "pmaddwd   56(%1), %%mm2       \n\t"
        "paddd     %%mm1, %%mm3        \n\t"
        "paddd     %%mm4, %%mm5        \n\t"
        "paddd     %%mm6, %%mm7        \n\t"
        "paddd     %%mm0, %%mm2        \n\t"
        "movq      (%2), %%mm0         \n\t"
        "paddd     %%mm0, %%mm3        \n\t"
        "paddd     %%mm0, %%mm5        \n\t"
        "paddd     %%mm0, %%mm7        \n\t"
        "paddd     %%mm0, %%mm2        \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
        "psrad     $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
        "packssdw  %%mm5, %%mm3        \n\t"
        "packssdw  %%mm2, %%mm7        \n\t"
        "movq      %%mm3, 0(%3)        \n\t"
        "movq      %%mm7, 8(%3)        \n\t"
        :
        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}

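/* Top-level entry points: run the column pass into an aligned temporary
 * block, then the row pass back into the caller's block, advancing 32 table
 * entries (one row's coefficients) per row. */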
void ff_fdct_mmx(int16_t *block)
{
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t *block1 = (int16_t*)align_tmp;
    const int16_t *table = tab_frw_01234567;
    int i;

    fdct_col_mmx(block, block1, 0);
    fdct_col_mmx(block, block1, 4);

    for (i = 8; i > 0; i--) {
        fdct_row_mmx(block1, block, table);
        block1 += 8;
        table  += 32;
        block  += 8;
    }
}

#endif /* HAVE_MMX_INLINE */

#if HAVE_MMXEXT_INLINE

void ff_fdct_mmxext(int16_t *block)
{
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t *block1 = (int16_t*)align_tmp;
    const int16_t *table = tab_frw_01234567;
    int i;

    fdct_col_mmx(block, block1, 0);
    fdct_col_mmx(block, block1, 4);

    for (i = 8; i > 0; i--) {
        fdct_row_mmxext(block1, block, table);
        block1 += 8;
        table  += 32;
        block  += 8;
    }
}

#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_SSE2_INLINE

void ff_fdct_sse2(int16_t *block)
{
    DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
    int16_t * const block1 = (int16_t*)align_tmp;

    fdct_col_sse2(block, block1, 0);
    fdct_row_sse2(block1, block);
}

#endif /* HAVE_SSE2_INLINE */
28
externals/ffmpeg/libavcodec/x86/fdct.h
vendored
Executable file
@@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_FDCT_H
#define AVCODEC_X86_FDCT_H

#include <stdint.h>

void ff_fdct_mmx(int16_t *block);
void ff_fdct_mmxext(int16_t *block);
void ff_fdct_sse2(int16_t *block);

#endif /* AVCODEC_X86_FDCT_H */
44
externals/ffmpeg/libavcodec/x86/fdctdsp_init.c
vendored
Executable file
@@ -0,0 +1,44 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fdctdsp.h"
#include "fdct.h"

av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

    if (!high_bit_depth) {
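        /* Later assignments override earlier ones, so the fastest variant
         * supported by the CPU (MMX -> MMXEXT -> SSE2) ends up in c->fdct. */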
        if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) {
            if (INLINE_MMX(cpu_flags))
                c->fdct = ff_fdct_mmx;

            if (INLINE_MMXEXT(cpu_flags))
                c->fdct = ff_fdct_mmxext;

            if (INLINE_SSE2(cpu_flags))
                c->fdct = ff_fdct_sse2;
        }
    }
}
1085
externals/ffmpeg/libavcodec/x86/fft.asm
vendored
Executable file
File diff suppressed because it is too large
38
externals/ffmpeg/libavcodec/x86/fft.h
vendored
Executable file
@@ -0,0 +1,38 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H

#include "libavcodec/fft.h"

void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);

void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);

#endif /* AVCODEC_X86_FFT_H */
61
externals/ffmpeg/libavcodec/x86/fft_init.c
vendored
Executable file
@@ -0,0 +1,61 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

#include "fft.h"

av_cold void ff_fft_init_x86(FFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (s->nbits > 16)
        return;

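    /* Ordered from oldest to newest extension: each matching block overrides
     * the previous assignments, so the best supported routines win. */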
#if ARCH_X86_32
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        s->imdct_calc = ff_imdct_calc_3dnow;
        s->imdct_half = ff_imdct_half_3dnow;
        s->fft_calc   = ff_fft_calc_3dnow;
    }

    if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
        s->imdct_calc = ff_imdct_calc_3dnowext;
        s->imdct_half = ff_imdct_half_3dnowext;
        s->fft_calc   = ff_fft_calc_3dnowext;
    }
#endif /* ARCH_X86_32 */

    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct_calc  = ff_imdct_calc_sse;
        s->imdct_half  = ff_imdct_half_sse;
        s->fft_permute = ff_fft_permute_sse;
        s->fft_calc    = ff_fft_calc_sse;
        s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
    }

    if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
        s->imdct_half      = ff_imdct_half_avx;
        s->fft_calc        = ff_fft_calc_avx;
        s->fft_permutation = FF_FFT_PERM_AVX;
    }
}
101
externals/ffmpeg/libavcodec/x86/flac_dsp_gpl.asm
vendored
Executable file
@@ -0,0 +1,101 @@
;******************************************************************************
;* FLAC DSP functions
;*
;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

INIT_XMM sse4
%if ARCH_X86_64
    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
    DECLARE_REG_TMP 5, 6
    %define length r2d

    movsxd orderq, orderd
%else
    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
    DECLARE_REG_TMP 2, 5
    %define length r2mp
%endif

; Here we assume that the maximum order value is 32. This means that we only
; need to copy a maximum of 32 samples. Therefore we let the preprocessor
; unroll this loop and copy all 32.
%assign iter 0
%rep 32/(mmsize/4)
    movu  m0,          [smpq+iter]
    movu [resq+iter],  m0
    %assign iter iter+mmsize
%endrep

lea resq,   [resq+orderq*4]
lea smpq,   [smpq+orderq*4]
lea coefsq, [coefsq+orderq*4]
sub length, orderd
movd m3, r5m
neg orderq

%define posj t0q
%define negj t1q

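; Scalar equivalent of the vector loop below, per output sample:
;     p = sum over j of coefs[j] * smp[i-j-1];   res[i] = smp[i] - (p >> shift)
; Three accumulators (m0/m4/m6) keep 12 samples in flight per iteration.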
.looplen:
|
||||
pxor m0, m0
|
||||
pxor m4, m4
|
||||
pxor m6, m6
|
||||
mov posj, orderq
|
||||
xor negj, negj
|
||||
|
||||
.looporder:
|
||||
movd m2, [coefsq+posj*4] ; c = coefs[j]
|
||||
SPLATD m2
|
||||
movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
|
||||
movu m5, [smpq+negj*4-4+mmsize]
|
||||
movu m7, [smpq+negj*4-4+mmsize*2]
|
||||
pmulld m1, m2
|
||||
pmulld m5, m2
|
||||
pmulld m7, m2
|
||||
paddd m0, m1 ; p += c * s
|
||||
paddd m4, m5
|
||||
paddd m6, m7
|
||||
|
||||
dec negj
|
||||
inc posj
|
||||
jnz .looporder
|
||||
|
||||
psrad m0, m3 ; p >>= shift
|
||||
psrad m4, m3
|
||||
psrad m6, m3
|
||||
movu m1, [smpq]
|
||||
movu m5, [smpq+mmsize]
|
||||
movu m7, [smpq+mmsize*2]
|
||||
psubd m1, m0 ; smp[i] - p
|
||||
psubd m5, m4
|
||||
psubd m7, m6
|
||||
movu [resq], m1 ; res[i] = smp[i] - (p >> shift)
|
||||
movu [resq+mmsize], m5
|
||||
movu [resq+mmsize*2], m7
|
||||
|
||||
add resq, 3*mmsize
|
||||
add smpq, 3*mmsize
|
||||
sub length, (3*mmsize)/4
|
||||
jg .looplen
|
||||
RET
|
||||
313
externals/ffmpeg/libavcodec/x86/flacdsp.asm
vendored
Executable file
@@ -0,0 +1,313 @@
;******************************************************************************
;* FLAC DSP SIMD optimizations
;*
;* Copyright (C) 2014 Loren Merritt
;* Copyright (C) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

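; Multiply-accumulate of signed dwords into a qword: a single pmacsdql
; instruction on XOP CPUs, emulated with pmuldq + paddq elsewhere (in which
; case only the first three arguments are used).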
%macro PMACSDQL 5
%if cpuflag(xop)
    pmacsdql %1, %2, %3, %1
%else
    pmuldq   %2, %3
    paddq    %1, %2
%endif
%endmacro

%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub    lend, pred_orderd
    jle .ret
    lea    decodedq, [decodedq+pred_orderq*4-8]
    lea    coeffsq, [coeffsq+pred_orderq*4]
    neg    pred_orderq
    movd   m4, qlevelm
ALIGN 16
.loop_sample:
    movd   m0, [decodedq+pred_orderq*4+8]
    add    decodedq, 8
    movd   m1, [coeffsq+pred_orderq*4]
    pxor   m2, m2
    pxor   m3, m3
    lea    jq, [pred_orderq+1]
    test   jq, jq
    jz .end_order
.loop_order:
    PMACSDQL m2, m0, m1, m2, m0
    movd   m0, [decodedq+jq*4]
    PMACSDQL m3, m1, m0, m3, m1
    movd   m1, [coeffsq+jq*4]
    inc    jq
    jl .loop_order
.end_order:
    PMACSDQL m2, m0, m1, m2, m0
    psrlq  m2, m4
    movd   m0, [decodedq]
    paddd  m0, m2
    movd   [decodedq], m0
    sub    lend, 2
    jl .ret
    PMACSDQL m3, m1, m0, m3, m1
    psrlq  m3, m4
    movd   m1, [decodedq+4]
    paddd  m1, m3
    movd   [decodedq+4], m1
    jg .loop_sample
.ret:
    REP_RET
%endmacro

%if HAVE_XOP_EXTERNAL
LPC_32 xop
%endif
LPC_32 sse4

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
    mov    lend, lenm
%endif
    movd   m3, r4m
    shl    lend, 2
    mov    in1q, [in0q + gprsize]
    mov    in0q, [in0q]
    mov    outq, [outq]
    add    in1q, lenq
    add    in0q, lenq
    add    outq, lenq
    neg    lenq

align 16
.loop:
    mova   m0, [in0q + lenq]
    mova   m1, [in1q + lenq]
%ifidn %1, ms
    psrad  m2, m1, 1
    psubd  m0, m2
%endif
%ifnidn %1, indep2
    p%4d   m2, m0, m1
%endif
    packssdw  m%2, m%2
    packssdw  m%3, m%3
    punpcklwd m%2, m%3
    psllw  m%2, m3
    mova   [outq + lenq], m%2
    add    lenq, 16
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
    mov    lend, lenm
%endif
    movd   m3, r4m
    mov    in1q, [in0q + gprsize]
    mov    in0q, [in0q]
    mov    outq, [outq]
    sub    in1q, in0q

align 16
.loop:
    mova   m0, [in0q]
    mova   m1, [in0q + in1q]
%ifidn %1, ms
    psrad  m2, m1, 1
    psubd  m0, m2
%endif
    p%5d   m2, m0, m1
    pslld  m%2, m3
    pslld  m%3, m3

    SBUTTERFLY dq, %2, %3, %4

    mova   [outq         ], m%2
    mova   [outq + mmsize], m%3

    add    in0q, mmsize
    add    outq, mmsize*2
    sub    lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add

;-----------------------------------------------------------------------------------------
;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
;                                               int len, int shift);
;-----------------------------------------------------------------------------------------
;%1 = bps
;%2 = channels
;%3 = last xmm reg used
;%4 = word/dword (shift instruction)
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
%if ARCH_X86_32
%if %2 == 6
    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    %define lend dword r3m
%else
    mov    lend, lenm
%endif
%endif
    movd   m%3, r4m

%assign %%i 1
%rep %2-1
    mov    in %+ %%i %+ q, [in0q+%%i*gprsize]
%assign %%i %%i+1
%endrep

    mov    in0q, [in0q]
    mov    outq, [outq]

%assign %%i 1
%rep %2-1
    sub    in %+ %%i %+ q, in0q
%assign %%i %%i+1
%endrep

align 16
.loop:
    mova   m0, [in0q]

%assign %%i 1
%rep REPCOUNT-1
    mova   m %+ %%i, [in0q + in %+ %%i %+ q]
%assign %%i %%i+1
%endrep

%if %1 == 32

%if %2 == 8
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    punpcklqdq m6, m0, m2
    punpckhqdq m2, m4
    shufps     m4, m0, 0xe4
    punpcklqdq m0, m1, m3
    punpckhqdq m3, m5
    shufps     m5, m1, 0xe4
    SWAP 0,6,1,4,5,3
%elif %2 == 4
    TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%else ; %1 == 16

%if %2 == 8
    packssdw m0, [in0q + in4q]
    packssdw m1, [in0q + in5q]
    packssdw m2, [in0q + in6q]
    packssdw m3, [in0q + in7q]
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
    packssdw m0, [in0q + in3q]
    packssdw m1, [in0q + in4q]
    packssdw m2, [in0q + in5q]
    pshufd    m3, m0, q1032
    punpcklwd m0, m1
    punpckhwd m1, m2
    punpcklwd m2, m3

    shufps   m3, m0, m2, q2020
    shufps   m0, m1, q2031
    shufps   m2, m1, q3131
    shufps   m1, m2, m3, q3120
    shufps   m3, m0, q0220
    shufps   m0, m2, q3113
    SWAP 2, 0, 3
%else ; %2 == 4
    packssdw m0, [in0q + in2q]
    packssdw m1, [in0q + in3q]
    SBUTTERFLY wd, 0, 1, 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%endif

%assign %%i 0
%rep REPCOUNT
    psll%4 m %+ %%i, m%3
%assign %%i %%i+1
%endrep

%assign %%i 0
%rep REPCOUNT
    mova   [outq + %%i*mmsize], m %+ %%i
%assign %%i %%i+1
%endrep

    add    in0q, mmsize
    add    outq, mmsize*REPCOUNT
    sub    lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif
115
externals/ffmpeg/libavcodec/x86/flacdsp_init.c
vendored
Executable file
@@ -0,0 +1,115 @@
/*
 * Copyright (c) 2014 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/flacdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"

void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
                        int qlevel, int len);

void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *, int);

#define DECORRELATE_FUNCS(fmt, opt) \
void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
                                          int len, int shift);                           \
void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
                                          int len, int shift);                           \
void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
                                          int len, int shift);                           \
void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift);                       \
void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift);                       \
void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift);                       \
void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift)

DECORRELATE_FUNCS(16, sse2);
DECORRELATE_FUNCS(16, avx);
DECORRELATE_FUNCS(32, sse2);
DECORRELATE_FUNCS(32, avx);

av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                                 int bps)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

#if CONFIG_FLAC_DECODER
    if (EXTERNAL_SSE2(cpu_flags)) {
        if (fmt == AV_SAMPLE_FMT_S16) {
            if (channels == 2)
                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
            else if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
        } else if (fmt == AV_SAMPLE_FMT_S32) {
            if (channels == 2)
                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
            else if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
        }
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        c->lpc32 = ff_flac_lpc_32_sse4;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
        if (fmt == AV_SAMPLE_FMT_S16) {
            if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
        } else if (fmt == AV_SAMPLE_FMT_S32) {
            if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
        }
    }
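    /* Deliberately checked after SSE4: on CPUs with XOP this overrides the
     * SSE4 LPC routine with the pmacsdql-based version. */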
    if (EXTERNAL_XOP(cpu_flags)) {
        c->lpc32 = ff_flac_lpc_32_xop;
    }
#endif

#if CONFIG_FLAC_ENCODER
    if (EXTERNAL_SSE4(cpu_flags)) {
        if (CONFIG_GPL)
            c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
    }
#endif
#endif /* HAVE_X86ASM */
}
124
externals/ffmpeg/libavcodec/x86/fmtconvert.asm
vendored
Executable file
@@ -0,0 +1,124 @@
;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
;                                    int len);
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP 0, 2
%elif ARCH_X86_32
    movss   m0, mulm
%endif
    SPLATD  m0
    shl     lend, 2
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
%if cpuflag(sse2)
    cvtdq2ps m1, [srcq+lenq   ]
    cvtdq2ps m2, [srcq+lenq+16]
%else
    cvtpi2ps m1, [srcq+lenq   ]
    cvtpi2ps m3, [srcq+lenq+ 8]
    cvtpi2ps m2, [srcq+lenq+16]
    cvtpi2ps m4, [srcq+lenq+24]
    movlhps  m1, m3
    movlhps  m2, m4
%endif
    mulps   m1, m0
    mulps   m2, m0
    mova    [dstq+lenq   ], m1
    mova    [dstq+lenq+16], m2
    add     lenq, 32
    jl .loop
%if notcpuflag(sse2)
    ;; cvtpi2ps switches to MMX even if the source is a memory location,
    ;; possibly an error in the documentation since every tested CPU disagrees
    ;; with that. Use emms anyway since the vast majority of machines will use
    ;; the SSE2 variant.
    emms
%endif
    RET
%endmacro

|
||||
INT32_TO_FLOAT_FMUL_SCALAR 5
|
||||
INIT_XMM sse2
|
||||
INT32_TO_FLOAT_FMUL_SCALAR 3
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
|
||||
; const float *mul, int len);
|
||||
;------------------------------------------------------------------------------
|
||||
%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
|
||||
cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
|
||||
shl lend, 2
|
||||
add srcq, lenq
|
||||
add dstq, lenq
|
||||
neg lenq
|
||||
.loop:
|
||||
movss m0, [mulq]
|
||||
SPLATD m0
|
||||
%if cpuflag(sse2)
|
||||
cvtdq2ps m1, [srcq+lenq ]
|
||||
cvtdq2ps m2, [srcq+lenq+16]
|
||||
%else
|
||||
cvtpi2ps m1, [srcq+lenq ]
|
||||
cvtpi2ps m3, [srcq+lenq+ 8]
|
||||
cvtpi2ps m2, [srcq+lenq+16]
|
||||
cvtpi2ps m4, [srcq+lenq+24]
|
||||
movlhps m1, m3
|
||||
movlhps m2, m4
|
||||
%endif
|
||||
mulps m1, m0
|
||||
mulps m2, m0
|
||||
mova [dstq+lenq ], m1
|
||||
mova [dstq+lenq+16], m2
|
||||
add mulq, 4
|
||||
add lenq, 32
|
||||
jl .loop
|
||||
%if notcpuflag(sse2)
|
||||
;; cvtpi2ps switches to MMX even if the source is a memory location
|
||||
;; possible an error in documentation since every tested CPU disagrees with
|
||||
;; that. Use emms anyway since the vast majority of machines will use the
|
||||
;; SSE2 variant
|
||||
emms
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse
|
||||
INT32_TO_FLOAT_FMUL_ARRAY8
|
||||
INIT_XMM sse2
|
||||
INT32_TO_FLOAT_FMUL_ARRAY8
|
||||
|
||||
55
externals/ffmpeg/libavcodec/x86/fmtconvert_init.c
vendored
Executable file
@@ -0,0 +1,55 @@
/*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"

#if HAVE_X86ASM

void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
                                        const float *mul, int len);
void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
                                        const float *mul, int len);

#endif /* HAVE_X86ASM */

av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags)) {
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
    }
#endif /* HAVE_X86ASM */
}
106
externals/ffmpeg/libavcodec/x86/fpel.asm
vendored
Executable file
@@ -0,0 +1,106 @@
;******************************************************************************
;* SIMD-optimized fullpel functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

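; Rounded byte average for plain MMX, which lacks pavgb:
;     avg(a, b) = (a | b) - (((a ^ b) & 0xFE) >> 1)
; %4 holds the 0xFE mask (built with pcmpeqd/paddb in OP_PIXELS below).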
%macro PAVGB_MMX 4
    LOAD   %3, %1
    por    %3, %2
    pxor   %2, %1
    pand   %2, %4
    psrlq  %2, 1
    psubb  %3, %2
    SWAP   %2, %3
%endmacro

; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
;                        ptrdiff_t line_size, int h)
%macro OP_PIXELS 2
%if %2 == mmsize/2
%define LOAD movh
%define SAVE movh
%define LEN  mmsize
%else
%define LOAD movu
%define SAVE mova
%define LEN  %2
%endif
cglobal %1_pixels%2, 4,5,4
    lea          r4, [r2*3]
%ifidn %1, avg
%if notcpuflag(mmxext)
    pcmpeqd      m6, m6
    paddb        m6, m6
%endif
%endif
.loop:
%assign %%i 0
%rep LEN/mmsize
    LOAD         m0, [r1 + %%i]
    LOAD         m1, [r1+r2 + %%i]
    LOAD         m2, [r1+r2*2 + %%i]
    LOAD         m3, [r1+r4 + %%i]
%ifidn %1, avg
%if notcpuflag(mmxext)
    PAVGB_MMX    [r0 + %%i], m0, m4, m6
    PAVGB_MMX    [r0+r2 + %%i], m1, m5, m6
    PAVGB_MMX    [r0+r2*2 + %%i], m2, m4, m6
    PAVGB_MMX    [r0+r4 + %%i], m3, m5, m6
%else
    pavgb        m0, [r0 + %%i]
    pavgb        m1, [r0+r2 + %%i]
    pavgb        m2, [r0+r2*2 + %%i]
    pavgb        m3, [r0+r4 + %%i]
%endif
%endif
    SAVE         [r0 + %%i], m0
    SAVE         [r0+r2 + %%i], m1
    SAVE         [r0+r2*2 + %%i], m2
    SAVE         [r0+r4 + %%i], m3
%assign %%i %%i+mmsize
%endrep
    sub         r3d, 4
    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    jne .loop
    RET
%endmacro

INIT_MMX mmx
OP_PIXELS put, 4
OP_PIXELS avg, 4
OP_PIXELS put, 8
OP_PIXELS avg, 8
OP_PIXELS put, 16
OP_PIXELS avg, 16

INIT_MMX mmxext
OP_PIXELS avg, 4
OP_PIXELS avg, 8
OP_PIXELS avg, 16

INIT_XMM sse2
OP_PIXELS put, 16
OP_PIXELS avg, 16
49
externals/ffmpeg/libavcodec/x86/fpel.h
vendored
Executable file
@@ -0,0 +1,49 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_FPEL_H
#define AVCODEC_X86_FPEL_H

#include <stddef.h>
#include <stdint.h>

void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);


#endif /* AVCODEC_X86_FPEL_H */
54
externals/ffmpeg/libavcodec/x86/g722dsp.asm
vendored
Executable file
@@ -0,0 +1,54 @@
;******************************************************************************
;* SIMD optimized DSP functions for G722 coding
;*
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_qmf_coeffs:  dw   3, -210,  -11, -805, -11,  951,  53, 3876
pw_qmf_coeffs2: dw  12, 3876, -156,  951,  32, -805, 362, -210
pw_qmf_coeffs3: dw 362,    0,   32,    0, -156,   0,  12,    0
pw_qmf_coeffs4: dw  53,    0,  -11,    0,  -11,   0,   3,    0

SECTION .text

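; G.722 QMF: the 24 previous samples are multiplied against the interleaved
; coefficient rows above with pmaddwd; the pshufd/paddd steps at the end act
; as a horizontal add producing the two filter outputs stored to xout[].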
INIT_XMM sse2
cglobal g722_apply_qmf, 2, 2, 5, prev, out
    movu      m0, [prevq+mmsize*0]
    movu      m1, [prevq+mmsize*1]
    movu      m2, [prevq+mmsize*2]
    punpcklwd m3, m0, m1
    punpckhwd m0, m1
    punpcklwd m4, m2, m2
    punpckhwd m2, m2
    pmaddwd   m3, [pw_qmf_coeffs ]
    pmaddwd   m0, [pw_qmf_coeffs2]
    pmaddwd   m4, [pw_qmf_coeffs3]
    pmaddwd   m2, [pw_qmf_coeffs4]
    paddd     m0, m3
    paddd     m2, m4
    paddd     m0, m2
    pshufd    m2, m0, q0032
    paddd     m0, m2
    pshufd    m0, m0, q0001
    movq      [outq], m0
    RET
35
externals/ffmpeg/libavcodec/x86/g722dsp_init.c
vendored
Executable file
@@ -0,0 +1,35 @@
/*
 * Copyright (c) 2014 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/g722dsp.h"

void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);

av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags))
        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
}
189
externals/ffmpeg/libavcodec/x86/h263_loopfilter.asm
vendored
Executable file
@@ -0,0 +1,189 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength

SECTION .text

%macro H263_LOOP_FILTER 5
    pxor         m7, m7
    mova         m0, [%1]
    mova         m1, [%1]
    mova         m2, [%4]
    mova         m3, [%4]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    psubw        m0, m2
    psubw        m1, m3
    mova         m2, [%2]
    mova         m3, [%2]
    mova         m4, [%3]
    mova         m5, [%3]
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    punpcklbw    m4, m7
    punpckhbw    m5, m7
    psubw        m4, m2
    psubw        m5, m3
    psllw        m4, 2
    psllw        m5, 2
    paddw        m4, m0
    paddw        m5, m1
    pxor         m6, m6
    pcmpgtw      m6, m4
    pcmpgtw      m7, m5
    pxor         m4, m6
    pxor         m5, m7
    psubw        m4, m6
    psubw        m5, m7
    psrlw        m4, 3
    psrlw        m5, 3
    packuswb     m4, m5
    packsswb     m6, m7
    pxor         m7, m7
    movd         m2, %5
    punpcklbw    m2, m2
    punpcklbw    m2, m2
    punpcklbw    m2, m2
    psubusb      m2, m4
    mova         m3, m2
    psubusb      m3, m4
    psubb        m2, m3
    mova         m3, [%2]
    mova         m4, [%3]
    pxor         m3, m6
    pxor         m4, m6
    paddusb      m3, m2
    psubusb      m4, m2
    pxor         m3, m6
    pxor         m4, m6
    paddusb      m2, m2
    packsswb     m0, m1
    pcmpgtb      m7, m0
    pxor         m0, m7
    psubb        m0, m7
    mova         m1, m0
    psubusb      m0, m2
    psubb        m1, m0
    pand         m1, [pb_FC]
    psrlw        m1, 2
    pxor         m1, m7
    psubb        m1, m7
    mova         m5, [%1]
    mova         m6, [%4]
    psubb        m5, m1
    paddb        m6, m1
%endmacro

INIT_MMX mmx
; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
cglobal h263_v_loop_filter, 3,5
    movsxdifnidn r1, r1d
    movsxdifnidn r2, r2d

    lea          r4, [h263_loop_filter_strength]
    movzx       r3d, BYTE [r4+r2]
    movsx        r2, r3b
    shl          r2, 1

    mov          r3, r0
    sub          r3, r1
    mov          r4, r3
    sub          r4, r1
    H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d

    mova       [r3], m3
    mova       [r0], m4
    mova       [r4], m5
    mova    [r0+r1], m6
    RET

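; Transpose a 4x4 block of bytes into the stack scratch buffer so the
; horizontal filter can reuse H263_LOOP_FILTER, which operates on rows.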
%macro TRANSPOSE4X4 2
|
||||
movd m0, [%1]
|
||||
movd m1, [%1+r1]
|
||||
movd m2, [%1+r1*2]
|
||||
movd m3, [%1+r3]
|
||||
punpcklbw m0, m1
|
||||
punpcklbw m2, m3
|
||||
mova m1, m0
|
||||
punpcklwd m0, m2
|
||||
punpckhwd m1, m2
|
||||
movd [%2+ 0], m0
|
||||
punpckhdq m0, m0
|
||||
movd [%2+ 8], m0
|
||||
movd [%2+16], m1
|
||||
punpckhdq m1, m1
|
||||
movd [%2+24], m1
|
||||
%endmacro
|
||||
|
||||
|
||||
; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
|
||||
INIT_MMX mmx
|
||||
cglobal h263_h_loop_filter, 3,5,0,32
|
||||
movsxdifnidn r1, r1d
|
||||
movsxdifnidn r2, r2d
|
||||
|
||||
lea r4, [h263_loop_filter_strength]
|
||||
movzx r3d, BYTE [r4+r2]
|
||||
movsx r2, r3b
|
||||
shl r2, 1
|
||||
|
||||
sub r0, 2
|
||||
lea r3, [r1*3]
|
||||
|
||||
TRANSPOSE4X4 r0, rsp
|
||||
lea r4, [r0+r1*4]
|
||||
TRANSPOSE4X4 r4, rsp+4
|
||||
|
||||
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
|
||||
|
||||
mova m1, m5
|
||||
mova m0, m4
|
||||
punpcklbw m5, m3
|
||||
punpcklbw m4, m6
|
||||
punpckhbw m1, m3
|
||||
punpckhbw m0, m6
|
||||
mova m3, m5
|
||||
mova m6, m1
|
||||
punpcklwd m5, m4
|
||||
punpcklwd m1, m0
|
||||
punpckhwd m3, m4
|
||||
punpckhwd m6, m0
|
||||
movd [r0], m5
|
||||
punpckhdq m5, m5
|
||||
movd [r0+r1*1], m5
|
||||
movd [r0+r1*2], m3
|
||||
punpckhdq m3, m3
|
||||
movd [r0+r3], m3
|
||||
movd [r4], m1
|
||||
punpckhdq m1, m1
|
||||
movd [r4+r1*1], m1
|
||||
movd [r4+r1*2], m6
|
||||
punpckhdq m6, m6
|
||||
movd [r4+r3], m6
|
||||
RET
|
||||
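The H263_LOOP_FILTER macro above is the MMX form of the H.263 in-loop deblocking filter. As a rough, hedged C model of the per-column math it performs (FFmpeg's actual scalar reference lives in libavcodec/h263dsp.c; the helper names below are illustrative, and the outer-pixel clipping is a best-effort reading of the assembly):

#include <stdint.h>

static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* Sketch of one filtered edge: p0..p3 straddle the block edge,
 * strength comes from the ff_h263_loop_filter_strength[qscale] table. */
static void h263_filter_edge(uint8_t *src, int stride, int strength)
{
    for (int x = 0; x < 8; x++) {
        int p0 = src[x - 2 * stride], p1 = src[x - stride];
        int p2 = src[x],              p3 = src[x + stride];
        int d  = (p0 - p3 + 4 * (p2 - p1)) / 8;  /* psllw 2 / psrlw 3 above */
        int d1, ad1, d2;

        /* ramp clip of d against the strength window */
        if      (d < -2 * strength) d1 = 0;
        else if (d <     -strength) d1 = -2 * strength - d;
        else if (d <      strength) d1 = d;
        else if (d <  2 * strength) d1 = 2 * strength - d;
        else                        d1 = 0;

        src[x - stride] = clip_u8(p1 + d1);      /* paddusb path */
        src[x]          = clip_u8(p2 - d1);      /* psubusb path */

        /* outer pixels move by at most |d1|/2 */
        ad1 = (d1 < 0 ? -d1 : d1) >> 1;
        d2  = (p0 - p3) / 4;
        if (d2 < -ad1) d2 = -ad1;
        if (d2 >  ad1) d2 =  ad1;
        src[x - 2 * stride] = p0 - d2;
        src[x + stride]     = p3 + d2;
    }
}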
39
externals/ffmpeg/libavcodec/x86/h263dsp_init.c
vendored
Executable file
@@ -0,0 +1,39 @@
/*
 * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h263dsp.h"

void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);

av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
    }
}
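The init function above is libavcodec's standard CPU-dispatch pattern: probe the CPU once, then point the context's function pointers at the fastest available implementation. A minimal self-contained sketch of the same idea (every name here is illustrative, not FFmpeg API):

#include <stdint.h>

typedef void (*loop_filter_fn)(uint8_t *src, int stride, int qscale);

typedef struct MyDSPContext {
    loop_filter_fn h_loop_filter;
    loop_filter_fn v_loop_filter;
} MyDSPContext;

/* Portable fallbacks; a real build would also have asm-backed versions. */
static void h_loop_filter_c(uint8_t *src, int stride, int qscale) { (void)src; (void)stride; (void)qscale; }
static void v_loop_filter_c(uint8_t *src, int stride, int qscale) { (void)src; (void)stride; (void)qscale; }

/* Hypothetical stand-ins for the MMX entry points and the CPU probe. */
extern void h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
extern void v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
extern int  cpu_has_mmx(void);

static void my_dsp_init(MyDSPContext *c)
{
    c->h_loop_filter = h_loop_filter_c;   /* safe defaults first */
    c->v_loop_filter = v_loop_filter_c;
    if (cpu_has_mmx()) {                  /* then upgrade per CPU flag */
        c->h_loop_filter = h_loop_filter_mmx;
        c->v_loop_filter = v_loop_filter_mmx;
    }
}

Callers then invoke c->h_loop_filter(...) without caring which version was selected.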
208
externals/ffmpeg/libavcodec/x86/h264_cabac.c
vendored
Executable file
@@ -0,0 +1,208 @@
/*
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * H.264 / AVC / MPEG-4 part10 codec.
 * non-SIMD x86-specific optimizations for H.264
 * @author Michael Niedermayer <michaelni@gmx.at>
 */

#include <stddef.h>

#include "libavcodec/cabac.h"
#include "cabac.h"

#if HAVE_INLINE_ASM

#if ARCH_X86_64
#define REG64 "r"
#else
#define REG64 "m"
#endif

//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
#if HAVE_7REGS && !BROKEN_COMPILER
#define decode_significance decode_significance_x86
static int decode_significance_x86(CABACContext *c, int max_coeff,
                                   uint8_t *significant_coeff_ctx_base,
                                   int *index, x86_reg last_off){
    void *end= significant_coeff_ctx_base + max_coeff - 1;
    int minusstart= -(intptr_t)significant_coeff_ctx_base;
    int minusindex= 4-(intptr_t)index;
    int bit;
    x86_reg coeff_count;

#ifdef BROKEN_RELOCATIONS
    void *tables;

    __asm__ volatile(
        "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
        : "=&r"(tables)
        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
    );
#endif

    __asm__ volatile(
        "3:                                     \n\t"

        BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
                             "%5", "%q5", "%k0", "%b0",
                             "%c11(%6)", "%c12(%6)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%13")

        "test $1, %4                            \n\t"
        " jz 4f                                 \n\t"
        "add  %10, %1                           \n\t"

        BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
                             "%5", "%q5", "%k0", "%b0",
                             "%c11(%6)", "%c12(%6)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%13")

        "sub  %10, %1                           \n\t"
        "mov  %2, %0                            \n\t"
        "movl %7, %%ecx                         \n\t"
        "add  %1, %%"FF_REG_c"                  \n\t"
        "movl %%ecx, (%0)                       \n\t"

        "test $1, %4                            \n\t"
        " jnz 5f                                \n\t"

        "add"FF_OPSIZE" $4, %2                  \n\t"

        "4:                                     \n\t"
        "add  $1, %1                            \n\t"
        "cmp  %8, %1                            \n\t"
        " jb 3b                                 \n\t"
        "mov  %2, %0                            \n\t"
        "movl %7, %%ecx                         \n\t"
        "add  %1, %%"FF_REG_c"                  \n\t"
        "movl %%ecx, (%0)                       \n\t"
        "5:                                     \n\t"
        "add  %9, %k0                           \n\t"
        "shr  $2, %k0                           \n\t"
        : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
          "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
        : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end))
          TABLES_ARG
        : "%"FF_REG_c, "memory"
    );
    return coeff_count;
}

#define decode_significance_8x8 decode_significance_8x8_x86
static int decode_significance_8x8_x86(CABACContext *c,
                                       uint8_t *significant_coeff_ctx_base,
                                       int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
    int minusindex= 4-(intptr_t)index;
    int bit;
    x86_reg coeff_count;
    x86_reg last=0;
    x86_reg state;

#ifdef BROKEN_RELOCATIONS
    void *tables;

    __asm__ volatile(
        "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
        : "=&r"(tables)
        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
    );
#endif

    __asm__ volatile(
        "mov %1, %6                             \n\t"
        "3:                                     \n\t"

        "mov %10, %0                            \n\t"
        "movzb (%0, %6), %6                     \n\t"
        "add %9, %6                             \n\t"

        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
                             "%5", "%q5", "%k0", "%b0",
                             "%c12(%7)", "%c13(%7)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%15")

        "mov %1, %6                             \n\t"
        "test $1, %4                            \n\t"
        " jz 4f                                 \n\t"

#ifdef BROKEN_RELOCATIONS
        "movzb %c14(%15, %q6), %6\n\t"
#else
        "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif
        "add %11, %6                            \n\t"

        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
                             "%5", "%q5", "%k0", "%b0",
                             "%c12(%7)", "%c13(%7)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%15")

        "mov %2, %0                             \n\t"
        "mov %1, %6                             \n\t"
        "mov %k6, (%0)                          \n\t"

        "test $1, %4                            \n\t"
        " jnz 5f                                \n\t"

        "add"FF_OPSIZE" $4, %2                  \n\t"

        "4:                                     \n\t"
        "add $1, %6                             \n\t"
        "mov %6, %1                             \n\t"
        "cmp $63, %6                            \n\t"
        " jb 3b                                 \n\t"
        "mov %2, %0                             \n\t"
        "mov %k6, (%0)                          \n\t"
        "5:                                     \n\t"
        "addl %8, %k0                           \n\t"
        "shr $2, %k0                            \n\t"
        : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
          "=&r"(bit), "+&r"(c->range), "=&r"(state)
        : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
          REG64(sig_off), REG64(last_coeff_ctx_base),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end)),
          "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
        : "%"FF_REG_c, "memory"
    );
    return coeff_count;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */

#endif /* HAVE_INLINE_ASM */
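Both inline-asm routines above accelerate one hot loop of H.264 CABAC residual decoding: reading the significance map (one significant_coeff_flag per scan position, plus a last_significant_coeff_flag after each set one). A hedged C model of the contract of decode_significance_x86 — not FFmpeg's actual fallback, just the logic the assembly implements, assuming get_cabac(c, state) decodes a single bin:

/* Returns the number of significant coefficients; their scan positions
 * are written to index[]. The context for the "last" flag lives at a
 * fixed offset (last_off) from the significance context. */
static int decode_significance_model(CABACContext *c, int max_coeff,
                                     uint8_t *sig_ctx, int *index,
                                     ptrdiff_t last_off)
{
    int count = 0;

    for (int i = 0; i < max_coeff - 1; i++) {
        if (get_cabac(c, sig_ctx + i)) {              /* significant? */
            index[count++] = i;
            if (get_cabac(c, sig_ctx + i + last_off)) /* last one? */
                return count;
        }
    }
    index[count++] = max_coeff - 1;  /* final position is inferred significant */
    return count;
}

The BRANCHLESS_GET_CABAC macro inlines the renormalization so the bin decode itself runs without data-dependent branches.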
663
externals/ffmpeg/libavcodec/x86/h264_chromamc.asm
vendored
Executable file
@@ -0,0 +1,663 @@
;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

%macro mv0_pixels_mc8 0
    lea          r4, [r2*2 ]
.next4rows:
    movq        mm0, [r1   ]
    movq        mm1, [r1+r2]
    add          r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq    [r0   ], mm0
    movq    [r0+r2], mm1
    add          r0, r4
    movq        mm0, [r1   ]
    movq        mm1, [r1+r2]
    add          r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq    [r0   ], mm0
    movq    [r0+r2], mm1
    add          r0, r4
    sub         r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov         r6d, r5d
    or          r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov          r7, r5
    and          r7, 6         ; &~1 for mx/my=[0,7]
    lea          r7, [r7*4+r4]
    sar         r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov          r0, r5
    and          r0, 6         ; &~1 for mx/my=[0,7]
    lea          r0, [r0*4+r4]
    sar         r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test        r5d, r5d
    mov          r6, 1
    je .my_is_zero
    test        r4d, r4d
    mov          r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or          r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea          r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov          r5, r0m
%endif
%endif

    movd         m5, r4d
    movq         m4, [pw_8]
    movq         m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd    m5, m5
    punpckldq    m5, m5         ; mm5 = B = x
    pxor         m7, m7
    psubw        m4, m5         ; mm4 = A = 8-x

.next1drow:
    movq         m0, [r1   ]    ; mm0 = src[0..7]
    movq         m2, [r1+r6]    ; mm1 = src[1..8]

    movq         m1, m0
    movq         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    pmullw       m0, m4         ; [mm0,mm1] = A * src[0..7]
    pmullw       m1, m4
    pmullw       m2, m5         ; [mm2,mm3] = B * src[1..8]
    pmullw       m3, m5

    paddw        m0, m6
    paddw        m1, m6
    paddw        m0, m2
    paddw        m1, m3
    psrlw        m0, 3
    psrlw        m1, 3
    packuswb     m0, m1
    CHROMAMC_AVG m0, [dest_reg]
    movq [dest_reg], m0         ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add    dest_reg, r2
    add          r1, r2
    dec         r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd         m4, r4d        ; x
    movd         m6, r5d        ; y
%ifidn %2, rv40
%ifdef PIC
    lea          r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov          r5, r0m
%endif
%endif
    mov          r6, rsp        ; backup stack pointer
    and         rsp, ~(mmsize-1) ; align stack
    sub         rsp, 16         ; AA and DD

    punpcklwd    m4, m4
    punpcklwd    m6, m6
    punpckldq    m4, m4         ; mm4 = x words
    punpckldq    m6, m6         ; mm6 = y words
    movq         m5, m4
    pmullw       m4, m6         ; mm4 = x * y
    psllw        m5, 3
    psllw        m6, 3
    movq         m7, m5
    paddw        m7, m6
    movq    [rsp+8], m4         ; DD = x * y
    psubw        m5, m4         ; mm5 = B = 8x - xy
    psubw        m6, m4         ; mm6 = C = 8y - xy
    paddw        m4, [pw_64]
    psubw        m4, m7         ; mm4 = A = xy - (8x+8y) + 64
    pxor         m7, m7
    movq    [rsp  ], m4

    movq         m0, [r1  ]     ; mm0 = src[0..7]
    movq         m1, [r1+1]     ; mm1 = src[1..8]
.next2drow:
    add          r1, r2

    movq         m2, m0
    movq         m3, m1
    punpckhbw    m0, m7
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    pmullw       m0, [rsp]
    pmullw       m2, [rsp]
    pmullw       m1, m5
    pmullw       m3, m5
    paddw        m2, m1         ; mm2 = A * src[0..3] + B * src[1..4]
    paddw        m3, m0         ; mm3 = A * src[4..7] + B * src[5..8]

    movq         m0, [r1]
    movq         m1, m0
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    pmullw       m0, m6
    pmullw       m1, m6
    paddw        m2, m0
    paddw        m3, m1         ; [mm2,mm3] += C * src[0..7]

    movq         m1, [r1+1]
    movq         m0, m1
    movq         m4, m1
    punpcklbw    m0, m7
    punpckhbw    m4, m7
    pmullw       m0, [rsp+8]
    pmullw       m4, [rsp+8]
    paddw        m2, m0
    paddw        m3, m4         ; [mm2,mm3] += D * src[1..8]
    movq         m0, [r1]

    paddw        m2, [rnd_2d_%2+rnd_bias*8]
    paddw        m3, [rnd_2d_%2+rnd_bias*8]
    psrlw        m2, 6
    psrlw        m3, 6
    packuswb     m2, m3
    CHROMAMC_AVG m2, [dest_reg]
    movq [dest_reg], m2         ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add    dest_reg, r2
    dec         r3d
    jne .next2drow
    mov         rsp, r6         ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor         m7, m7
    movd         m2, r4d        ; x
    movd         m3, r5d        ; y
    movq         m4, [pw_8]
    movq         m5, [pw_8]
    punpcklwd    m2, m2
    punpcklwd    m3, m3
    punpcklwd    m2, m2
    punpcklwd    m3, m3
    psubw        m4, m2
    psubw        m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea          r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and          r5, 6          ; &~1 for mx/my=[0,7]
    lea          r5, [r5*4+r4]
    sar         r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd         m0, [r1  ]
    movd         m6, [r1+1]
    add          r1, r2
    punpcklbw    m0, m7
    punpcklbw    m6, m7
    pmullw       m0, m4
    pmullw       m6, m2
    paddw        m6, m0

.next2rows:
    movd         m0, [r1  ]
    movd         m1, [r1+1]
    add          r1, r2
    punpcklbw    m0, m7
    punpcklbw    m1, m7
    pmullw       m0, m4
    pmullw       m1, m2
    paddw        m1, m0
    movq         m0, m1

    pmullw       m6, m5
    pmullw       m1, m3
    paddw        m6, [rnd_2d_%2+rnd_bias*8]
    paddw        m1, m6
    psrlw        m1, 6
    packuswb     m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd       [r0], m1
    add          r0, r2

    movd         m6, [r1  ]
    movd         m1, [r1+1]
    add          r1, r2
    punpcklbw    m6, m7
    punpcklbw    m1, m7
    pmullw       m6, m4
    pmullw       m1, m2
    paddw        m1, m6
    movq         m6, m1
    pmullw       m0, m5
    pmullw       m1, m3
    paddw        m0, [rnd_2d_%2+rnd_bias*8]
    paddw        m1, m0
    psrlw        m1, 6
    packuswb     m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd       [r0], m1
    add          r0, r2
    sub         r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov         r6d, r4d
    shl         r4d, 16
    sub         r4d, r6d
    add         r4d, 8
    imul        r5d, r4d        ; x*y<<16 | y*(8-x)
    shl         r4d, 3
    sub         r4d, r5d        ; x*(8-y)<<16 | (8-x)*(8-y)

    movd         m5, r4d
    movd         m6, r5d
    punpckldq    m5, m5         ; mm5 = {A,B,A,B}
    punpckldq    m6, m6         ; mm6 = {C,D,C,D}
    pxor         m7, m7
    movd         m2, [r1]
    punpcklbw    m2, m7
    pshufw       m2, m2, 0x94   ; mm0 = src[0,1,1,2]

.nextrow:
    add          r1, r2
    movq         m1, m2
    pmaddwd      m1, m5         ; mm1 = A * src[0,1] + B * src[1,2]
    movd         m0, [r1]
    punpcklbw    m0, m7
    pshufw       m0, m0, 0x94   ; mm0 = src[0,1,1,2]
    movq         m2, m0
    pmaddwd      m0, m6
    paddw        m1, [rnd_2d_%2]
    paddw        m1, m0         ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw        m1, 6
    packssdw     m1, m7
    packuswb     m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd        r5d, m1
    mov        [r0], r5w
    add          r0, r2
    sub         r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB        %1, %2
%endmacro
%macro COPY_AVG 3
    movd         %2, %3
    PAVGB        %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov         r6d, r5d
    or          r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test        r5d, r5d
    je .my_is_zero
    test        r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov         r6d, r4d
    shl         r4d, 8
    sub          r4, r6
    mov          r6, 8
    add          r4, 8          ; x*255+8 = x<<8 | (8-x)
    sub         r6d, r5d
    imul         r6, r4         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul        r4d, r5d        ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd         m7, r6d
    movd         m6, r4d
    movdqa       m5, [rnd_2d_%2]
    movq         m0, [r1  ]
    movq         m1, [r1+1]
    pshuflw      m7, m7, 0
    pshuflw      m6, m6, 0
    punpcklbw    m0, m1
    movlhps      m7, m7
    movlhps      m6, m6

.next2rows:
    movq         m1, [r1+r2*1  ]
    movq         m2, [r1+r2*1+1]
    movq         m3, [r1+r2*2  ]
    movq         m4, [r1+r2*2+1]
    lea          r1, [r1+r2*2]
    punpcklbw    m1, m2
    movdqa       m2, m1
    punpcklbw    m3, m4
    movdqa       m4, m3
    pmaddubsw    m0, m7
    pmaddubsw    m1, m6
    pmaddubsw    m2, m7
    pmaddubsw    m3, m6
    paddw        m0, m5
    paddw        m2, m5
    paddw        m1, m0
    paddw        m3, m2
    psrlw        m1, 6
    movdqa       m0, m4
    psrlw        m3, 6
%ifidn %1, avg
    movq         m2, [r0   ]
    movhps       m2, [r0+r2]
%endif
    packuswb     m1, m3
    CHROMAMC_AVG m1, m2
    movq    [r0   ], m1
    movhps  [r0+r2], m1
    sub         r3d, 2
    lea          r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov         r5d, r4d
    shl         r4d, 8
    add          r4, 8
    sub          r4, r5         ; 255*x+8 = x<<8 | (8-x)
    movd         m7, r4d
    movdqa       m6, [rnd_1d_%2]
    pshuflw      m7, m7, 0
    movlhps      m7, m7

.next2xrows:
    movq         m0, [r1     ]
    movq         m1, [r1   +1]
    movq         m2, [r1+r2  ]
    movq         m3, [r1+r2+1]
    punpcklbw    m0, m1
    punpcklbw    m2, m3
    pmaddubsw    m0, m7
    pmaddubsw    m2, m7
%ifidn %1, avg
    movq         m4, [r0   ]
    movhps       m4, [r0+r2]
%endif
    paddw        m0, m6
    paddw        m2, m6
    psrlw        m0, 3
    psrlw        m2, 3
    packuswb     m0, m2
    CHROMAMC_AVG m0, m4
    movq    [r0   ], m0
    movhps  [r0+r2], m0
    sub         r3d, 2
    lea          r0, [r0+r2*2]
    lea          r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov         r4d, r5d
    shl         r5d, 8
    add          r5, 8
    sub          r5, r4         ; 255*y+8 = y<<8 | (8-y)
    movd         m7, r5d
    movdqa       m6, [rnd_1d_%2]
    pshuflw      m7, m7, 0
    movlhps      m7, m7

.next2yrows:
    movq         m0, [r1     ]
    movq         m1, [r1+r2  ]
    movdqa       m2, m1
    movq         m3, [r1+r2*2]
    lea          r1, [r1+r2*2]
    punpcklbw    m0, m1
    punpcklbw    m2, m3
    pmaddubsw    m0, m7
    pmaddubsw    m2, m7
%ifidn %1, avg
    movq         m4, [r0   ]
    movhps       m4, [r0+r2]
%endif
    paddw        m0, m6
    paddw        m2, m6
    psrlw        m0, 3
    psrlw        m2, 3
    packuswb     m0, m2
    CHROMAMC_AVG m0, m4
    movq    [r0   ], m0
    movhps  [r0+r2], m0
    sub         r3d, 2
    lea          r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov          r6, r4
    shl         r4d, 8
    sub         r4d, r6d
    mov          r6, 8
    add         r4d, 8          ; x*255+8
    sub         r6d, r5d
    imul        r6d, r4d        ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul        r4d, r5d        ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd         m7, r6d
    movd         m6, r4d
    movq         m5, [pw_32]
    movd         m0, [r1  ]
    pshufw       m7, m7, 0
    punpcklbw    m0, [r1+1]
    pshufw       m6, m6, 0

.next2rows:
    movd         m1, [r1+r2*1  ]
    movd         m3, [r1+r2*2  ]
    punpcklbw    m1, [r1+r2*1+1]
    punpcklbw    m3, [r1+r2*2+1]
    lea          r1, [r1+r2*2]
    movq         m2, m1
    movq         m4, m3
    pmaddubsw    m0, m7
    pmaddubsw    m1, m6
    pmaddubsw    m2, m7
    pmaddubsw    m3, m6
    paddw        m0, m5
    paddw        m2, m5
    paddw        m1, m0
    paddw        m3, m2
    psrlw        m1, 6
    movq         m0, m4
    psrlw        m3, 6
    packuswb     m1, m1
    packuswb     m3, m3
    CHROMAMC_AVG m1, [r0   ]
    CHROMAMC_AVG m3, [r0+r2]
    movd    [r0   ], m1
    movd    [r0+r2], m3
    sub         r3d, 2
    lea          r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264
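Every mc8/mc4/mc2 variant above computes the same H.264 bilinear chroma interpolation; the variants differ only in block width and in how the weights are packed into registers. The defining scalar formula, as a C model of the 8-bit put case (this mirrors the rnd_2d_h264 = 32 rounding and the psrlw 6 in the assembly):

#include <stdint.h>
#include <stddef.h>

static void put_chroma_mc8_c(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t stride, int h, int mx, int my)
{
    const int A = (8 - mx) * (8 - my);   /* the four weights sum to 64 */
    const int B =      mx  * (8 - my);
    const int C = (8 - mx) *      my;
    const int D =      mx  *      my;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[x + stride] + D * src[x + stride + 1] +
                      32) >> 6;
        dst += stride;
        src += stride;
    }
}

When mx or my is zero the 2-D filter degenerates to a 1-D one, which is why the assembly branches to .my_is_zero/.mx_is_zero and rounds with (value + 4) >> 3 instead.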
269
externals/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
vendored
Executable file
@@ -0,0 +1,269 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64

SECTION .text


%macro MV0_PIXELS_MC8 0
    lea          r4, [r2*3   ]
    lea          r5, [r2*4   ]
.next4rows:
    movu         m0, [r1     ]
    movu         m1, [r1+r2  ]
    CHROMAMC_AVG m0, [r0     ]
    CHROMAMC_AVG m1, [r0+r2  ]
    mova  [r0     ], m0
    mova  [r0+r2  ], m1
    movu         m0, [r1+r2*2]
    movu         m1, [r1+r4  ]
    CHROMAMC_AVG m0, [r0+r2*2]
    CHROMAMC_AVG m1, [r0+r4  ]
    mova  [r0+r2*2], m0
    mova  [r0+r4  ], m1
    add          r1, r5
    add          r0, r5
    sub         r3d, 4
    jne .next4rows
%endmacro

;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, ptrdiff_t stride,
;                                 int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
cglobal %1_h264_chroma_mc8_10, 6,7,8
    mov         r6d, r5d
    or          r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    MV0_PIXELS_MC8
    REP_RET

.at_least_one_non_zero:
    mov         r6d, 2
    test        r5d, r5d
    je .x_interpolation
    mov          r6, r2         ; dxy = x ? 1 : stride
    test        r4d, r4d
    jne .xy_interpolation
.x_interpolation:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or          r4d, r5d        ; x + y
    movd         m5, r4d
    mova         m4, [pw_8]
    mova         m6, [pw_4]     ; mm6 = rnd >> 3
    SPLATW       m5, m5         ; mm5 = B = x
    psubw        m4, m5         ; mm4 = A = 8-x

.next1drow:
    movu         m0, [r1   ]    ; mm0 = src[0..7]
    movu         m2, [r1+r6]    ; mm2 = src[1..8]

    pmullw       m0, m4         ; mm0 = A * src[0..7]
    pmullw       m2, m5         ; mm2 = B * src[1..8]

    paddw        m0, m6
    paddw        m0, m2
    psrlw        m0, 3
    CHROMAMC_AVG m0, [r0]
    mova       [r0], m0         ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add          r0, r2
    add          r1, r2
    dec         r3d
    jne .next1drow
    REP_RET

.xy_interpolation: ; general case, bilinear
    movd         m4, r4m        ; x
    movd         m6, r5m        ; y

    SPLATW       m4, m4         ; mm4 = x words
    SPLATW       m6, m6         ; mm6 = y words
    psllw        m5, m4, 3      ; mm5 = 8x
    pmullw       m4, m6         ; mm4 = x * y
    psllw        m6, 3          ; mm6 = 8y
    paddw        m1, m5, m6     ; mm1 = 8x+8y
    mova         m7, m4         ; DD = x * y
    psubw        m5, m4         ; mm5 = B = 8x - xy
    psubw        m6, m4         ; mm6 = C = 8y - xy
    paddw        m4, [pw_64]
    psubw        m4, m1         ; mm4 = A = xy - (8x+8y) + 64

    movu         m0, [r1  ]     ; mm0 = src[0..7]
    movu         m1, [r1+2]     ; mm1 = src[1..8]
.next2drow:
    add          r1, r2

    pmullw       m2, m0, m4
    pmullw       m1, m5
    paddw        m2, m1         ; mm2 = A * src[0..7] + B * src[1..8]

    movu         m0, [r1]
    movu         m1, [r1+2]
    pmullw       m3, m0, m6
    paddw        m2, m3         ; mm2 += C * src[0..7+stride]
    pmullw       m3, m1, m7
    paddw        m2, m3         ; mm2 += D * src[1..8+stride]

    paddw        m2, [pw_32]
    psrlw        m2, 6
    CHROMAMC_AVG m2, [r0]
    mova       [r0], m2         ; dst[0..7] = (mm2 + 32) >> 6

    add          r0, r2
    dec         r3d
    jne .next2drow
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, ptrdiff_t stride,
;                                 int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
    movq         %1, [r1  ]
    movq         m1, [r1+2]
    add          r1, r2
    pmullw       %1, m4
    pmullw       m1, m2
    paddw        m1, %1
    mova         %1, m1

    pmullw       %2, m5
    pmullw       m1, m3
    paddw        %2, [pw_32]
    paddw        m1, %2
    psrlw        m1, 6
    CHROMAMC_AVG m1, %2, [r0]
    movq       [r0], m1
    add          r0, r2
%endmacro

%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
    movd         m2, r4m        ; x
    movd         m3, r5m        ; y
    mova         m4, [pw_8]
    mova         m5, m4
    SPLATW       m2, m2
    SPLATW       m3, m3
    psubw        m4, m2
    psubw        m5, m3

    movq         m0, [r1  ]
    movq         m6, [r1+2]
    add          r1, r2
    pmullw       m0, m4
    pmullw       m6, m2
    paddw        m6, m0

.next2rows:
    MC4_OP m0, m6
    MC4_OP m6, m0
    sub         r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, ptrdiff_t stride,
;                                 int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
    mov         r6d, r4d
    shl         r4d, 16
    sub         r4d, r6d
    add         r4d, 8
    imul        r5d, r4d        ; x*y<<16 | y*(8-x)
    shl         r4d, 3
    sub         r4d, r5d        ; x*(8-y)<<16 | (8-x)*(8-y)

    movd         m5, r4d
    movd         m6, r5d
    punpckldq    m5, m5         ; mm5 = {A,B,A,B}
    punpckldq    m6, m6         ; mm6 = {C,D,C,D}
    pxor         m7, m7
    pshufw       m2, [r1], 0x94 ; mm0 = src[0,1,1,2]

.nextrow:
    add          r1, r2
    movq         m1, m2
    pmaddwd      m1, m5         ; mm1 = A * src[0,1] + B * src[1,2]
    pshufw       m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
    movq         m2, m0
    pmaddwd      m0, m6
    paddw        m1, [pw_32]
    paddw        m1, m0         ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw        m1, 6
    packssdw     m1, m7
    CHROMAMC_AVG m1, m3, [r0]
    movd       [r0], m1
    add          r0, r2
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
    movq         %2, %3
%endif
    pavgw        %1, %2
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put

%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
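The mc2 kernels (both 8- and 10-bit) rely on packing two 16-bit weights per 32-bit lane so that a single pmaddwd evaluates A*src[i] + B*src[i+1]. A sketch of the integer setup that the first few instructions of chroma_mc2 perform (derived from the comments above; the function name is illustrative):

#include <stdint.h>

/* AB packs A=(8-mx)*(8-my) in the low word and B=mx*(8-my) in the high word;
 * CD packs C=(8-mx)*my low and D=mx*my high, ready for pmaddwd. */
static void pack_mc2_weights(int mx, int my, uint32_t *AB, uint32_t *CD)
{
    uint32_t t = ((uint32_t)mx << 16) - mx + 8; /* mx<<16 | (8-mx) */

    *CD = (uint32_t)my * t;      /* mx*my<<16     | my*(8-mx)     */
    *AB = (t << 3) - *CD;        /* mx*(8-my)<<16 | (8-mx)*(8-my) */
}

With source words {s0, s1} in one lane, pmaddwd against {A,B} yields A*s0 + B*s1; the {C,D} product of the next row is added, then the total is rounded with +32 and shifted right by 6, exactly as in the scalar model shown earlier.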
1420
externals/ffmpeg/libavcodec/x86/h264_deblock.asm
vendored
Executable file
File diff suppressed because it is too large
1080
externals/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
vendored
Executable file
File diff suppressed because it is too large
1199
externals/ffmpeg/libavcodec/x86/h264_idct.asm
vendored
Executable file
File diff suppressed because it is too large
657
externals/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
vendored
Executable file
@@ -0,0 +1,657 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define pw_pixel_max pw_1023
cextern pd_32

;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]
    movhps      %3, [%5+%6]
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro

%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

;dst, in, stride
%macro IDCT4_ADD_10 3
    mova        m0, [%2+ 0]
    mova        m1, [%2+16]
    mova        m2, [%2+32]
    mova        m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd       m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor        m5, m5
    mova  [%2+ 0], m5
    mova  [%2+16], m5
    mova  [%2+32], m5
    mova  [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    movsxdifnidn r2, r2d
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add         r5, r0
    mova        m0, [r2+ 0]
    mova        m1, [r2+16]
    mova        m2, [r2+32]
    mova        m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd       m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor        m5, m5
    mova  [r2+ 0], m5
    mova  [r2+16], m5
    mova  [r2+32], m5
    mova  [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea         r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

%macro ADD16_OP 2
    cmp byte [r4+%2], 0
    jz .skipblock%1
    mov        r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add         r2, 64
%endif
%endmacro

%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    movsxdifnidn r3, r3d
    ADD16_OP  0, 4+1*8
    ADD16_OP  1, 5+1*8
    ADD16_OP  2, 4+2*8
    ADD16_OP  3, 5+2*8
    ADD16_OP  4, 6+1*8
    ADD16_OP  5, 7+1*8
    ADD16_OP  6, 6+2*8
    ADD16_OP  7, 7+2*8
    ADD16_OP  8, 4+3*8
    ADD16_OP  9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT_DC_ADD_OP_10 3
    pxor        m5, m5
%if avx_enabled
    paddw       m1, m0, [%1+0   ]
    paddw       m2, m0, [%1+%2  ]
    paddw       m3, m0, [%1+%2*2]
    paddw       m4, m0, [%1+%3  ]
%else
    mova        m1, [%1+0   ]
    mova        m2, [%1+%2  ]
    mova        m3, [%1+%2*2]
    mova        m4, [%1+%3  ]
    paddw       m1, m0
    paddw       m2, m0
    paddw       m3, m0
    paddw       m4, m0
%endif
    CLIPW       m1, m5, m6
    CLIPW       m2, m5, m6
    CLIPW       m3, m5, m6
    CLIPW       m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movsxdifnidn r2, r2d
    movd        m0, [r1]
    mov dword [r1], 0
    paddd       m0, [pd_32]
    psrad       m0, 6
    lea         r1, [r2*3]
    pshufw      m0, m0, 0
    mova        m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movsxdifnidn r2, r2d
    movd        m0, [r1]
    mov dword[r1], 0
    paddd       m0, [pd_32]
    psrad       m0, 6
    lea         r1, [r2*3]
    SPLATW      m0, m0, 0
    mova        m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea         r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro AC 1
.ac%1:
    mov        r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov        r5d, [r1+(%1+1)*4]
    add         r2, 64
    call add4x4_idct %+ SUFFIX
    add         r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 2
    cmp word [r4+%2], 0
    jnz .ac%1
    mov        r5d, [r2+ 0]
    or         r5d, [r2+64]
    jz .skipblock%1
    mov        r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add         r2, 128
%endif
.skipadd%1:
%endmacro

%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
    add         r5, r0
    movq        m0, [r2+ 0]
    movhps      m0, [r2+64]
    mov dword [r2+ 0], 0
    mov dword [r2+64], 0
    paddd       m0, [pd_32]
    psrad       m0, 6
    pshufhw     m0, m0, 0
    pshuflw     m0, m0, 0
    lea         r6, [r3*3]
    mova        m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10,5,7,8
    movsxdifnidn r3, r3d
    ADD16_OP_INTRA  0, 4+1*8
    ADD16_OP_INTRA  2, 4+2*8
    ADD16_OP_INTRA  4, 6+1*8
    ADD16_OP_INTRA  6, 6+2*8
    ADD16_OP_INTRA  8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC  8
    AC 10
    AC 12
    AC 14
    AC  0
    AC  2
    AC  4
    AC  6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov         r7, r0
%endif
    add         r2, 1024
    mov         r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add         r2, 1024-128*2
%if ARCH_X86_64
    mov         r0, [r7+gprsize]
%else
    mov         r0, r0m
    mov         r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%assign last_block 44

%macro IDCT_ADD8_422 0

cglobal h264_idct_add8_422_10, 5, 8, 7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov         r7, r0
%endif

    add         r2, 1024
    mov         r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
    ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
    add         r2, 1024-128*4

%if ARCH_X86_64
    mov         r0, [r7+gprsize]
%else
    mov         r0, r0m
    mov         r0, [r0+gprsize]
%endif

    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    ADD16_OP_INTRA 40, 4+13*8 ; i+4
    ADD16_OP_INTRA 42, 4+14*8 ; i+4
    REP_RET
    AC 16
    AC 18
    AC 24 ; i+4
    AC 26 ; i+4
    AC 32
    AC 34
    AC 40 ; i+4
    AC 42 ; i+4

%endmacro

INIT_XMM sse2
IDCT_ADD8_422
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8_422
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_1D 2
    SWAP 0, 1
    psrad       m4, m5, 1
    psrad       m1, m0, 1
    paddd       m4, m5
    paddd       m1, m0
    paddd       m4, m7
    paddd       m1, m5
    psubd       m4, m0
    paddd       m1, m3

    psubd       m0, m3
    psubd       m5, m3
    paddd       m0, m7
    psubd       m5, m7
    psrad       m3, 1
    psrad       m7, 1
    psubd       m0, m3
    psubd       m5, m7

    SWAP 1, 7
    psrad       m1, m7, 2
    psrad       m3, m4, 2
    paddd       m3, m0
    psrad       m0, 2
    paddd       m1, m5
    psrad       m5, 2
    psubd       m0, m4
    psubd       m7, m5

    SWAP 5, 6
    psrad       m4, m2, 1
    psrad       m6, m5, 1
    psubd       m4, m5
    paddd       m6, m2

    mova        m2, %1
    mova        m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova        m7, [%1+112*2]
    mova        m6, [%1+ 96*2]
    mova        m5, [%1+ 80*2]
    mova        m3, [%1+ 48*2]
    mova        m2, [%1+ 32*2]
    mova        m1, [%1+ 16*2]
    IDCT8_1D [%1], [%1+ 64*2]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D 0,1,2,3,8
    mova [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova [%2+8*2], m4
%else
    mova      [%1], m7
    TRANSPOSE4x4D 0,1,2,3,7
    mova        m7, [%1]
    mova [%2     ], m0
    mova [%2+16*2], m1
    mova [%2+32*2], m2
    mova [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova [%2+ 8*2], m4
    mova [%2+24*2], m5
    mova [%2+40*2], m6
    mova [%2+56*2], m7
%endif
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova [%2     ], m6
    mova [%2+16*2], m7

    pxor        m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova        m0, [%2     ]
    mova        m1, [%2+16*2]
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
    movsxdifnidn r2, r2d
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub        rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add        rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub        rsp, pad
    add dword [r1], 32

%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP 0, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    psrad       m8, 6
    psrad       m0, 6
    packssdw    m8, m0
    paddsw      m8, [r0]
    pxor        m0, m0
    mova [r1+  0], m0
    mova [r1+ 16], m0
    mova [r1+ 32], m0
    mova [r1+ 48], m0
    mova [r1+ 64], m0
    mova [r1+ 80], m0
    mova [r1+ 96], m0
    mova [r1+112], m0
    mova [r1+128], m0
    mova [r1+144], m0
    mova [r1+160], m0
    mova [r1+176], m0
    mova [r1+192], m0
    mova [r1+208], m0
    mova [r1+224], m0
    mova [r1+240], m0
    CLIPW       m8, m0, [pw_pixel_max]
    mova      [r0], m8
    mova        m8, [pw_pixel_max]
    STORE_DIFF16 m9, m1, m0, m8, r0+r2
    lea         r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea         r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea         r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea         r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova [r1+  0], m7
    mova [r1+ 16], m7
    mova [r1+ 32], m7
    mova [r1+ 48], m7
    mova [r1+ 64], m7
    mova [r1+ 80], m7
    mova [r1+ 96], m7
    mova [r1+112], m7
    mova [r1+128], m7
    mova [r1+144], m7
    mova [r1+160], m7
    mova [r1+176], m7
    mova [r1+192], m7
    mova [r1+208], m7
    mova [r1+224], m7
    mova [r1+240], m7
%endif ; ARCH_X86_64

    add        rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro IDCT8_ADD4_OP 2
    cmp byte [r4+%2], 0
    jz .skipblock%1
    mov        r0d, [r6+%1*4]
    add         r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add         r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    movsxdifnidn r3, r3d
    %assign pad 16-gprsize-(stack_offset&15)
    SUB        rsp, pad
    mov         r5, r0mp
    mov         r6, r1mp
    mov         r1, r2mp
    mov        r2d, r3m
    movifnidn   r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD        rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif
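The 4x4 path above is the standard H.264 integer inverse transform: one 1-D pass over rows, a transpose, a +32 bias added to the DC term (pd_32), a second 1-D pass over columns, then psrad by 6 and a clipped add to the 10-bit pixels. A scalar C model of the butterfly that IDCT4_1D expands to:

/* One 1-D pass of the H.264 4x4 inverse transform over b0..b3. */
static void idct4_1d(int *b0, int *b1, int *b2, int *b3)
{
    int z0 = *b0 + *b2;
    int z1 = *b0 - *b2;
    int z2 = (*b1 >> 1) - *b3;
    int z3 = *b1 + (*b3 >> 1);

    *b0 = z0 + z3;
    *b1 = z1 + z2;
    *b2 = z1 - z2;
    *b3 = z0 - z3;
}

Because the bias is added once before the second pass, the final >> 6 rounds to nearest without a separate rounding instruction, which is why STORE_DIFFx2 can go straight from psrad to packssdw.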
2757
externals/ffmpeg/libavcodec/x86/h264_intrapred.asm
vendored
Executable file
File diff suppressed because it is too large
1199
externals/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
vendored
Executable file
File diff suppressed because it is too large
410
externals/ffmpeg/libavcodec/x86/h264_intrapred_init.c
vendored
Executable file
@@ -0,0 +1,410 @@
/*
 * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"

#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                    const uint8_t *topright, \
                                                    ptrdiff_t stride);

PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)

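Each PRED4x4 line above only declares a prototype for an assembly entry point; for example, PRED4x4(dc, 10, mmxext) expands to:

void ff_pred4x4_dc_10_mmxext(uint8_t *src, const uint8_t *topright,
                             ptrdiff_t stride);

The PRED8x8, PRED8x8L and PRED16x16 macros below follow the same token-pasting scheme with their respective argument lists.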
#define PRED8x8(TYPE, DEPTH, OPT) \
|
||||
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
|
||||
ptrdiff_t stride);
|
||||
|
||||
PRED8x8(dc, 10, mmxext)
|
||||
PRED8x8(dc, 10, sse2)
|
||||
PRED8x8(top_dc, 10, sse2)
|
||||
PRED8x8(plane, 10, sse2)
|
||||
PRED8x8(vertical, 10, sse2)
|
||||
PRED8x8(horizontal, 10, sse2)
|
||||
|
||||
#define PRED8x8L(TYPE, DEPTH, OPT)\
|
||||
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
|
||||
int has_topleft, \
|
||||
int has_topright, \
|
||||
ptrdiff_t stride);
|
||||
|
||||
PRED8x8L(dc, 10, sse2)
|
||||
PRED8x8L(dc, 10, avx)
|
||||
PRED8x8L(128_dc, 10, mmxext)
|
||||
PRED8x8L(128_dc, 10, sse2)
|
||||
PRED8x8L(top_dc, 10, sse2)
|
||||
PRED8x8L(top_dc, 10, avx)
|
||||
PRED8x8L(vertical, 10, sse2)
|
||||
PRED8x8L(vertical, 10, avx)
|
||||
PRED8x8L(horizontal, 10, sse2)
|
||||
PRED8x8L(horizontal, 10, ssse3)
|
||||
PRED8x8L(horizontal, 10, avx)
|
||||
PRED8x8L(down_left, 10, sse2)
|
||||
PRED8x8L(down_left, 10, ssse3)
|
||||
PRED8x8L(down_left, 10, avx)
|
||||
PRED8x8L(down_right, 10, sse2)
|
||||
PRED8x8L(down_right, 10, ssse3)
|
||||
PRED8x8L(down_right, 10, avx)
|
||||
PRED8x8L(vertical_right, 10, sse2)
|
||||
PRED8x8L(vertical_right, 10, ssse3)
|
||||
PRED8x8L(vertical_right, 10, avx)
|
||||
PRED8x8L(horizontal_up, 10, sse2)
|
||||
PRED8x8L(horizontal_up, 10, ssse3)
|
||||
PRED8x8L(horizontal_up, 10, avx)
|
||||
|
||||
#define PRED16x16(TYPE, DEPTH, OPT)\
|
||||
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
|
||||
ptrdiff_t stride);
|
||||
|
||||
PRED16x16(dc, 10, mmxext)
|
||||
PRED16x16(dc, 10, sse2)
|
||||
PRED16x16(top_dc, 10, mmxext)
|
||||
PRED16x16(top_dc, 10, sse2)
|
||||
PRED16x16(128_dc, 10, mmxext)
|
||||
PRED16x16(128_dc, 10, sse2)
|
||||
PRED16x16(left_dc, 10, mmxext)
|
||||
PRED16x16(left_dc, 10, sse2)
|
||||
PRED16x16(vertical, 10, mmxext)
|
||||
PRED16x16(vertical, 10, sse2)
|
||||
PRED16x16(horizontal, 10, mmxext)
|
||||
PRED16x16(horizontal, 10, sse2)
|
||||
|
||||
/* 8-bit versions */
|
||||
PRED16x16(vertical, 8, mmx)
|
||||
PRED16x16(vertical, 8, sse)
|
||||
PRED16x16(horizontal, 8, mmx)
|
||||
PRED16x16(horizontal, 8, mmxext)
|
||||
PRED16x16(horizontal, 8, ssse3)
|
||||
PRED16x16(dc, 8, mmxext)
|
||||
PRED16x16(dc, 8, sse2)
|
||||
PRED16x16(dc, 8, ssse3)
|
||||
PRED16x16(plane_h264, 8, mmx)
|
||||
PRED16x16(plane_h264, 8, mmxext)
|
||||
PRED16x16(plane_h264, 8, sse2)
|
||||
PRED16x16(plane_h264, 8, ssse3)
|
||||
PRED16x16(plane_rv40, 8, mmx)
|
||||
PRED16x16(plane_rv40, 8, mmxext)
|
||||
PRED16x16(plane_rv40, 8, sse2)
|
||||
PRED16x16(plane_rv40, 8, ssse3)
|
||||
PRED16x16(plane_svq3, 8, mmx)
|
||||
PRED16x16(plane_svq3, 8, mmxext)
|
||||
PRED16x16(plane_svq3, 8, sse2)
|
||||
PRED16x16(plane_svq3, 8, ssse3)
|
||||
PRED16x16(tm_vp8, 8, mmx)
|
||||
PRED16x16(tm_vp8, 8, mmxext)
|
||||
PRED16x16(tm_vp8, 8, sse2)
|
||||
PRED16x16(tm_vp8, 8, avx2)
|
||||
|
||||
PRED8x8(top_dc, 8, mmxext)
|
||||
PRED8x8(dc_rv40, 8, mmxext)
|
||||
PRED8x8(dc, 8, mmxext)
|
||||
PRED8x8(vertical, 8, mmx)
|
||||
PRED8x8(horizontal, 8, mmx)
|
||||
PRED8x8(horizontal, 8, mmxext)
|
||||
PRED8x8(horizontal, 8, ssse3)
|
||||
PRED8x8(plane, 8, mmx)
|
||||
PRED8x8(plane, 8, mmxext)
|
||||
PRED8x8(plane, 8, sse2)
|
||||
PRED8x8(plane, 8, ssse3)
|
||||
PRED8x8(tm_vp8, 8, mmx)
|
||||
PRED8x8(tm_vp8, 8, mmxext)
|
||||
PRED8x8(tm_vp8, 8, sse2)
|
||||
PRED8x8(tm_vp8, 8, ssse3)
|
||||
|
||||
PRED8x8L(top_dc, 8, mmxext)
|
||||
PRED8x8L(top_dc, 8, ssse3)
|
||||
PRED8x8L(dc, 8, mmxext)
|
||||
PRED8x8L(dc, 8, ssse3)
|
||||
PRED8x8L(horizontal, 8, mmxext)
|
||||
PRED8x8L(horizontal, 8, ssse3)
|
||||
PRED8x8L(vertical, 8, mmxext)
|
||||
PRED8x8L(vertical, 8, ssse3)
|
||||
PRED8x8L(down_left, 8, mmxext)
|
||||
PRED8x8L(down_left, 8, sse2)
|
||||
PRED8x8L(down_left, 8, ssse3)
|
||||
PRED8x8L(down_right, 8, mmxext)
|
||||
PRED8x8L(down_right, 8, sse2)
|
||||
PRED8x8L(down_right, 8, ssse3)
|
||||
PRED8x8L(vertical_right, 8, mmxext)
|
||||
PRED8x8L(vertical_right, 8, sse2)
|
||||
PRED8x8L(vertical_right, 8, ssse3)
|
||||
PRED8x8L(vertical_left, 8, sse2)
|
||||
PRED8x8L(vertical_left, 8, ssse3)
|
||||
PRED8x8L(horizontal_up, 8, mmxext)
|
||||
PRED8x8L(horizontal_up, 8, ssse3)
|
||||
PRED8x8L(horizontal_down, 8, mmxext)
|
||||
PRED8x8L(horizontal_down, 8, sse2)
|
||||
PRED8x8L(horizontal_down, 8, ssse3)
|
||||
|
||||
PRED4x4(dc, 8, mmxext)
|
||||
PRED4x4(down_left, 8, mmxext)
|
||||
PRED4x4(down_right, 8, mmxext)
|
||||
PRED4x4(vertical_left, 8, mmxext)
|
||||
PRED4x4(vertical_right, 8, mmxext)
|
||||
PRED4x4(horizontal_up, 8, mmxext)
|
||||
PRED4x4(horizontal_down, 8, mmxext)
|
||||
PRED4x4(tm_vp8, 8, mmx)
|
||||
PRED4x4(tm_vp8, 8, mmxext)
|
||||
PRED4x4(tm_vp8, 8, ssse3)
|
||||
PRED4x4(vertical_vp8, 8, mmxext)
|
||||
|
||||
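/* Runtime dispatch note: the generic init in h264pred.c fills H264PredContext
 * with portable C defaults first and is expected to call this function
 * afterwards, so each assignment below overrides an entry with the fastest
 * variant the detected CPU supports; later (newer-ISA) blocks deliberately
 * clobber earlier ones. */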
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                                   const int bit_depth,
                                   const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(cpu_flags)) {
            h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
            h->pred16x16[HOR_PRED8x8  ] = ff_pred16x16_horizontal_8_mmx;
            if (chroma_format_idc <= 1) {
                h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
                h->pred8x8 [HOR_PRED8x8  ] = ff_pred8x8_horizontal_8_mmx;
            }
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
                h->pred8x8  [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
                h->pred4x4  [TM_VP8_PRED   ] = ff_pred4x4_tm_vp8_8_mmx;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    if (cpu_flags & AV_CPU_FLAG_CMOV)
                        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
                }
            }
        }

        if (EXTERNAL_MMXEXT(cpu_flags)) {
            h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
            h->pred16x16[DC_PRED8x8  ] = ff_pred16x16_dc_8_mmxext;
            if (chroma_format_idc <= 1)
                h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
            h->pred8x8l [TOP_DC_PRED         ] = ff_pred8x8l_top_dc_8_mmxext;
            h->pred8x8l [DC_PRED             ] = ff_pred8x8l_dc_8_mmxext;
            h->pred8x8l [HOR_PRED            ] = ff_pred8x8l_horizontal_8_mmxext;
            h->pred8x8l [VERT_PRED           ] = ff_pred8x8l_vertical_8_mmxext;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_8_mmxext;
            h->pred8x8l [VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_8_mmxext;
            h->pred8x8l [HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_8_mmxext;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
            h->pred8x8l [HOR_DOWN_PRED       ] = ff_pred8x8l_horizontal_down_8_mmxext;
            h->pred4x4  [DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_8_mmxext;
            h->pred4x4  [VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_8_mmxext;
            h->pred4x4  [HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_8_mmxext;
            h->pred4x4  [DC_PRED             ] = ff_pred4x4_dc_8_mmxext;
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 ||
                codec_id == AV_CODEC_ID_H264) {
                h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
            }
            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
                h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
            }
            if (codec_id != AV_CODEC_ID_RV40) {
                h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
            }
            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
                if (chroma_format_idc <= 1) {
                    h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
                    h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_8_mmxext;
                }
            }
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
                h->pred8x8  [DC_PRED8x8    ] = ff_pred8x8_dc_rv40_8_mmxext;
                h->pred8x8  [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
                h->pred4x4  [TM_VP8_PRED   ] = ff_pred4x4_tm_vp8_8_mmxext;
                h->pred4x4  [VERT_PRED     ] = ff_pred4x4_vertical_vp8_8_mmxext;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
                } else {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
                }
            }
        }

        if (EXTERNAL_SSE(cpu_flags)) {
            h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
        }

        if (EXTERNAL_SSE2(cpu_flags)) {
            h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_8_sse2;
            h->pred8x8l [VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_8_sse2;
            h->pred8x8l [VERT_LEFT_PRED      ] = ff_pred8x8l_vertical_left_8_sse2;
            h->pred8x8l [HOR_DOWN_PRED       ] = ff_pred8x8l_horizontal_down_8_sse2;
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
                h->pred8x8  [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
                }
            }
        }

        if (EXTERNAL_SSSE3(cpu_flags)) {
            h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
            h->pred16x16[DC_PRED8x8  ] = ff_pred16x16_dc_8_ssse3;
            if (chroma_format_idc <= 1)
                h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
            h->pred8x8l [TOP_DC_PRED         ] = ff_pred8x8l_top_dc_8_ssse3;
            h->pred8x8l [DC_PRED             ] = ff_pred8x8l_dc_8_ssse3;
            h->pred8x8l [HOR_PRED            ] = ff_pred8x8l_horizontal_8_ssse3;
            h->pred8x8l [VERT_PRED           ] = ff_pred8x8l_vertical_8_ssse3;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_8_ssse3;
            h->pred8x8l [VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_8_ssse3;
            h->pred8x8l [VERT_LEFT_PRED      ] = ff_pred8x8l_vertical_left_8_ssse3;
            h->pred8x8l [HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_8_ssse3;
            h->pred8x8l [HOR_DOWN_PRED       ] = ff_pred8x8l_horizontal_down_8_ssse3;
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
                h->pred4x4 [TM_VP8_PRED   ] = ff_pred4x4_tm_vp8_8_ssse3;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
                }
            }
        }

        if (EXTERNAL_AVX2(cpu_flags)) {
            if (codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
            }
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            h->pred4x4[DC_PRED     ] = ff_pred4x4_dc_10_mmxext;
            h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;

            if (chroma_format_idc <= 1)
                h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;

            h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;

            h->pred16x16[DC_PRED8x8      ] = ff_pred16x16_dc_10_mmxext;
            h->pred16x16[TOP_DC_PRED8x8  ] = ff_pred16x16_top_dc_10_mmxext;
            h->pred16x16[DC_128_PRED8x8  ] = ff_pred16x16_128_dc_10_mmxext;
            h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
            h->pred16x16[VERT_PRED8x8    ] = ff_pred16x16_vertical_10_mmxext;
            h->pred16x16[HOR_PRED8x8     ] = ff_pred16x16_horizontal_10_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_sse2;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_sse2;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_sse2;

            if (chroma_format_idc <= 1) {
                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_10_sse2;
                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
                h->pred8x8[PLANE_PRED8x8  ] = ff_pred8x8_plane_10_sse2;
                h->pred8x8[VERT_PRED8x8   ] = ff_pred8x8_vertical_10_sse2;
                h->pred8x8[HOR_PRED8x8    ] = ff_pred8x8_horizontal_10_sse2;
            }

            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_sse2;
            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_sse2;
            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_sse2;
            h->pred8x8l[DC_128_PRED         ] = ff_pred8x8l_128_dc_10_sse2;
            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_sse2;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_sse2;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_sse2;

            h->pred16x16[DC_PRED8x8      ] = ff_pred16x16_dc_10_sse2;
            h->pred16x16[TOP_DC_PRED8x8  ] = ff_pred16x16_top_dc_10_sse2;
            h->pred16x16[DC_128_PRED8x8  ] = ff_pred16x16_128_dc_10_sse2;
            h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
            h->pred16x16[VERT_PRED8x8    ] = ff_pred16x16_vertical_10_sse2;
            h->pred16x16[HOR_PRED8x8     ] = ff_pred16x16_horizontal_10_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_ssse3;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_ssse3;

            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_ssse3;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_ssse3;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_ssse3;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_avx;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_avx;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_avx;

            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_avx;
            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_avx;
            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_avx;
            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_avx;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_avx;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_avx;
        }
    }
}
634
externals/ffmpeg/libavcodec/x86/h264_qpel.c
vendored
Executable file
@@ -0,0 +1,634 @@
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 * Copyright (c) 2011 Daniel Kang
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dec.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/pixels.h"
#include "fpel.h"

#if HAVE_X86ASM
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2  ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx

#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);

DEF_QPEL(avg)
DEF_QPEL(put)

static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_mmxext(int16_t *tmp, const uint8_t *src, int tmpStride, int srcStride, int size)
{
    int w = (size + 8) >> 2;
    src -= 2 * srcStride + 2;
    while (w--) {
        ff_put_h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);
        tmp += 4;
        src += 4;
    }
}
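/* Note: the *_lowpass helpers in this file implement the H.264 six-tap
 * (1,-5,20,20,-5,1) half-pel interpolation filter; the _l2 variants
 * additionally average the filtered result with a second reference, which is
 * how the quarter-pel positions are derived from the half-pel ones. */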

#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
        tmp += 4;\
        src += 4;\
    }\
    tmp -= 3*4;\
    ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
    src += 4;\
    dst += 4;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
        tmp += 8;\
        dst += 8;\
    }while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\


#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\

void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);

#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64

#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
                                                                 const uint8_t *src,
                                                                 int tmpStride,
                                                                 int srcStride,
                                                                 int size)
{
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
        tmp += 8;
        src += 8;
    }
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define ff_put_h264_qpel8_h_lowpass_l2_sse2  ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2  ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext

#define ff_put_h264_qpel8_v_lowpass_ssse3  ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3  ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2

#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

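/* Naming note: the mcXY suffix encodes the quarter-pel sample position, with
 * X the horizontal and Y the vertical offset in quarter pixels; mc00 is the
 * full-pel copy, mc20/mc02 the horizontal/vertical half-pel samples, and
 * mc22 the centre position. */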
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
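
/* In the mc12/mc32 cases above, halfV holds the unscaled 16-bit output of the
 * vertical six-tap pass and halfHV the fully filtered centre sample;
 * pixels_l2_shift5 scales halfV back down (>> 5) while averaging it with
 * halfHV. The +2/+3 column offsets appear to select the correct phase inside
 * the padded intermediate buffer (a reading of the code, not a spec; see the
 * assembly for the details). */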

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)

H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)


//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

#define LUMA_MC_816(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)

LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)

#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride){\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
    src += 8*stride;\
    dst += 8*stride;\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}

#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)

#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)

#if ARCH_X86_32 // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif

#endif /* HAVE_X86ASM */

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
    do { \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU) \
    do { \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU) \
    do { \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
    } while (0)

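/* The init below checks ISA extensions from oldest to newest, so a later,
 * faster match simply overwrites the table entries installed by an earlier
 * one. */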
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_X86ASM
    int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
        } else if (bit_depth == 10) {
#if ARCH_X86_32
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 1, sse2);
            H264_QPEL_FUNCS(0, 2, sse2);
            H264_QPEL_FUNCS(0, 3, sse2);
            H264_QPEL_FUNCS(1, 1, sse2);
            H264_QPEL_FUNCS(1, 2, sse2);
            H264_QPEL_FUNCS(1, 3, sse2);
            H264_QPEL_FUNCS(2, 1, sse2);
            H264_QPEL_FUNCS(2, 2, sse2);
            H264_QPEL_FUNCS(2, 3, sse2);
            H264_QPEL_FUNCS(3, 1, sse2);
            H264_QPEL_FUNCS(3, 2, sse2);
            H264_QPEL_FUNCS(3, 3, sse2);
        }

        if (bit_depth == 10) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
    }

    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(1, 0, ssse3);
            H264_QPEL_FUNCS(1, 1, ssse3);
            H264_QPEL_FUNCS(1, 2, ssse3);
            H264_QPEL_FUNCS(1, 3, ssse3);
            H264_QPEL_FUNCS(2, 0, ssse3);
            H264_QPEL_FUNCS(2, 1, ssse3);
            H264_QPEL_FUNCS(2, 2, ssse3);
            H264_QPEL_FUNCS(2, 3, ssse3);
            H264_QPEL_FUNCS(3, 0, ssse3);
            H264_QPEL_FUNCS(3, 1, ssse3);
            H264_QPEL_FUNCS(3, 2, ssse3);
            H264_QPEL_FUNCS(3, 3, ssse3);
        }

        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
        }
    }

    if (EXTERNAL_AVX(cpu_flags)) {
        /* AVX implies 64 byte cache lines without the need to avoid unaligned
         * memory accesses that cross the boundary between two cache lines.
         * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
         * having to treat SSE2 functions with such properties as AVX. */
        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }
    }
#endif
}
884
externals/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
vendored
Executable file
@@ -0,0 +1,884 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pd_65535
cextern pw_1023
%define pw_pixel_max pw_1023
cextern pw_16
cextern pw_1
cextern pb_0

pad10:  times 8 dw 10*1023
pad20:  times 8 dw 20*1023
pad30:  times 8 dw 30*1023
depad:  times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad:  times 8 dw 16*1022/32 ; needs to be mod 16

tap1: times 4 dw  1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5,  1

SECTION .text


%macro AVG_MOV 2
    pavgw %2, %1
    mova  %1, %2
%endmacro

%macro ADDW 3
%if mmsize == 8
    paddw %1, %2
%else
    movu  %3, %2
    paddw %1, %3
%endif
%endmacro

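; FILT_H takes the three symmetric tap-pair sums of the six-tap
; (1,-5,20,20,-5,1) H.264 filter (cf. tap1/tap2/tap3 above):
;   %1 = a = x[-2]+x[3], %2 = b = x[-1]+x[2], %3 = c = x[0]+x[1]
; and evaluates (a - 5*b + 20*c)/16 using only adds and shifts.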
%macro FILT_H 4
    paddw %1, %4
    psubw %1, %2  ; a-b
    psraw %1, 2   ; (a-b)/4
    psubw %1, %2  ; (a-b)/4-b
    paddw %1, %3  ; (a-b)/4-b+c
    psraw %1, 2   ; ((a-b)/4-b+c)/4
    paddw %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

%macro PRELOAD_V 0
    lea  r3, [r2*3]
    sub  r1, r3
    movu m0, [r1+r2]
    movu m1, [r1+r2*2]
    add  r1, r3
    movu m2, [r1]
    movu m3, [r1+r2]
    movu m4, [r1+r2*2]
    add  r1, r3
%endmacro

%macro FILT_V 8
    movu  %6, [r1]
    paddw %1, %6
    mova  %7, %2
    paddw %7, %5
    mova  %8, %3
    paddw %8, %4
    FILT_H %1, %7, %8, [pw_16]
    psraw %1, 1
    CLIPW %1, [pb_0], [pw_pixel_max]
%endmacro

%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro

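; MCAxA_OP emits the double-size (%4 = 2*%3) function as four calls to the
; %3-sized stub, one per quadrant. On UNIX64 the fourth call is omitted and
; control falls straight through into the stub that cglobal_mc lays out next
; (hence the "fall through to function" comment below).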
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    add  r0, %3*2
    add  r1, %3*2
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3]
    lea  r1, [r1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3+%3*2]
    lea  r1, [r1+r2*%3+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    mov  r%6, r0
%assign p1 %6+1
    mov  r %+ p1, r1
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+%3*2]
    lea  r1, [r %+ p1+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3]
    lea  r1, [r %+ p1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3+%3*2]
    lea  r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif
%endif
%endmacro

;cpu, put/avg, mc, 4/8, ...
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif

cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif

stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
    movu m0, [r1     ]
    OP_MOV [r0     ], m0
    movu m0, [r1+r2  ]
    OP_MOV [r0+r2  ], m0
    movu m0, [r1+r2*2]
    OP_MOV [r0+r2*2], m0
    movu m0, [r1+r3  ]
    OP_MOV [r0+r3  ], m0
%endmacro

%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
    lea  r3, [r2*3]
    COPY4
    ret

INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
    lea  r3, [r2*3]
    COPY4
    lea  r0, [r0+r2*4]
    lea  r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10, 3,4
    mov r3d, 8
.loop:
    movu m0, [r1      ]
    movu m1, [r1   +16]
    OP_MOV [r0      ], m0
    OP_MOV [r0   +16], m1
    movu m0, [r1+r2   ]
    movu m1, [r1+r2+16]
    OP_MOV [r0+r2   ], m0
    OP_MOV [r0+r2+16], m1
    lea  r0, [r0+r2*2]
    lea  r1, [r1+r2*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
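; The cache64 variants instantiated by MC_CACHE target CPUs where an
; unaligned load that straddles a 64-byte cache line is expensive; they are
; only installed at init time for such CPUs (an assumption based on the
; naming and on the cacheline note in h264_qpel.c).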
%macro MC_CACHE 1
|
||||
%define OP_MOV mova
|
||||
INIT_MMX mmxext
|
||||
%1 put, 4
|
||||
INIT_XMM sse2, cache64
|
||||
%1 put, 8
|
||||
INIT_XMM ssse3, cache64
|
||||
%1 put, 8
|
||||
INIT_XMM sse2
|
||||
%1 put, 8
|
||||
|
||||
%define OP_MOV AVG_MOV
|
||||
INIT_MMX mmxext
|
||||
%1 avg, 4
|
||||
INIT_XMM sse2, cache64
|
||||
%1 avg, 8
|
||||
INIT_XMM ssse3, cache64
|
||||
%1 avg, 8
|
||||
INIT_XMM sse2
|
||||
%1 avg, 8
|
||||
%endmacro
|
||||
|
||||
%macro MC20 2
|
||||
cglobal_mc %1, mc20, %2, 3,4,9
|
||||
mov r3d, %2
|
||||
mova m1, [pw_pixel_max]
|
||||
%if num_mmregs > 8
|
||||
mova m8, [pw_16]
|
||||
%define p16 m8
|
||||
%else
|
||||
%define p16 [pw_16]
|
||||
%endif
|
||||
.nextrow:
|
||||
%if %0 == 4
|
||||
movu m2, [r1-4]
|
||||
movu m3, [r1-2]
|
||||
movu m4, [r1+0]
|
||||
ADDW m2, [r1+6], m5
|
||||
ADDW m3, [r1+4], m5
|
||||
ADDW m4, [r1+2], m5
|
||||
%else ; movu is slow on these processors
|
||||
%if mmsize==16
|
||||
movu m2, [r1-4]
|
||||
movu m0, [r1+6]
|
||||
mova m6, m0
|
||||
psrldq m0, 6
|
||||
|
||||
paddw m6, m2
|
||||
PALIGNR m3, m0, m2, 2, m5
|
||||
PALIGNR m7, m0, m2, 8, m5
|
||||
paddw m3, m7
|
||||
PALIGNR m4, m0, m2, 4, m5
|
||||
PALIGNR m7, m0, m2, 6, m5
|
||||
paddw m4, m7
|
||||
SWAP 2, 6
|
||||
%else
|
||||
movu m2, [r1-4]
|
||||
movu m6, [r1+4]
|
||||
PALIGNR m3, m6, m2, 2, m5
|
||||
paddw m3, m6
|
||||
PALIGNR m4, m6, m2, 4, m5
|
||||
PALIGNR m7, m6, m2, 6, m5
|
||||
paddw m4, m7
|
||||
paddw m2, [r1+6]
|
||||
%endif
|
||||
%endif
|
||||
|
||||
FILT_H m2, m3, m4, p16
|
||||
psraw m2, 1
|
||||
pxor m0, m0
|
||||
CLIPW m2, m0, m1
|
||||
OP_MOV [r0], m2
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jg .nextrow
|
||||
rep ret
|
||||
%endmacro
|
||||
|
||||
MC_CACHE MC20

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea      r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov      r4, r1
.body:
    mov      r3d, %2
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP     2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    movu     m3, [r4]
    pavgw    m2, m3
    OP_MOV   [r0], m2
    add      r0, r2
    add      r1, r2
    add      r4, r2
    dec      r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10
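
; mc10 and mc30 are the horizontal quarter-pel positions: the mc20-style
; half-pel result is pavgw'd with the nearest full-pel column, read via r4.
; mc30 just enters the mc10 body with r4 advanced by 2 bytes (one 10-bit pixel).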

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10:
    add      r4, r2
.no_addr4:
    FILT_V   m0, m1, m2, m3, m4, m5, m6, m7
    add      r1, r2
    add      r0, r2
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC02
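
; The v_filt* stubs are emitted once per register rotation: the SWAP after each
; instantiation renames m0..m5, so a caller that rotates its own registers in
; step (as the %rep loops here do) can call stub i = row%6 directly and keep
; its six-row sliding window in place. mc02 is the pure vertical half-pel case.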

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r4, r2
    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    movu     m7, [r4]
    pavgw    m0, m7
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC01

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea      r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03
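
; mc01/mc03 are the vertical quarter-pel positions: the FILT_V half-pel result
; is pavgw'd with the unfiltered source row read via r4; mc03 reuses the mc01
; body with r4 one stride further down, mirroring the mc10/mc30 pairing above.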

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers, averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
    movu     m5, [r4-4]
    ADDW     m5, [r4+6], m7
    movu     m6, [r4-2]
    ADDW     m6, [r4+4], m7
    paddw    m5, [pw_16]
    psubw    m5, m6  ; a-b
    psraw    m5, 2   ; (a-b)/4
    psubw    m5, m6  ; (a-b)/4-b
    movu     m6, [r4+0]
    ADDW     m6, [r4+2], m7
    paddw    m5, m6  ; (a-b)/4-b+c
    psraw    m5, 2   ; ((a-b)/4-b+c)/4
    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw    m5, 1
    CLIPW    m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw    m0, m5
%if %0!=4
    movu     m5, [r1+r5]
%endif
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r0, r2
    sub      r4, r2
    mov      r5, r2
    neg      r5
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu     m5, [r1+r5]
%endif
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC11
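
; mc11 runs both filters per row: the v_filt* stub produces the vertical
; half-pel in m0, then h_filt* computes the horizontal tap and pavgw's the two.
; As the comments above say, with only 8 vector registers m5 cannot stay live
; across both calls and has to be re-read from memory -- hence "needs x86_64".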
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro MC31 2
|
||||
cglobal_mc %1, mc31, %2, 3,6,8
|
||||
mov r4, r1
|
||||
add r1, 2
|
||||
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
|
||||
%endmacro
|
||||
|
||||
MC MC31
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro MC13 2
|
||||
cglobal_mc %1, mc13, %2, 3,7,12
|
||||
lea r4, [r1+r2]
|
||||
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
|
||||
%endmacro
|
||||
|
||||
MC MC13
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro MC33 2
|
||||
cglobal_mc %1, mc33, %2, 3,6,8
|
||||
lea r4, [r1+r2]
|
||||
add r1, 2
|
||||
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
|
||||
%endmacro
|
||||
|
||||
MC MC33
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro FILT_H2 3
|
||||
psubw %1, %2 ; a-b
|
||||
psubw %2, %3 ; b-c
|
||||
psllw %2, 2
|
||||
psubw %1, %2 ; a-5*b+4*c
|
||||
psllw %3, 4
|
||||
paddw %1, %3 ; a-5*b+20*c
|
||||
%endmacro
|
||||
|
||||
%macro FILT_VNRD 8
|
||||
movu %6, [r1]
|
||||
paddw %1, %6
|
||||
mova %7, %2
|
||||
paddw %7, %5
|
||||
mova %8, %3
|
||||
paddw %8, %4
|
||||
FILT_H2 %1, %7, %8
|
||||
%endmacro
|
||||
|
||||
%macro HV 1
|
||||
%if mmsize==16
|
||||
%define PAD 12
|
||||
%define COUNT 2
|
||||
%else
|
||||
%define PAD 4
|
||||
%define COUNT 3
|
||||
%endif
|
||||
put_hv%1_10:
|
||||
neg r2 ; This actually saves instructions
|
||||
lea r1, [r1+r2*2-mmsize+PAD]
|
||||
lea r4, [rsp+PAD+gprsize]
|
||||
mov r3d, COUNT
|
||||
.v_loop:
|
||||
movu m0, [r1]
|
||||
sub r1, r2
|
||||
movu m1, [r1]
|
||||
sub r1, r2
|
||||
movu m2, [r1]
|
||||
sub r1, r2
|
||||
movu m3, [r1]
|
||||
sub r1, r2
|
||||
movu m4, [r1]
|
||||
sub r1, r2
|
||||
%assign i 0
|
||||
%rep %1-1
|
||||
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
|
||||
psubw m0, [pad20]
|
||||
movu [r4+i*mmsize*3], m0
|
||||
sub r1, r2
|
||||
SWAP 0,1,2,3,4,5
|
||||
%assign i i+1
|
||||
%endrep
|
||||
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
|
||||
psubw m0, [pad20]
|
||||
movu [r4+i*mmsize*3], m0
|
||||
add r4, mmsize
|
||||
lea r1, [r1+r2*8+mmsize]
|
||||
%if %1==8
|
||||
lea r1, [r1+r2*4]
|
||||
%endif
|
||||
dec r3d
|
||||
jg .v_loop
|
||||
neg r2
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
HV 4
|
||||
INIT_XMM sse2
|
||||
HV 8
|
||||
|
||||
%macro H_LOOP 1
|
||||
%if num_mmregs > 8
|
||||
%define s1 m8
|
||||
%define s2 m9
|
||||
%define s3 m10
|
||||
%define d1 m11
|
||||
%else
|
||||
%define s1 [tap1]
|
||||
%define s2 [tap2]
|
||||
%define s3 [tap3]
|
||||
%define d1 [depad]
|
||||
%endif
|
||||
h%1_loop_op:
|
||||
movu m1, [r1+mmsize-4]
|
||||
movu m2, [r1+mmsize-2]
|
||||
mova m3, [r1+mmsize+0]
|
||||
movu m4, [r1+mmsize+2]
|
||||
movu m5, [r1+mmsize+4]
|
||||
movu m6, [r1+mmsize+6]
|
||||
%if num_mmregs > 8
|
||||
pmaddwd m1, s1
|
||||
pmaddwd m2, s1
|
||||
pmaddwd m3, s2
|
||||
pmaddwd m4, s2
|
||||
pmaddwd m5, s3
|
||||
pmaddwd m6, s3
|
||||
paddd m1, d1
|
||||
paddd m2, d1
|
||||
%else
|
||||
mova m0, s1
|
||||
pmaddwd m1, m0
|
||||
pmaddwd m2, m0
|
||||
mova m0, s2
|
||||
pmaddwd m3, m0
|
||||
pmaddwd m4, m0
|
||||
mova m0, s3
|
||||
pmaddwd m5, m0
|
||||
pmaddwd m6, m0
|
||||
mova m0, d1
|
||||
paddd m1, m0
|
||||
paddd m2, m0
|
||||
%endif
|
||||
paddd m3, m5
|
||||
paddd m4, m6
|
||||
paddd m1, m3
|
||||
paddd m2, m4
|
||||
psrad m1, 10
|
||||
psrad m2, 10
|
||||
pslld m2, 16
|
||||
pand m1, [pd_65535]
|
||||
por m1, m2
|
||||
%if num_mmregs <= 8
|
||||
pxor m0, m0
|
||||
%endif
|
||||
CLIPW m1, m0, m7
|
||||
add r1, mmsize*3
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
H_LOOP 4
|
||||
INIT_XMM sse2
|
||||
H_LOOP 8
|
||||
|
||||
%macro MC22 2
|
||||
cglobal_mc %1, mc22, %2, 3,7,12
|
||||
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
|
||||
mov r6, rsp ; backup stack pointer
|
||||
and rsp, ~(mmsize-1) ; align stack
|
||||
sub rsp, PAD
|
||||
|
||||
call put_hv%2_10
|
||||
|
||||
mov r3d, %2
|
||||
mova m7, [pw_pixel_max]
|
||||
%if num_mmregs > 8
|
||||
pxor m0, m0
|
||||
mova m8, [tap1]
|
||||
mova m9, [tap2]
|
||||
mova m10, [tap3]
|
||||
mova m11, [depad]
|
||||
%endif
|
||||
mov r1, rsp
|
||||
.h_loop:
|
||||
call h%2_loop_op
|
||||
|
||||
OP_MOV [r0], m1
|
||||
add r0, r2
|
||||
dec r3d
|
||||
jg .h_loop
|
||||
|
||||
mov rsp, r6 ; restore stack pointer
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
MC MC22
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro MC12 2
|
||||
cglobal_mc %1, mc12, %2, 3,7,12
|
||||
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
|
||||
mov r6, rsp ; backup stack pointer
|
||||
and rsp, ~(mmsize-1) ; align stack
|
||||
sub rsp, PAD
|
||||
|
||||
call put_hv%2_10
|
||||
|
||||
xor r4d, r4d
|
||||
.body:
|
||||
mov r3d, %2
|
||||
pxor m0, m0
|
||||
mova m7, [pw_pixel_max]
|
||||
%if num_mmregs > 8
|
||||
mova m8, [tap1]
|
||||
mova m9, [tap2]
|
||||
mova m10, [tap3]
|
||||
mova m11, [depad]
|
||||
%endif
|
||||
mov r1, rsp
|
||||
.h_loop:
|
||||
call h%2_loop_op
|
||||
|
||||
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
|
||||
paddw m3, [depad2]
|
||||
psrlw m3, 5
|
||||
psubw m3, [unpad]
|
||||
CLIPW m3, m0, m7
|
||||
pavgw m1, m3
|
||||
|
||||
OP_MOV [r0], m1
|
||||
add r0, r2
|
||||
dec r3d
|
||||
jg .h_loop
|
||||
|
||||
mov rsp, r6 ; restore stack pointer
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
MC MC12
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro MC32 2
|
||||
cglobal_mc %1, mc32, %2, 3,7,12
|
||||
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
|
||||
mov r6, rsp ; backup stack pointer
|
||||
and rsp, ~(mmsize-1) ; align stack
|
||||
sub rsp, PAD
|
||||
|
||||
call put_hv%2_10
|
||||
|
||||
mov r4d, 2 ; sizeof(pixel)
|
||||
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
|
||||
%endmacro
|
||||
|
||||
MC MC32
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro H_NRD 1
|
||||
put_h%1_10:
|
||||
add rsp, gprsize
|
||||
mov r3d, %1
|
||||
xor r4d, r4d
|
||||
mova m6, [pad20]
|
||||
.nextrow:
|
||||
movu m2, [r5-4]
|
||||
movu m3, [r5-2]
|
||||
movu m4, [r5+0]
|
||||
ADDW m2, [r5+6], m5
|
||||
ADDW m3, [r5+4], m5
|
||||
ADDW m4, [r5+2], m5
|
||||
|
||||
FILT_H2 m2, m3, m4
|
||||
psubw m2, m6
|
||||
mova [rsp+r4], m2
|
||||
add r4d, mmsize*3
|
||||
add r5, r2
|
||||
dec r3d
|
||||
jg .nextrow
|
||||
sub rsp, gprsize
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
H_NRD 4
|
||||
INIT_XMM sse2
|
||||
H_NRD 8
|
||||
|
||||
%macro MC21 2
|
||||
cglobal_mc %1, mc21, %2, 3,7,12
|
||||
mov r5, r1
|
||||
.body:
|
||||
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
|
||||
mov r6, rsp ; backup stack pointer
|
||||
and rsp, ~(mmsize-1) ; align stack
|
||||
|
||||
sub rsp, PAD
|
||||
call put_h%2_10
|
||||
|
||||
sub rsp, PAD
|
||||
call put_hv%2_10
|
||||
|
||||
mov r4d, PAD-mmsize ; H buffer
|
||||
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
|
||||
%endmacro
|
||||
|
||||
MC MC21
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro MC23 2
|
||||
cglobal_mc %1, mc23, %2, 3,7,12
|
||||
lea r5, [r1+r2]
|
||||
jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
|
||||
%endmacro
|
||||
|
||||
MC MC23

862
externals/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
vendored
Executable file
@@ -0,0 +1,862 @@
|
||||
;*****************************************************************************
|
||||
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
|
||||
;*****************************************************************************
|
||||
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
|
||||
;* Copyright (C) 2012 Daniel Kang
|
||||
;*
|
||||
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
cextern pw_16
|
||||
cextern pw_5
|
||||
cextern pb_0
|
||||
|
||||
SECTION .text
|
||||
|
||||
|
||||
%macro op_avgh 3
|
||||
movh %3, %2
|
||||
pavgb %1, %3
|
||||
movh %2, %1
|
||||
%endmacro
|
||||
|
||||
%macro op_avg 2-3
|
||||
pavgb %1, %2
|
||||
mova %2, %1
|
||||
%endmacro
|
||||
|
||||
%macro op_puth 2-3
|
||||
movh %2, %1
|
||||
%endmacro
|
||||
|
||||
%macro op_put 2-3
|
||||
mova %2, %1
|
||||
%endmacro
|
||||
|
||||
%macro QPEL4_H_LOWPASS_OP 1
|
||||
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
|
||||
movsxdifnidn r2, r2d
|
||||
movsxdifnidn r3, r3d
|
||||
pxor m7, m7
|
||||
mova m4, [pw_5]
|
||||
mova m5, [pw_16]
|
||||
mov r4d, 4
|
||||
.loop:
|
||||
movh m1, [r1-1]
|
||||
movh m2, [r1+0]
|
||||
movh m3, [r1+1]
|
||||
movh m0, [r1+2]
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m0, m7
|
||||
paddw m1, m0
|
||||
paddw m2, m3
|
||||
movh m0, [r1-2]
|
||||
movh m3, [r1+3]
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m3, m7
|
||||
paddw m0, m3
|
||||
psllw m2, 2
|
||||
psubw m2, m1
|
||||
pmullw m2, m4
|
||||
paddw m0, m5
|
||||
paddw m0, m2
|
||||
psraw m0, 5
|
||||
packuswb m0, m0
|
||||
op_%1h m0, [r0], m6
|
||||
add r0, r2
|
||||
add r1, r3
|
||||
dec r4d
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL4_H_LOWPASS_OP put
|
||||
QPEL4_H_LOWPASS_OP avg
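
; The 8-bit horizontal lowpass regroups the 6-tap (1,-5,20,20,-5,1) filter as
; (s[-2]+s[3]) + 5*(4*(s[0]+s[1]) - (s[-1]+s[2])) + 16, all >>5, so only one
; pmullw by pw_5 is needed per row; packuswb then saturates back to bytes.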
|
||||
|
||||
%macro QPEL8_H_LOWPASS_OP 1
|
||||
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
|
||||
movsxdifnidn r2, r2d
|
||||
movsxdifnidn r3, r3d
|
||||
mov r4d, 8
|
||||
pxor m7, m7
|
||||
mova m6, [pw_5]
|
||||
.loop:
|
||||
mova m0, [r1]
|
||||
mova m2, [r1+1]
|
||||
mova m1, m0
|
||||
mova m3, m2
|
||||
punpcklbw m0, m7
|
||||
punpckhbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpckhbw m3, m7
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
psllw m0, 2
|
||||
psllw m1, 2
|
||||
mova m2, [r1-1]
|
||||
mova m4, [r1+2]
|
||||
mova m3, m2
|
||||
mova m5, m4
|
||||
punpcklbw m2, m7
|
||||
punpckhbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
punpckhbw m5, m7
|
||||
paddw m2, m4
|
||||
paddw m5, m3
|
||||
psubw m0, m2
|
||||
psubw m1, m5
|
||||
pmullw m0, m6
|
||||
pmullw m1, m6
|
||||
movd m2, [r1-2]
|
||||
movd m5, [r1+7]
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m5, m7
|
||||
paddw m2, m3
|
||||
paddw m4, m5
|
||||
mova m5, [pw_16]
|
||||
paddw m2, m5
|
||||
paddw m4, m5
|
||||
paddw m0, m2
|
||||
paddw m1, m4
|
||||
psraw m0, 5
|
||||
psraw m1, 5
|
||||
packuswb m0, m1
|
||||
op_%1 m0, [r0], m4
|
||||
add r0, r2
|
||||
add r1, r3
|
||||
dec r4d
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL8_H_LOWPASS_OP put
|
||||
QPEL8_H_LOWPASS_OP avg
|
||||
|
||||
%macro QPEL8_H_LOWPASS_OP_XMM 1
|
||||
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
|
||||
movsxdifnidn r2, r2d
|
||||
movsxdifnidn r3, r3d
|
||||
mov r4d, 8
|
||||
pxor m7, m7
|
||||
mova m6, [pw_5]
|
||||
.loop:
|
||||
movu m1, [r1-2]
|
||||
mova m0, m1
|
||||
punpckhbw m1, m7
|
||||
punpcklbw m0, m7
|
||||
mova m2, m1
|
||||
mova m3, m1
|
||||
mova m4, m1
|
||||
mova m5, m1
|
||||
palignr m4, m0, 2
|
||||
palignr m3, m0, 4
|
||||
palignr m2, m0, 6
|
||||
palignr m1, m0, 8
|
||||
palignr m5, m0, 10
|
||||
paddw m0, m5
|
||||
paddw m2, m3
|
||||
paddw m1, m4
|
||||
psllw m2, 2
|
||||
psubw m2, m1
|
||||
paddw m0, [pw_16]
|
||||
pmullw m2, m6
|
||||
paddw m2, m0
|
||||
psraw m2, 5
|
||||
packuswb m2, m2
|
||||
op_%1h m2, [r0], m4
|
||||
add r1, r3
|
||||
add r0, r2
|
||||
dec r4d
|
||||
jne .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
QPEL8_H_LOWPASS_OP_XMM put
|
||||
QPEL8_H_LOWPASS_OP_XMM avg
|
||||
|
||||
|
||||
%macro QPEL4_H_LOWPASS_L2_OP 1
|
||||
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
|
||||
movsxdifnidn r3, r3d
|
||||
movsxdifnidn r4, r4d
|
||||
pxor m7, m7
|
||||
mova m4, [pw_5]
|
||||
mova m5, [pw_16]
|
||||
mov r5d, 4
|
||||
.loop:
|
||||
movh m1, [r1-1]
|
||||
movh m2, [r1+0]
|
||||
movh m3, [r1+1]
|
||||
movh m0, [r1+2]
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m0, m7
|
||||
paddw m1, m0
|
||||
paddw m2, m3
|
||||
movh m0, [r1-2]
|
||||
movh m3, [r1+3]
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m3, m7
|
||||
paddw m0, m3
|
||||
psllw m2, 2
|
||||
psubw m2, m1
|
||||
pmullw m2, m4
|
||||
paddw m0, m5
|
||||
paddw m0, m2
|
||||
movh m3, [r2]
|
||||
psraw m0, 5
|
||||
packuswb m0, m0
|
||||
pavgb m0, m3
|
||||
op_%1h m0, [r0], m6
|
||||
add r0, r3
|
||||
add r1, r3
|
||||
add r2, r4
|
||||
dec r5d
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL4_H_LOWPASS_L2_OP put
|
||||
QPEL4_H_LOWPASS_L2_OP avg
|
||||
|
||||
|
||||
%macro QPEL8_H_LOWPASS_L2_OP 1
|
||||
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
|
||||
movsxdifnidn r3, r3d
|
||||
movsxdifnidn r4, r4d
|
||||
mov r5d, 8
|
||||
pxor m7, m7
|
||||
mova m6, [pw_5]
|
||||
.loop:
|
||||
mova m0, [r1]
|
||||
mova m2, [r1+1]
|
||||
mova m1, m0
|
||||
mova m3, m2
|
||||
punpcklbw m0, m7
|
||||
punpckhbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpckhbw m3, m7
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
psllw m0, 2
|
||||
psllw m1, 2
|
||||
mova m2, [r1-1]
|
||||
mova m4, [r1+2]
|
||||
mova m3, m2
|
||||
mova m5, m4
|
||||
punpcklbw m2, m7
|
||||
punpckhbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
punpckhbw m5, m7
|
||||
paddw m2, m4
|
||||
paddw m5, m3
|
||||
psubw m0, m2
|
||||
psubw m1, m5
|
||||
pmullw m0, m6
|
||||
pmullw m1, m6
|
||||
movd m2, [r1-2]
|
||||
movd m5, [r1+7]
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m5, m7
|
||||
paddw m2, m3
|
||||
paddw m4, m5
|
||||
mova m5, [pw_16]
|
||||
paddw m2, m5
|
||||
paddw m4, m5
|
||||
paddw m0, m2
|
||||
paddw m1, m4
|
||||
psraw m0, 5
|
||||
psraw m1, 5
|
||||
mova m4, [r2]
|
||||
packuswb m0, m1
|
||||
pavgb m0, m4
|
||||
op_%1 m0, [r0], m4
|
||||
add r0, r3
|
||||
add r1, r3
|
||||
add r2, r4
|
||||
dec r5d
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL8_H_LOWPASS_L2_OP put
|
||||
QPEL8_H_LOWPASS_L2_OP avg
|
||||
|
||||
|
||||
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
|
||||
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
|
||||
movsxdifnidn r3, r3d
|
||||
movsxdifnidn r4, r4d
|
||||
mov r5d, 8
|
||||
pxor m7, m7
|
||||
mova m6, [pw_5]
|
||||
.loop:
|
||||
lddqu m1, [r1-2]
|
||||
mova m0, m1
|
||||
punpckhbw m1, m7
|
||||
punpcklbw m0, m7
|
||||
mova m2, m1
|
||||
mova m3, m1
|
||||
mova m4, m1
|
||||
mova m5, m1
|
||||
palignr m4, m0, 2
|
||||
palignr m3, m0, 4
|
||||
palignr m2, m0, 6
|
||||
palignr m1, m0, 8
|
||||
palignr m5, m0, 10
|
||||
paddw m0, m5
|
||||
paddw m2, m3
|
||||
paddw m1, m4
|
||||
psllw m2, 2
|
||||
movh m3, [r2]
|
||||
psubw m2, m1
|
||||
paddw m0, [pw_16]
|
||||
pmullw m2, m6
|
||||
paddw m2, m0
|
||||
psraw m2, 5
|
||||
packuswb m2, m2
|
||||
pavgb m2, m3
|
||||
op_%1h m2, [r0], m4
|
||||
add r1, r3
|
||||
add r0, r3
|
||||
add r2, r4
|
||||
dec r5d
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
QPEL8_H_LOWPASS_L2_OP_XMM put
|
||||
QPEL8_H_LOWPASS_L2_OP_XMM avg
|
||||
|
||||
|
||||
; All functions that call this are required to have function arguments of
|
||||
; dst, src, dstStride, srcStride
|
||||
%macro FILT_V 1
|
||||
mova m6, m2
|
||||
movh m5, [r1]
|
||||
paddw m6, m3
|
||||
psllw m6, 2
|
||||
psubw m6, m1
|
||||
psubw m6, m4
|
||||
punpcklbw m5, m7
|
||||
pmullw m6, [pw_5]
|
||||
paddw m0, [pw_16]
|
||||
add r1, r3
|
||||
paddw m0, m5
|
||||
paddw m6, m0
|
||||
psraw m6, 5
|
||||
packuswb m6, m6
|
||||
op_%1h m6, [r0], m0 ; 1
|
||||
add r0, r2
|
||||
SWAP 0, 1, 2, 3, 4, 5
|
||||
%endmacro
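
; FILT_V keeps six source rows as words in m0..m5 and applies the same
; regrouped 6-tap vertically: 20*(m2+m3) - 5*(m1+m4) + (m0 + new row) + 16,
; then >>5. The trailing SWAP renames the registers so the next invocation
; slides the window down one row with no data movement.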
|
||||
|
||||
%macro QPEL4_V_LOWPASS_OP 1
|
||||
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
|
||||
movsxdifnidn r2, r2d
|
||||
movsxdifnidn r3, r3d
|
||||
sub r1, r3
|
||||
sub r1, r3
|
||||
pxor m7, m7
|
||||
movh m0, [r1]
|
||||
movh m1, [r1+r3]
|
||||
lea r1, [r1+2*r3]
|
||||
movh m2, [r1]
|
||||
movh m3, [r1+r3]
|
||||
lea r1, [r1+2*r3]
|
||||
movh m4, [r1]
|
||||
add r1, r3
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL4_V_LOWPASS_OP put
|
||||
QPEL4_V_LOWPASS_OP avg
|
||||
|
||||
|
||||
|
||||
%macro QPEL8OR16_V_LOWPASS_OP 1
|
||||
%if cpuflag(sse2)
|
||||
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
|
||||
movsxdifnidn r2, r2d
|
||||
movsxdifnidn r3, r3d
|
||||
sub r1, r3
|
||||
sub r1, r3
|
||||
%else
|
||||
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
|
||||
movsxdifnidn r2, r2d
|
||||
movsxdifnidn r3, r3d
|
||||
%endif
|
||||
pxor m7, m7
|
||||
movh m0, [r1]
|
||||
movh m1, [r1+r3]
|
||||
lea r1, [r1+2*r3]
|
||||
movh m2, [r1]
|
||||
movh m3, [r1+r3]
|
||||
lea r1, [r1+2*r3]
|
||||
movh m4, [r1]
|
||||
add r1, r3
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
cmp r4d, 16
|
||||
jne .end
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
FILT_V %1
|
||||
.end:
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL8OR16_V_LOWPASS_OP put
|
||||
QPEL8OR16_V_LOWPASS_OP avg
|
||||
|
||||
INIT_XMM sse2
|
||||
QPEL8OR16_V_LOWPASS_OP put
|
||||
QPEL8OR16_V_LOWPASS_OP avg
|
||||
|
||||
|
||||
; All functions that use this are required to have args:
|
||||
; src, tmp, srcSize
|
||||
%macro FILT_HV 1 ; offset
|
||||
mova m6, m2
|
||||
movh m5, [r0]
|
||||
paddw m6, m3
|
||||
psllw m6, 2
|
||||
paddw m0, [pw_16]
|
||||
psubw m6, m1
|
||||
psubw m6, m4
|
||||
punpcklbw m5, m7
|
||||
pmullw m6, [pw_5]
|
||||
paddw m0, m5
|
||||
add r0, r2
|
||||
paddw m6, m0
|
||||
mova [r1+%1], m6
|
||||
SWAP 0, 1, 2, 3, 4, 5
|
||||
%endmacro
|
||||
|
||||
%macro QPEL4_HV1_LOWPASS_OP 1
|
||||
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
|
||||
movsxdifnidn r2, r2d
|
||||
pxor m7, m7
|
||||
movh m0, [r0]
|
||||
movh m1, [r0+r2]
|
||||
lea r0, [r0+2*r2]
|
||||
movh m2, [r0]
|
||||
movh m3, [r0+r2]
|
||||
lea r0, [r0+2*r2]
|
||||
movh m4, [r0]
|
||||
add r0, r2
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
FILT_HV 0*24
|
||||
FILT_HV 1*24
|
||||
FILT_HV 2*24
|
||||
FILT_HV 3*24
|
||||
RET
|
||||
|
||||
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
|
||||
movsxdifnidn r2, r2d
|
||||
mov r3d, 4
|
||||
.loop:
|
||||
mova m0, [r0]
|
||||
paddw m0, [r0+10]
|
||||
mova m1, [r0+2]
|
||||
paddw m1, [r0+8]
|
||||
mova m2, [r0+4]
|
||||
paddw m2, [r0+6]
|
||||
psubw m0, m1
|
||||
psraw m0, 2
|
||||
psubw m0, m1
|
||||
paddsw m0, m2
|
||||
psraw m0, 2
|
||||
paddw m0, m2
|
||||
psraw m0, 6
|
||||
packuswb m0, m0
|
||||
op_%1h m0, [r1], m7
|
||||
add r0, 24
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL4_HV1_LOWPASS_OP put
|
||||
QPEL4_HV1_LOWPASS_OP avg
|
||||
|
||||
%macro QPEL8OR16_HV1_LOWPASS_OP 1
|
||||
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
|
||||
movsxdifnidn r2, r2d
|
||||
pxor m7, m7
|
||||
movh m0, [r0]
|
||||
movh m1, [r0+r2]
|
||||
lea r0, [r0+2*r2]
|
||||
movh m2, [r0]
|
||||
movh m3, [r0+r2]
|
||||
lea r0, [r0+2*r2]
|
||||
movh m4, [r0]
|
||||
add r0, r2
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
FILT_HV 0*48
|
||||
FILT_HV 1*48
|
||||
FILT_HV 2*48
|
||||
FILT_HV 3*48
|
||||
FILT_HV 4*48
|
||||
FILT_HV 5*48
|
||||
FILT_HV 6*48
|
||||
FILT_HV 7*48
|
||||
cmp r3d, 16
|
||||
jne .end
|
||||
FILT_HV 8*48
|
||||
FILT_HV 9*48
|
||||
FILT_HV 10*48
|
||||
FILT_HV 11*48
|
||||
FILT_HV 12*48
|
||||
FILT_HV 13*48
|
||||
FILT_HV 14*48
|
||||
FILT_HV 15*48
|
||||
.end:
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL8OR16_HV1_LOWPASS_OP put
|
||||
QPEL8OR16_HV1_LOWPASS_OP avg
|
||||
|
||||
INIT_XMM sse2
|
||||
QPEL8OR16_HV1_LOWPASS_OP put
|
||||
|
||||
|
||||
|
||||
%macro QPEL8OR16_HV2_LOWPASS_OP 1
|
||||
; unused is to match ssse3 and mmxext args
|
||||
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
|
||||
movsxdifnidn r2, r2d
|
||||
.loop:
|
||||
mova m0, [r1]
|
||||
mova m3, [r1+8]
|
||||
mova m1, [r1+2]
|
||||
mova m4, [r1+10]
|
||||
paddw m0, m4
|
||||
paddw m1, m3
|
||||
paddw m3, [r1+18]
|
||||
paddw m4, [r1+16]
|
||||
mova m2, [r1+4]
|
||||
mova m5, [r1+12]
|
||||
paddw m2, [r1+6]
|
||||
paddw m5, [r1+14]
|
||||
psubw m0, m1
|
||||
psubw m3, m4
|
||||
psraw m0, 2
|
||||
psraw m3, 2
|
||||
psubw m0, m1
|
||||
psubw m3, m4
|
||||
paddsw m0, m2
|
||||
paddsw m3, m5
|
||||
psraw m0, 2
|
||||
psraw m3, 2
|
||||
paddw m0, m2
|
||||
paddw m3, m5
|
||||
psraw m0, 6
|
||||
psraw m3, 6
|
||||
packuswb m0, m3
|
||||
op_%1 m0, [r0], m7
|
||||
add r1, 48
|
||||
add r0, r2
|
||||
dec r4d
|
||||
jne .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
QPEL8OR16_HV2_LOWPASS_OP put
|
||||
QPEL8OR16_HV2_LOWPASS_OP avg
|
||||
|
||||
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
|
||||
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
|
||||
movsxdifnidn r2, r2d
|
||||
movsxdifnidn r3, r3d
|
||||
cmp r4d, 16
|
||||
je .op16
|
||||
.loop8:
|
||||
mova m1, [r1+16]
|
||||
mova m0, [r1]
|
||||
mova m2, m1
|
||||
mova m3, m1
|
||||
mova m4, m1
|
||||
mova m5, m1
|
||||
palignr m5, m0, 10
|
||||
palignr m4, m0, 8
|
||||
palignr m3, m0, 6
|
||||
palignr m2, m0, 4
|
||||
palignr m1, m0, 2
|
||||
paddw m0, m5
|
||||
paddw m1, m4
|
||||
paddw m2, m3
|
||||
psubw m0, m1
|
||||
psraw m0, 2
|
||||
psubw m0, m1
|
||||
paddw m0, m2
|
||||
psraw m0, 2
|
||||
paddw m0, m2
|
||||
psraw m0, 6
|
||||
packuswb m0, m0
|
||||
op_%1h m0, [r0], m7
|
||||
add r1, 48
|
||||
add r0, r2
|
||||
dec r4d
|
||||
jne .loop8
|
||||
jmp .done
|
||||
.op16:
|
||||
mova m4, [r1+32]
|
||||
mova m5, [r1+16]
|
||||
mova m7, [r1]
|
||||
mova m3, m4
|
||||
mova m2, m4
|
||||
mova m1, m4
|
||||
mova m0, m4
|
||||
palignr m0, m5, 10
|
||||
palignr m1, m5, 8
|
||||
palignr m2, m5, 6
|
||||
palignr m3, m5, 4
|
||||
palignr m4, m5, 2
|
||||
paddw m0, m5
|
||||
paddw m1, m4
|
||||
paddw m2, m3
|
||||
mova m6, m5
|
||||
mova m4, m5
|
||||
mova m3, m5
|
||||
palignr m4, m7, 8
|
||||
palignr m6, m7, 2
|
||||
palignr m3, m7, 10
|
||||
paddw m4, m6
|
||||
mova m6, m5
|
||||
palignr m5, m7, 6
|
||||
palignr m6, m7, 4
|
||||
paddw m3, m7
|
||||
paddw m5, m6
|
||||
psubw m0, m1
|
||||
psubw m3, m4
|
||||
psraw m0, 2
|
||||
psraw m3, 2
|
||||
psubw m0, m1
|
||||
psubw m3, m4
|
||||
paddw m0, m2
|
||||
paddw m3, m5
|
||||
psraw m0, 2
|
||||
psraw m3, 2
|
||||
paddw m0, m2
|
||||
paddw m3, m5
|
||||
psraw m0, 6
|
||||
psraw m3, 6
|
||||
packuswb m3, m0
|
||||
op_%1 m3, [r0], m7
|
||||
add r1, 48
|
||||
add r0, r2
|
||||
dec r4d
|
||||
jne .op16
|
||||
.done:
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
QPEL8OR16_HV2_LOWPASS_OP_XMM put
|
||||
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
|
||||
|
||||
|
||||
%macro PIXELS4_L2_SHIFT5 1
|
||||
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
|
||||
movsxdifnidn r3, r3d
|
||||
movsxdifnidn r4, r4d
|
||||
mova m0, [r1]
|
||||
mova m1, [r1+24]
|
||||
psraw m0, 5
|
||||
psraw m1, 5
|
||||
packuswb m0, m0
|
||||
packuswb m1, m1
|
||||
pavgb m0, [r2]
|
||||
pavgb m1, [r2+r4]
|
||||
op_%1h m0, [r0], m4
|
||||
op_%1h m1, [r0+r3], m5
|
||||
lea r2, [r2+r4*2]
|
||||
lea r0, [r0+r3*2]
|
||||
mova m0, [r1+48]
|
||||
mova m1, [r1+72]
|
||||
psraw m0, 5
|
||||
psraw m1, 5
|
||||
packuswb m0, m0
|
||||
packuswb m1, m1
|
||||
pavgb m0, [r2]
|
||||
pavgb m1, [r2+r4]
|
||||
op_%1h m0, [r0], m4
|
||||
op_%1h m1, [r0+r3], m5
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
PIXELS4_L2_SHIFT5 put
|
||||
PIXELS4_L2_SHIFT5 avg
|
||||
|
||||
|
||||
%macro PIXELS8_L2_SHIFT5 1
|
||||
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
|
||||
movsxdifnidn r3, r3d
|
||||
movsxdifnidn r4, r4d
|
||||
.loop:
|
||||
mova m0, [r1]
|
||||
mova m1, [r1+8]
|
||||
mova m2, [r1+48]
|
||||
mova m3, [r1+48+8]
|
||||
psraw m0, 5
|
||||
psraw m1, 5
|
||||
psraw m2, 5
|
||||
psraw m3, 5
|
||||
packuswb m0, m1
|
||||
packuswb m2, m3
|
||||
pavgb m0, [r2]
|
||||
pavgb m2, [r2+r4]
|
||||
op_%1 m0, [r0], m4
|
||||
op_%1 m2, [r0+r3], m5
|
||||
lea r2, [r2+2*r4]
|
||||
add r1, 48*2
|
||||
lea r0, [r0+2*r3]
|
||||
sub r5d, 2
|
||||
jne .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
PIXELS8_L2_SHIFT5 put
|
||||
PIXELS8_L2_SHIFT5 avg
|
||||
|
||||
|
||||
%if ARCH_X86_64
|
||||
%macro QPEL16_H_LOWPASS_L2_OP 1
|
||||
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
|
||||
movsxdifnidn r3, r3d
|
||||
movsxdifnidn r4, r4d
|
||||
mov r5d, 16
|
||||
pxor m15, m15
|
||||
mova m14, [pw_5]
|
||||
mova m13, [pw_16]
|
||||
.loop:
|
||||
lddqu m1, [r1+6]
|
||||
lddqu m7, [r1-2]
|
||||
mova m0, m1
|
||||
punpckhbw m1, m15
|
||||
punpcklbw m0, m15
|
||||
punpcklbw m7, m15
|
||||
mova m2, m1
|
||||
mova m6, m0
|
||||
mova m3, m1
|
||||
mova m8, m0
|
||||
mova m4, m1
|
||||
mova m9, m0
|
||||
mova m12, m0
|
||||
mova m11, m1
|
||||
palignr m11, m0, 10
|
||||
palignr m12, m7, 10
|
||||
palignr m4, m0, 2
|
||||
palignr m9, m7, 2
|
||||
palignr m3, m0, 4
|
||||
palignr m8, m7, 4
|
||||
palignr m2, m0, 6
|
||||
palignr m6, m7, 6
|
||||
paddw m11, m0
|
||||
palignr m1, m0, 8
|
||||
palignr m0, m7, 8
|
||||
paddw m7, m12
|
||||
paddw m2, m3
|
||||
paddw m6, m8
|
||||
paddw m1, m4
|
||||
paddw m0, m9
|
||||
psllw m2, 2
|
||||
psllw m6, 2
|
||||
psubw m2, m1
|
||||
psubw m6, m0
|
||||
paddw m11, m13
|
||||
paddw m7, m13
|
||||
pmullw m2, m14
|
||||
pmullw m6, m14
|
||||
lddqu m3, [r2]
|
||||
paddw m2, m11
|
||||
paddw m6, m7
|
||||
psraw m2, 5
|
||||
psraw m6, 5
|
||||
packuswb m6, m2
|
||||
pavgb m6, m3
|
||||
op_%1 m6, [r0], m11
|
||||
add r1, r3
|
||||
add r0, r3
|
||||
add r2, r4
|
||||
dec r5d
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
QPEL16_H_LOWPASS_L2_OP put
|
||||
QPEL16_H_LOWPASS_L2_OP avg
|
||||
%endif

320
externals/ffmpeg/libavcodec/x86/h264_weight.asm
vendored
Executable file
@@ -0,0 +1,320 @@
|
||||
;*****************************************************************************
|
||||
;* SSE2-optimized weighted prediction code
|
||||
;*****************************************************************************
|
||||
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
|
||||
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; biweight pred:
|
||||
;
|
||||
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
|
||||
; int height, int log2_denom, int weightd,
|
||||
; int weights, int offset);
|
||||
; and
|
||||
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
|
||||
; int log2_denom, int weight, int offset);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro WEIGHT_SETUP 0
    add        r5, r5
    inc        r5
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro
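
; As read from the setup above: m3 = weight splatted to words, m6 = log2_denom,
; and m5 = ((2*offset+1) << log2_denom) >> 1, i.e. the offset pre-shifted plus
; a 0.5 rounding term. WEIGHT_OP below therefore computes, per pixel,
;   dst = clip_uint8((src*weight + (offset << log2_denom) + round) >> log2_denom)
; which matches the H.264 explicit weighted-prediction formula.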
|
||||
|
||||
%macro WEIGHT_OP 2
|
||||
movh m0, [r0+%1]
|
||||
movh m1, [r0+%2]
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
pmullw m0, m3
|
||||
pmullw m1, m3
|
||||
paddsw m0, m5
|
||||
paddsw m1, m5
|
||||
psraw m0, m6
|
||||
psraw m1, m6
|
||||
packuswb m0, m1
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
cglobal h264_weight_16, 6, 6, 0
|
||||
WEIGHT_SETUP
|
||||
.nextrow:
|
||||
WEIGHT_OP 0, 4
|
||||
mova [r0 ], m0
|
||||
WEIGHT_OP 8, 12
|
||||
mova [r0+8], m0
|
||||
add r0, r1
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
%macro WEIGHT_FUNC_MM 2
|
||||
cglobal h264_weight_%1, 6, 6, %2
|
||||
WEIGHT_SETUP
|
||||
.nextrow:
|
||||
WEIGHT_OP 0, mmsize/2
|
||||
mova [r0], m0
|
||||
add r0, r1
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
WEIGHT_FUNC_MM 8, 0
|
||||
INIT_XMM sse2
|
||||
WEIGHT_FUNC_MM 16, 8
|
||||
|
||||
%macro WEIGHT_FUNC_HALF_MM 2
|
||||
cglobal h264_weight_%1, 6, 6, %2
|
||||
WEIGHT_SETUP
|
||||
sar r2d, 1
|
||||
lea r3, [r1*2]
|
||||
.nextrow:
|
||||
WEIGHT_OP 0, r1
|
||||
movh [r0], m0
|
||||
%if mmsize == 16
|
||||
movhps [r0+r1], m0
|
||||
%else
|
||||
psrlq m0, 32
|
||||
movh [r0+r1], m0
|
||||
%endif
|
||||
add r0, r3
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
WEIGHT_FUNC_HALF_MM 4, 0
|
||||
INIT_XMM sse2
|
||||
WEIGHT_FUNC_HALF_MM 8, 8
|
||||
|
||||
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
    %define off_regd r7d
%else
    %define off_regd r3d
%endif
    mov        off_regd, r7m
    add        off_regd, 1
    or         off_regd, 1
    add        r4d, 1
    cmp        r6d, 128
    je .nonnormal
    cmp        r5d, 128
    jne .normal
.nonnormal:
    sar        r5d, 1
    sar        r6d, 1
    sar        off_regd, 1
    sub        r4d, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro
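
; The .nonnormal path (taken when either weight is 128) halves both weights,
; the offset and the shift -- presumably because a 128 weight would push the
; signed 16-bit pmullw/paddsw arithmetic in BIWEIGHT_STEPA into saturation.
; The (offset+1)|1 encoding keeps the packed rounding constant odd so the
; final >>(log2_denom+1) rounds to nearest.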
|
||||
|
||||
%macro BIWEIGHT_STEPA 3
|
||||
movh m%1, [r0+%3]
|
||||
movh m%2, [r1+%3]
|
||||
punpcklbw m%1, m7
|
||||
punpcklbw m%2, m7
|
||||
pmullw m%1, m3
|
||||
pmullw m%2, m4
|
||||
paddsw m%1, m%2
|
||||
%endmacro
|
||||
|
||||
%macro BIWEIGHT_STEPB 0
|
||||
paddsw m0, m5
|
||||
paddsw m1, m5
|
||||
psraw m0, m6
|
||||
psraw m1, m6
|
||||
packuswb m0, m1
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
cglobal h264_biweight_16, 7, 8, 0
|
||||
BIWEIGHT_SETUP
|
||||
movifnidn r3d, r3m
|
||||
.nextrow:
|
||||
BIWEIGHT_STEPA 0, 1, 0
|
||||
BIWEIGHT_STEPA 1, 2, 4
|
||||
BIWEIGHT_STEPB
|
||||
mova [r0], m0
|
||||
BIWEIGHT_STEPA 0, 1, 8
|
||||
BIWEIGHT_STEPA 1, 2, 12
|
||||
BIWEIGHT_STEPB
|
||||
mova [r0+8], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
%macro BIWEIGHT_FUNC_MM 2
|
||||
cglobal h264_biweight_%1, 7, 8, %2
|
||||
BIWEIGHT_SETUP
|
||||
movifnidn r3d, r3m
|
||||
.nextrow:
|
||||
BIWEIGHT_STEPA 0, 1, 0
|
||||
BIWEIGHT_STEPA 1, 2, mmsize/2
|
||||
BIWEIGHT_STEPB
|
||||
mova [r0], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
BIWEIGHT_FUNC_MM 8, 0
|
||||
INIT_XMM sse2
|
||||
BIWEIGHT_FUNC_MM 16, 8
|
||||
|
||||
%macro BIWEIGHT_FUNC_HALF_MM 2
|
||||
cglobal h264_biweight_%1, 7, 8, %2
|
||||
BIWEIGHT_SETUP
|
||||
movifnidn r3d, r3m
|
||||
sar r3, 1
|
||||
lea r4, [r2*2]
|
||||
.nextrow:
|
||||
BIWEIGHT_STEPA 0, 1, 0
|
||||
BIWEIGHT_STEPA 1, 2, r2
|
||||
BIWEIGHT_STEPB
|
||||
movh [r0], m0
|
||||
%if mmsize == 16
|
||||
movhps [r0+r2], m0
|
||||
%else
|
||||
psrlq m0, 32
|
||||
movh [r0+r2], m0
|
||||
%endif
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
BIWEIGHT_FUNC_HALF_MM 4, 0
|
||||
INIT_XMM sse2
|
||||
BIWEIGHT_FUNC_HALF_MM 8, 8
|
||||
|
||||
%macro BIWEIGHT_SSSE3_OP 0
|
||||
pmaddubsw m0, m4
|
||||
pmaddubsw m2, m4
|
||||
paddsw m0, m5
|
||||
paddsw m2, m5
|
||||
psraw m0, m6
|
||||
psraw m2, m6
|
||||
packuswb m0, m2
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal h264_biweight_16, 7, 8, 8
|
||||
BIWEIGHT_SETUP
|
||||
movifnidn r3d, r3m
|
||||
|
||||
.nextrow:
|
||||
movh m0, [r0]
|
||||
movh m2, [r0+8]
|
||||
movh m3, [r1+8]
|
||||
punpcklbw m0, [r1]
|
||||
punpcklbw m2, m3
|
||||
BIWEIGHT_SSSE3_OP
|
||||
mova [r0], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal h264_biweight_8, 7, 8, 8
|
||||
BIWEIGHT_SETUP
|
||||
movifnidn r3d, r3m
|
||||
sar r3, 1
|
||||
lea r4, [r2*2]
|
||||
|
||||
.nextrow:
|
||||
movh m0, [r0]
|
||||
movh m1, [r1]
|
||||
movh m2, [r0+r2]
|
||||
movh m3, [r1+r2]
|
||||
punpcklbw m0, m1
|
||||
punpcklbw m2, m3
|
||||
BIWEIGHT_SSSE3_OP
|
||||
movh [r0], m0
|
||||
movhps [r0+r2], m0
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET

284
externals/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
vendored
Executable file
@@ -0,0 +1,284 @@
|
||||
;*****************************************************************************
|
||||
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2005-2011 x264 project
|
||||
;*
|
||||
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
sq_1: dq 1
|
||||
dq 0
|
||||
|
||||
cextern pw_1
|
||||
cextern pw_1023
|
||||
%define pw_pixel_max pw_1023
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
|
||||
; int log2_denom, int weight, int offset);
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro WEIGHT_PROLOGUE 0
|
||||
.prologue:
|
||||
PROLOGUE 0,6,8
|
||||
movifnidn r0, r0mp
|
||||
movifnidn r1d, r1m
|
||||
movifnidn r2d, r2m
|
||||
movifnidn r4d, r4m
|
||||
movifnidn r5d, r5m
|
||||
%endmacro
|
||||
|
||||
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2        ; 1<<log2_denom
    SPLATW     m0, m0
    shl        r5, 19        ; *8, move to upper half of dword
    lea        r5, [r5+r4*2+0x10000]
    movd       m3, r5d       ; weight<<1 | 1+(offset<<3)
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]    ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro
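
; The 10-bit path folds the whole multiply-accumulate into one pmaddwd:
; WEIGHT_OP interleaves each pixel with the constant 1<<log2_denom (m0), and
; m3 packs weight*2 in the low word with offset*8+1 in the high word, so each
; dword accumulates pix*2*weight + (offset*8+1)<<log2_denom; psrad by
; log2_denom+1 (m2) then yields the weighted, offset and rounded sample.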
|
||||
|
||||
%macro WEIGHT_OP 1-2
|
||||
%if %0==1
|
||||
mova m5, [r0+%1]
|
||||
punpckhwd m6, m5, m0
|
||||
punpcklwd m5, m0
|
||||
%else
|
||||
movq m5, [r0+%1]
|
||||
movq m6, [r0+%2]
|
||||
punpcklwd m5, m0
|
||||
punpcklwd m6, m0
|
||||
%endif
|
||||
pmaddwd m5, m3
|
||||
pmaddwd m6, m3
|
||||
psrad m5, m2
|
||||
psrad m6, m2
|
||||
%if cpuflag(sse4)
|
||||
packusdw m5, m6
|
||||
pminsw m5, m4
|
||||
%else
|
||||
packssdw m5, m6
|
||||
CLIPW m5, m7, m4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro WEIGHT_FUNC_DBL 0
|
||||
cglobal h264_weight_16_10
|
||||
WEIGHT_PROLOGUE
|
||||
WEIGHT_SETUP
|
||||
.nextrow:
|
||||
WEIGHT_OP 0
|
||||
mova [r0 ], m5
|
||||
WEIGHT_OP 16
|
||||
mova [r0+16], m5
|
||||
add r0, r1
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
WEIGHT_FUNC_DBL
|
||||
INIT_XMM sse4
|
||||
WEIGHT_FUNC_DBL
|
||||
|
||||
|
||||
%macro WEIGHT_FUNC_MM 0
|
||||
cglobal h264_weight_8_10
|
||||
WEIGHT_PROLOGUE
|
||||
WEIGHT_SETUP
|
||||
.nextrow:
|
||||
WEIGHT_OP 0
|
||||
mova [r0], m5
|
||||
add r0, r1
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
WEIGHT_FUNC_MM
|
||||
INIT_XMM sse4
|
||||
WEIGHT_FUNC_MM
|
||||
|
||||
|
||||
%macro WEIGHT_FUNC_HALF_MM 0
|
||||
cglobal h264_weight_4_10
|
||||
WEIGHT_PROLOGUE
|
||||
sar r2d, 1
|
||||
WEIGHT_SETUP
|
||||
lea r3, [r1*2]
|
||||
.nextrow:
|
||||
WEIGHT_OP 0, r1
|
||||
movh [r0], m5
|
||||
movhps [r0+r1], m5
|
||||
add r0, r3
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
WEIGHT_FUNC_HALF_MM
|
||||
INIT_XMM sse4
|
||||
WEIGHT_FUNC_HALF_MM
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
|
||||
; int height, int log2_denom, int weightd,
|
||||
; int weights, int offset);
|
||||
;-----------------------------------------------------------------------------
|
||||
%if ARCH_X86_32
|
||||
DECLARE_REG_TMP 3
|
||||
%else
|
||||
DECLARE_REG_TMP 7
|
||||
%endif
|
||||
|
||||
%macro BIWEIGHT_PROLOGUE 0
|
||||
.prologue:
|
||||
PROLOGUE 0,8,8
|
||||
movifnidn r0, r0mp
|
||||
movifnidn r1, r1mp
|
||||
movifnidn r2d, r2m
|
||||
movifnidn r5d, r5m
|
||||
movifnidn r6d, r6m
|
||||
movifnidn t0d, r7m
|
||||
%endmacro
|
||||
|
||||
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1
    shl        r6, 16
    or         r5, r6
    movd       m4, r5d      ; weightd | weights
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn  r3d, r3m
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro
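
; Same pmaddwd layout for the bidirectional case: m4 pairs (weights, weightd)
; against interleaved (src, dst) words, m5 carries the odd rounding/offset
; constant (((offset<<2)+1)|1) << log2_denom, and psrad by log2_denom+1 (m6)
; finishes the sum of the two weighted sources.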
|
||||
|
||||
%macro BIWEIGHT 1-2
|
||||
%if %0==1
|
||||
mova m0, [r0+%1]
|
||||
mova m1, [r1+%1]
|
||||
punpckhwd m2, m0, m1
|
||||
punpcklwd m0, m1
|
||||
%else
|
||||
movq m0, [r0+%1]
|
||||
movq m1, [r1+%1]
|
||||
punpcklwd m0, m1
|
||||
movq m2, [r0+%2]
|
||||
movq m1, [r1+%2]
|
||||
punpcklwd m2, m1
|
||||
%endif
|
||||
pmaddwd m0, m4
|
||||
pmaddwd m2, m4
|
||||
paddd m0, m5
|
||||
paddd m2, m5
|
||||
psrad m0, m6
|
||||
psrad m2, m6
|
||||
%if cpuflag(sse4)
|
||||
packusdw m0, m2
|
||||
pminsw m0, m3
|
||||
%else
|
||||
packssdw m0, m2
|
||||
CLIPW m0, m7, m3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro BIWEIGHT_FUNC_DBL 0
|
||||
cglobal h264_biweight_16_10
|
||||
BIWEIGHT_PROLOGUE
|
||||
BIWEIGHT_SETUP
|
||||
.nextrow:
|
||||
BIWEIGHT 0
|
||||
mova [r0 ], m0
|
||||
BIWEIGHT 16
|
||||
mova [r0+16], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
BIWEIGHT_FUNC_DBL
|
||||
INIT_XMM sse4
|
||||
BIWEIGHT_FUNC_DBL
|
||||
|
||||
%macro BIWEIGHT_FUNC 0
|
||||
cglobal h264_biweight_8_10
|
||||
BIWEIGHT_PROLOGUE
|
||||
BIWEIGHT_SETUP
|
||||
.nextrow:
|
||||
BIWEIGHT 0
|
||||
mova [r0], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
BIWEIGHT_FUNC
|
||||
INIT_XMM sse4
|
||||
BIWEIGHT_FUNC
|
||||
|
||||
%macro BIWEIGHT_FUNC_HALF 0
|
||||
cglobal h264_biweight_4_10
|
||||
BIWEIGHT_PROLOGUE
|
||||
BIWEIGHT_SETUP
|
||||
sar r3d, 1
|
||||
lea r4, [r2*2]
|
||||
.nextrow:
|
||||
BIWEIGHT 0, r2
|
||||
movh [r0 ], m0
|
||||
movhps [r0+r2], m0
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
BIWEIGHT_FUNC_HALF
|
||||
INIT_XMM sse4
|
||||
BIWEIGHT_FUNC_HALF

117
externals/ffmpeg/libavcodec/x86/h264chroma_init.c
vendored
Executable file
@@ -0,0 +1,117 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264chroma.h"

void ff_put_h264_chroma_mc8_rnd_mmx   (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext    (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       ptrdiff_t stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)

av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
{
    int high_bit_depth = bit_depth > 8;
    int cpu_flags      = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
    }

    if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }

    if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
    }
}
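
/*
 * Illustrative usage, assuming the generic ff_h264chroma_init() wrapper in
 * h264chroma.c (not shown here) that ends up calling this init function:
 *
 *     H264ChromaContext c;
 *     ff_h264chroma_init(&c, 8);
 *     c.put_h264_chroma_pixels_tab[0](dst, src, stride, 8, mx, my);
 *
 * Later assignments deliberately override earlier ones, so the fastest
 * supported instruction set (e.g. SSSE3 over MMX) wins for each slot.
 */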

448
externals/ffmpeg/libavcodec/x86/h264dsp_init.c
vendored
Executable file
@@ -0,0 +1,448 @@
|
||||
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"

/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT)                                  \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                       int16_t *block,  \
                                                       int stride);

IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)


#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)                         \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT      \
    (uint8_t *dst, const int *block_offset,                             \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)


#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)                        \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT      \
    (uint8_t **dst, const int *block_offset,                            \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)

IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)

IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)

void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);

/***********************************/
/* deblocking */

void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
                                         int8_t ref[2][40],
                                         int16_t mv[2][40][2],
                                         int bidir, int edges, int step,
                                         int mask_mv0, int mask_mv1, int field);

#define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               ptrdiff_t stride, \
                                                               int alpha,     \
                                                               int beta,      \
                                                               int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT)                                       \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               ptrdiff_t stride, \
                                                               int alpha,     \
                                                               int beta);

#define LF_FUNCS(type, depth)                   \
LF_FUNC(h, chroma, depth, mmxext)               \
LF_IFUNC(h, chroma_intra, depth, mmxext)        \
LF_FUNC(h, chroma422, depth, mmxext)            \
LF_IFUNC(h, chroma422_intra, depth, mmxext)     \
LF_FUNC(v, chroma, depth, mmxext)               \
LF_IFUNC(v, chroma_intra, depth, mmxext)        \
LF_FUNC(h, luma, depth, mmxext)                 \
LF_IFUNC(h, luma_intra, depth, mmxext)          \
LF_FUNC(h, luma, depth, sse2)                   \
LF_IFUNC(h, luma_intra, depth, sse2)            \
LF_FUNC(v, luma, depth, sse2)                   \
LF_IFUNC(v, luma_intra, depth, sse2)            \
LF_FUNC(h, chroma, depth, sse2)                 \
LF_IFUNC(h, chroma_intra, depth, sse2)          \
LF_FUNC(h, chroma422, depth, sse2)              \
LF_IFUNC(h, chroma422_intra, depth, sse2)       \
LF_FUNC(v, chroma, depth, sse2)                 \
LF_IFUNC(v, chroma_intra, depth, sse2)          \
LF_FUNC(h, luma, depth, avx)                    \
LF_IFUNC(h, luma_intra, depth, avx)             \
LF_FUNC(v, luma, depth, avx)                    \
LF_IFUNC(v, luma_intra, depth, avx)             \
LF_FUNC(h, chroma, depth, avx)                  \
LF_IFUNC(h, chroma_intra, depth, avx)           \
LF_FUNC(h, chroma422, depth, avx)               \
LF_IFUNC(h, chroma422_intra, depth, avx)        \
LF_FUNC(v, chroma, depth, avx)                  \
LF_IFUNC(v, chroma_intra, depth, avx)

LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)

LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)

#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
                                    int beta, int8_t *tc0)
{
    if ((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
    if ((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
                                          int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
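
/*
 * Note (editor): the mmxext kernel only covers 8 pixels at a time, so the
 * wrappers above call it twice to filter a full 16-pixel luma edge.  A
 * negative tc0 value marks a 4-pixel segment as "do not filter"; the
 * bitwise AND of two tc0 values is negative only when both are negative,
 * so each 8-pixel half is skipped only when both of its segments are
 * disabled.
 */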

LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)

/***********************************/
/* weighted prediction */

#define H264_WEIGHT(W, OPT)                                             \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride,   \
                                      int height, int log2_denom,       \
                                      int weight, int offset);

#define H264_BIWEIGHT(W, OPT)                                           \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
                                        ptrdiff_t stride, int height,   \
                                        int log2_denom, int weightd,    \
                                        int weights, int offset);

#define H264_BIWEIGHT_MMX(W)                    \
H264_WEIGHT(W, mmxext)                          \
H264_BIWEIGHT(W, mmxext)

#define H264_BIWEIGHT_MMX_SSE(W)                \
H264_BIWEIGHT_MMX(W)                            \
H264_WEIGHT(W, sse2)                            \
H264_BIWEIGHT(W, sse2)                          \
H264_BIWEIGHT(W, ssse3)

H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)
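
/*
 * Note (editor): these macros only generate prototypes for the weighted
 * prediction kernels defined in h264_weight.asm.  For reference, the
 * scalar operation is roughly
 *     dst[x] = av_clip_uint8(((dst[x] * weight +
 *                              (1 << (log2_denom - 1))) >> log2_denom) + offset)
 * with the biweight variants blending dst and src using two weights.
 */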

#define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
                                                    ptrdiff_t stride,   \
                                                    int height,         \
                                                    int log2_denom,     \
                                                    int weight,         \
                                                    int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                      uint8_t *src,     \
                                                      ptrdiff_t stride, \
                                                      int height,       \
                                                      int log2_denom,   \
                                                      int weightd,      \
                                                      int weights,      \
                                                      int offset);

#define H264_BIWEIGHT_10_SSE(W, DEPTH)          \
H264_WEIGHT_10(W, DEPTH, sse2)                  \
H264_WEIGHT_10(W, DEPTH, sse4)                  \
H264_BIWEIGHT_10(W, DEPTH, sse2)                \
H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)

av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                 const int chroma_format_idc)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(cpu_flags)) {
            c->h264_idct_dc_add  =
            c->h264_idct_add     = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add =
            c->h264_idct8_add    = ff_h264_idct8_add_8_mmx;

            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
            if (chroma_format_idc <= 1) {
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
            } else {
                c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
            }
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
            if (cpu_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
        }
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
            c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
            c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
            } else {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_mmxext;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
            }
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
            c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
            c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct8_add = ff_h264_idct8_add_8_sse2;

            c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
            c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;

            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;

            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;

#if ARCH_X86_64
            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
#endif

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_sse2;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_sse2;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
            } else {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_sse2;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
            }

            c->h264_idct_add    = ff_h264_idct_add_8_sse2;
            c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#if ARCH_X86_64
            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
#endif

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_avx;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_avx;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
            } else {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
            }

            c->h264_idct_add    = ff_h264_idct_add_8_avx;
            c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
            }
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_mmxext;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_mmxext;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
            c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct_add     = ff_h264_idct_add_10_sse2;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;

            c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
            if (chroma_format_idc <= 1) {
                c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
            } else {
                c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
            }
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
            c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */

            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
            }
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
        }
        if (EXTERNAL_SSE4(cpu_flags)) {
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->h264_idct_dc_add  =
            c->h264_idct_add     = ff_h264_idct_add_10_avx;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;

            c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
            if (chroma_format_idc <= 1) {
                c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
            } else {
                c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
            }
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
            c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
            }
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_avx;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_avx;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
        }
    }
#endif
}
369
externals/ffmpeg/libavcodec/x86/hevc_add_res.asm
vendored
Executable file
@@ -0,0 +1,369 @@
; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define max_pixels_10 pw_1023

; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
%macro ADD_RES_MMX_4_8 0
    mova      m0, [r1]
    mova      m2, [r1+8]

    movd      m1, [r0]
    movd      m3, [r0+r2]
    punpcklbw m1, m4
    punpcklbw m3, m4

    paddsw    m0, m1
    paddsw    m2, m3
    packuswb  m0, m4
    packuswb  m2, m4

    movd    [r0], m0
    movd [r0+r2], m2
%endmacro
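
; Note (editor): for reference, the scalar equivalent of this macro is
;     dst[x] = av_clip_uint8(dst[x] + res[x])
; applied to two 4-pixel rows per invocation; the caller advances r0/r1
; and invokes it again for the remaining two rows of the 4x4 block.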


INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
    pxor m4, m4
    ADD_RES_MMX_4_8
    add r1, 16
    lea r0, [r0+r2*2]
    ADD_RES_MMX_4_8
    RET

%macro ADD_RES_SSE_8_8 0
    movq      m0, [r0]
    movq      m1, [r0+r2]
    punpcklbw m0, m4
    punpcklbw m1, m4
    mova      m2, [r1]
    mova      m3, [r1+16]
    paddsw    m0, m2
    paddsw    m1, m3
    packuswb  m0, m1

    movq      m2, [r0+r2*2]
    movq      m3, [r0+r3]
    punpcklbw m2, m4
    punpcklbw m3, m4
    mova      m6, [r1+32]
    mova      m7, [r1+48]
    paddsw    m2, m6
    paddsw    m3, m7
    packuswb  m2, m3

    movq        [r0], m0
    movhps   [r0+r2], m0
    movq   [r0+r2*2], m2
    movhps   [r0+r3], m2
%endmacro

%macro ADD_RES_SSE_16_32_8 3
    mova      m1, [%2]
    mova      m2, m1
    punpcklbw m1, m0
    punpckhbw m2, m0
    mova     xm5, [r1+%1]
    mova     xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128 m5, m5, [r1+%1+32], 1
    vinserti128 m6, m6, [r1+%1+48], 1
%endif
    paddsw    m1, m5
    paddsw    m2, m6

    mova      m3, [%3]
    mova      m4, m3
    punpcklbw m3, m0
    punpckhbw m4, m0
    mova     xm5, [r1+%1+mmsize*2]
    mova     xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128 m5, m5, [r1+%1+96], 1
    vinserti128 m6, m6, [r1+%1+112], 1
%endif
    paddsw    m3, m5
    paddsw    m4, m6

    packuswb  m1, m2
    packuswb  m3, m4
    mova    [%2], m1
    mova    [%3], m3
%endmacro


%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
    pxor m4, m4
    lea  r3, [r2*3]
    ADD_RES_SSE_8_8
    add  r1, 64
    lea  r0, [r0+r2*4]
    ADD_RES_SSE_8_8
    RET

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
    pxor  m0, m0
    lea   r3, [r2*3]
    mov  r4d, 4
.loop:
    ADD_RES_SSE_16_32_8  0, r0, r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    add   r1, 128
    lea   r0, [r0+r2*4]
    dec  r4d
    jg   .loop
    RET

; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor  m0, m0
    mov  r4d, 16
.loop:
    ADD_RES_SSE_16_32_8  0, r0, r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
    add   r1, 128
    lea   r0, [r0+r2*2]
    dec  r4d
    jg   .loop
    RET
%endmacro
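
; Note (editor): the INIT_XMM lines below re-instantiate the macro for each
; instruction set; x86inc's cglobal appends the active suffix, so the two
; expansions emit the _sse2 and _avx entry points from a single macro body.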

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor  m0, m0
    lea   r3, [r2*3]
    mov  r4d, 8
.loop:
    ADD_RES_SSE_16_32_8   0, r0, r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
    add  r1, 256
    lea  r0, [r0+r2*4]
    dec r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL

%macro ADD_RES_SSE_8_10 4
    mova  m0, [%4]
    mova  m1, [%4+16]
    mova  m2, [%4+32]
    mova  m3, [%4+48]
    paddw m0, [%1+0]
    paddw m1, [%1+%2]
    paddw m2, [%1+%2*2]
    paddw m3, [%1+%3]
    CLIPW m0, m4, m5
    CLIPW m1, m4, m5
    CLIPW m2, m4, m5
    CLIPW m3, m4, m5
    mova [%1+0],    m0
    mova [%1+%2],   m1
    mova [%1+%2*2], m2
    mova [%1+%3],   m3
%endmacro

%macro ADD_RES_MMX_4_10 3
    mova  m0, [%1+0]
    mova  m1, [%1+%2]
    paddw m0, [%3]
    paddw m1, [%3+8]
    CLIPW m0, m2, m3
    CLIPW m1, m2, m3
    mova [%1+0],  m0
    mova [%1+%2], m1
%endmacro

%macro ADD_RES_SSE_16_10 3
    mova  m0, [%3]
    mova  m1, [%3+16]
    mova  m2, [%3+32]
    mova  m3, [%3+48]
    paddw m0, [%1]
    paddw m1, [%1+16]
    paddw m2, [%1+%2]
    paddw m3, [%1+%2+16]
    CLIPW m0, m4, m5
    CLIPW m1, m4, m5
    CLIPW m2, m4, m5
    CLIPW m3, m4, m5
    mova [%1],       m0
    mova [%1+16],    m1
    mova [%1+%2],    m2
    mova [%1+%2+16], m3
%endmacro

%macro ADD_RES_SSE_32_10 2
    mova  m0, [%2]
    mova  m1, [%2+16]
    mova  m2, [%2+32]
    mova  m3, [%2+48]

    paddw m0, [%1]
    paddw m1, [%1+16]
    paddw m2, [%1+32]
    paddw m3, [%1+48]
    CLIPW m0, m4, m5
    CLIPW m1, m4, m5
    CLIPW m2, m4, m5
    CLIPW m3, m4, m5
    mova [%1],    m0
    mova [%1+16], m1
    mova [%1+32], m2
    mova [%1+48], m3
%endmacro

%macro ADD_RES_AVX2_16_10 4
    mova  m0, [%4]
    mova  m1, [%4+32]
    mova  m2, [%4+64]
    mova  m3, [%4+96]

    paddw m0, [%1+0]
    paddw m1, [%1+%2]
    paddw m2, [%1+%2*2]
    paddw m3, [%1+%3]

    CLIPW m0, m4, m5
    CLIPW m1, m4, m5
    CLIPW m2, m4, m5
    CLIPW m3, m4, m5
    mova [%1+0],    m0
    mova [%1+%2],   m1
    mova [%1+%2*2], m2
    mova [%1+%3],   m3
%endmacro

%macro ADD_RES_AVX2_32_10 3
    mova  m0, [%3]
    mova  m1, [%3+32]
    mova  m2, [%3+64]
    mova  m3, [%3+96]

    paddw m0, [%1]
    paddw m1, [%1+32]
    paddw m2, [%1+%2]
    paddw m3, [%1+%2+32]

    CLIPW m0, m4, m5
    CLIPW m1, m4, m5
    CLIPW m2, m4, m5
    CLIPW m3, m4, m5
    mova [%1],       m0
    mova [%1+32],    m1
    mova [%1+%2],    m2
    mova [%1+%2+32], m3
%endmacro

; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual_4_10, 3, 3, 6
    pxor m2, m2
    mova m3, [max_pixels_10]
    ADD_RES_MMX_4_10 r0, r2, r1
    add r1, 16
    lea r0, [r0+2*r2]
    ADD_RES_MMX_4_10 r0, r2, r1
    RET

INIT_XMM sse2
cglobal hevc_add_residual_8_10, 3, 4, 6
    pxor m4, m4
    mova m5, [max_pixels_10]
    lea  r3, [r2*3]

    ADD_RES_SSE_8_10 r0, r2, r3, r1
    lea  r0, [r0+r2*4]
    add  r1, 64
    ADD_RES_SSE_8_10 r0, r2, r3, r1
    RET

cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor m4, m4
    mova m5, [max_pixels_10]

    mov r4d, 8
.loop:
    ADD_RES_SSE_16_10 r0, r2, r1
    lea  r0, [r0+r2*2]
    add  r1, 64
    dec r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor m4, m4
    mova m5, [max_pixels_10]

    mov r4d, 32
.loop:
    ADD_RES_SSE_32_10 r0, r1
    lea  r0, [r0+r2]
    add  r1, 64
    dec r4d
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor m4, m4
    mova m5, [max_pixels_10]
    lea  r3, [r2*3]

    mov r4d, 4
.loop:
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
    lea  r0, [r0+r2*4]
    add  r1, 128
    dec r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor m4, m4
    mova m5, [max_pixels_10]

    mov r4d, 16
.loop:
    ADD_RES_AVX2_32_10 r0, r2, r1
    lea  r0, [r0+r2*2]
    add  r1, 128
    dec r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL
871
externals/ffmpeg/libavcodec/x86/hevc_deblock.asm
vendored
Executable file
@@ -0,0 +1,871 @@
;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max_10 pw_1023
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_m2:           times 8 dw -2
pd_1:            times 4 dd 1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd m0, %1
    movd m2, %2
    movd m1, %3
    movd m3, %4

    punpcklbw m0, m2
    punpcklbw m1, m3
    punpcklwd m0, m1

    movd m4, %5
    movd m6, %6
    movd m5, %7
    movd m3, %8

    punpcklbw m4, m6
    punpcklbw m5, m3
    punpcklwd m4, m5

    punpckhdq m2, m0, m4
    punpckldq m0, m4

    pxor m5, m5
    punpckhbw m1, m0, m5
    punpcklbw m0, m5
    punpckhbw m3, m2, m5
    punpcklbw m2, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb m0, m2
    packuswb m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd %1, m0
    pshufd m0, m0, 0x39
    movd %2, m0
    pshufd m0, m0, 0x39
    movd %3, m0
    pshufd m0, m0, 0x39
    movd %4, m0

    movd %5, m1
    pshufd m1, m1, 0x39
    movd %6, m1
    pshufd m1, m1, 0x39
    movd %7, m1
    pshufd m1, m1, 0x39
    movd %8, m1
%endmacro
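
; Note (editor): pshufd with immediate 0x39 rotates the four dwords right by
; one position, so each movd/pshufd pair peels off the next 4-byte row
; without needing any extra scratch registers.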

; in: 8 rows of 4 words in %4..%11
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq m0, %1
    movq m2, %2
    movq m1, %3
    movq m3, %4

    punpcklwd m0, m2
    punpcklwd m1, m3
    punpckhdq m2, m0, m1
    punpckldq m0, m1

    movq m4, %5
    movq m6, %6
    movq m5, %7
    movq m3, %8

    punpcklwd m4, m6
    punpcklwd m5, m3
    punpckhdq m6, m4, m5
    punpckldq m4, m5

    punpckhqdq m1, m0, m4
    punpcklqdq m0, m4
    punpckhqdq m3, m2, m6
    punpcklqdq m2, m6

%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    pxor m5, m5; zeros reg
    CLIPW m0, m5, %9
    CLIPW m1, m5, %9
    CLIPW m2, m5, %9
    CLIPW m3, m5, %9

    movq   %1, m0
    movhps %2, m0
    movq   %3, m1
    movhps %4, m1
    movq   %5, m2
    movhps %6, m2
    movq   %7, m3
    movhps %8, m3
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq m7, %1
    movq m2, %2
    movq m1, %3
    movq m3, %4

    punpcklbw m7, m2
    punpcklbw m1, m3
    punpcklwd m3, m7, m1
    punpckhwd m7, m1

    movq m4, %5
    movq m6, %6
    movq m5, %7
    movq m15, %8

    punpcklbw m4, m6
    punpcklbw m5, m15
    punpcklwd m9, m4, m5
    punpckhwd m4, m5

    punpckldq m1, m3, m9; 0, 1
    punpckhdq m3, m9; 2, 3

    punpckldq m5, m7, m4; 4, 5
    punpckhdq m7, m4; 6, 7

    pxor m13, m13

    punpcklbw m0, m1, m13; 0 in 16 bit
    punpckhbw m1, m13; 1 in 16 bit

    punpcklbw m2, m3, m13; 2
    punpckhbw m3, m13; 3

    punpcklbw m4, m5, m13; 4
    punpckhbw m5, m13; 5

    punpcklbw m6, m7, m13; 6
    punpckhbw m7, m13; 7
%endmacro


; in: 8 rows of 8 words in m0..m8
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb m0, m4
    packuswb m1, m5
    packuswb m2, m6
    packuswb m3, m7
    TRANSPOSE2x4x4B 0, 1, 2, 3, 4

    movq   %1, m0
    movhps %2, m0
    movq   %3, m1
    movhps %4, m1
    movq   %5, m2
    movhps %6, m2
    movq   %7, m3
    movhps %8, m3
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu m0, %1
    movdqu m1, %2
    movdqu m2, %3
    movdqu m3, %4
    movdqu m4, %5
    movdqu m5, %6
    movdqu m6, %7
    movdqu m7, %8
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m8
; out: 8 rows of 8 words in %1..%8
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor m8, m8
    CLIPW m0, m8, %9
    CLIPW m1, m8, %9
    CLIPW m2, m8, %9
    CLIPW m3, m8, %9
    CLIPW m4, m8, %9
    CLIPW m5, m8, %9
    CLIPW m6, m8, %9
    CLIPW m7, m8, %9

    movdqu %1, m0
    movdqu %2, m1
    movdqu %3, m2
    movdqu %4, m3
    movdqu %5, m4
    movdqu %6, m5
    movdqu %7, m6
    movdqu %8, m7
%endmacro


; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
%macro MASKED_COPY 2
    pand  %2, m11 ; and mask
    pandn m10, m11, %1; and -mask
    por   %2, m10
    mova  %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand  %2, %3 ; and mask
    pandn %3, %1; and -mask
    por   %2, %3
    mova  %1, %2
%endmacro
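
; Note (editor): both macros implement a branch-free select,
;     %1 = (%2 AND mask) OR (%1 AND NOT mask)
; so filtered pixel values are committed only where the deblocking mask is set.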

ALIGN 16
; input in m0 ... m3 and tcs in r2. Output in m1 and m2
%macro CHROMA_DEBLOCK_BODY 1
    psubw m4, m2, m1; q0 - p0
    psubw m5, m0, m3; p1 - q1
    psllw m4, 2; << 2
    paddw m5, m4;

    ;tc calculations
    movq m6, [tcq]; tc0
    punpcklwd m6, m6
    pshufd m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw m5, [pw_4]; +4
    psraw m5, 3; >> 3

%if %1 > 8
    psllw m4, %1-8; << (BIT_DEPTH - 8)
    psllw m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw m5, m4
    pminsw m5, m6
    paddw m1, m5; p0 + delta0
    psubw m2, m5; q0 - delta0
%endmacro
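
; Note (editor): this is the HEVC chroma filter in vector form; per pixel it
; computes
;     delta = av_clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc)
; and then applies p0 += delta and q0 -= delta.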

; input in m0 ... m7, beta in r2 tcs in r3. Output in m1...m6
%macro LUMA_DEBLOCK_BODY 2
    psllw m9, m2, 1; *2
    psubw m10, m1, m9
    paddw m10, m3
    ABS1  m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw m9, m5, 1; *2
    psubw m11, m6, m9
    paddw m11, m4
    ABS1  m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl betaq, %1 - 8
%endif
    movd m13, betad
    SPLATW m13, m13, 0
    ;end beta calculations

    paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3

    pshufhw m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
    pshuflw m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low

    pshufhw m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw m15, m13, m14
    movmskps r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    test r13, r13
    je .bypassluma

    ;weak / strong decision compare to beta_2
    psraw m15, m13, 2;   beta >> 2
    psllw m8, m9, 1;
    pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps r6, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd m8, m10, 0x31
    psrld  m8, 16
    paddw  m8, m10
    movd   r7d, m8
    pshufd m8, m8, 0x4E
    movd   r8d, m8

    pshufd m8, m11, 0x31
    psrld  m8, 16
    paddw  m8, m11
    movd   r9d, m8
    pshufd m8, m8, 0x4E
    movd   r10d, m8
    ; end calc for weak filter

    ; filtering mask
    mov  r11, r13
    shr  r11, 3
    movd m15, r11d
    and  r13, 1
    movd m11, r13d
    shufps m11, m15, 0
    shl  r11, 1
    or   r13, r11

    pcmpeqd m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov r11d, [tcq];
%if %1 > 8
    shl r11, %1 - 8
%endif
    movd m8, r11d; tc0
    mov r3d, [tcq+4];
%if %1 > 8
    shl r3, %1 - 8
%endif
    add r11d, r3d; tc0 + tc1
    jz .bypassluma
    movd m9, r3d; tc1
    punpcklwd m8, m8
    punpcklwd m9, m9
    shufps m8, m9, 0; tc0, tc1
    mova m9, m8
    psllw m8, 2; tc << 2
    pavgw m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw m12, m0, m3; p3 - p0
    ABS1  m12, m14; abs(p3 - p0)

    psubw m15, m7, m4; q3 - q0
    ABS1  m15, m14; abs(q3 - q0)

    paddw m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw m12, m12, 0xf0 ;0b11110000;
    pshuflw m12, m12, 0xf0 ;0b11110000;

    psraw   m13, 3; beta >> 3
    pcmpgtw m13, m12;
    movmskps r11, m13;
    and r6, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw m12, m3, m4; p0 - q0
    ABS1  m12, m14; abs(p0 - q0)

    pshufhw m12, m12, 0xf0 ;0b11110000;
    pshuflw m12, m12, 0xf0 ;0b11110000;

    pcmpgtw m8, m12; tc25 comparisons
    movmskps r11, m8;
    and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov r11, r6;
    shr r11, 1;
    and r6, r11; strong mask, bits 2 and 0

    pmullw m14, m9, [pw_m2]; -tc * 2
    paddw  m9, m9

    and r6, 5; 0b101
    mov r11, r6; strong mask
    shr r6, 2;
    movd m12, r6d; store to xmm for mask generation
    shl r6, 1
    and r11, 1
    movd m10, r11d; store to xmm for mask generation
    or  r6, r11; final strong mask, bits 1 and 0
    jz .weakfilter

    shufps  m10, m12, 0
    pcmpeqd m10, [pd_1]; strong mask

    mova m13, [pw_4]; 4 in every cell
    pand m11, m10; combine filtering mask and strong mask
    paddw m12, m2, m3;          p1 +   p0
    paddw m12, m4;              p1 +   p0 +   q0
    mova  m10, m12; copy
    paddw m12, m12;           2*p1 + 2*p0 + 2*q0
    paddw m12, m1;       p2 + 2*p1 + 2*p0 + 2*q0
    paddw m12, m5;       p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw m12, m13;      p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw m12, 3;      ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw m12, m3;    ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw m12, m14
    pminsw m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw  m12, m3; p0'

    paddw m15, m1, m10; p2 + p1 + p0 + q0
    psrlw m13, 1; 2 in every cell
    paddw m15, m13; p2 + p1 + p0 + q0 + 2
    psraw m15, 2; (p2 + p1 + p0 + q0 + 2) >> 2
    psubw m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw m15, m14
    pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw  m15, m2; p1'

    paddw m8, m1, m0;     p3 +   p2
    paddw m8, m8;       2*p3 + 2*p2
    paddw m8, m1;       2*p3 + 3*p2
    paddw m8, m10;      2*p3 + 3*p2 + p1 + p0 + q0
    paddw m13, m13
    paddw m8, m13;      2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw m8, m14
    pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw  m8, m1; p2'
    MASKED_COPY m1, m8

    paddw m8, m3, m4;      p0 + q0
    paddw m8, m5;          p0 + q0 + q1
    paddw m8, m8;        2*p0 + 2*q0 + 2*q1
    paddw m8, m2;   p1 + 2*p0 + 2*q0 + 2*q1
    paddw m8, m6;   p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw m8, m13;  p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw m8, 3;   (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
    psubw m8, m4;
    pmaxsw m8, m14
    pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw  m8, m4; q0'
    MASKED_COPY m2, m15

    paddw m15, m3, m4;   p0 + q0
    paddw m15, m5;       p0 + q0 + q1
    mova  m10, m15;
    paddw m15, m6;       p0 + q0 + q1 + q2
    psrlw m13, 1; 2 in every cell
    paddw m15, m13;      p0 + q0 + q1 + q2 + 2
    psraw m15, 2;       (p0 + q0 + q1 + q2 + 2) >> 2
    psubw m15, m5;     ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw m15, m14
    pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw  m15, m5; q1'

    paddw m13, m7;      q3 + 2
    paddw m13, m6;      q3 + q2 + 2
    paddw m13, m13;   2*q3 + 2*q2 + 4
    paddw m13, m6;    2*q3 + 3*q2 + 4
    paddw m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw m13, m14
    pminsw m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw  m13, m6; q2'

    MASKED_COPY m6, m13
    MASKED_COPY m5, m15
    MASKED_COPY m4, m8
    MASKED_COPY m3, m12

.weakfilter:
    not r6; strong mask -> weak mask
    and r6, r13; final weak filtering mask, bits 0 and 1
    jz .store

    ; weak filtering mask
    mov r11, r6
    shr r11, 1
    movd m12, r11d
    and r6, 1
    movd m11, r6d
    shufps m11, m12, 0
    pcmpeqd m11, [pd_1]; filtering mask

    mov r13, betaq
    shr r13, 1;
    add betaq, r13
    shr betaq, 3; ((beta + (beta >> 1)) >> 3))

    mova m13, [pw_8]
    psubw m12, m4, m3 ; q0 - p0
    psllw m10, m12, 3; 8 * (q0 - p0)
    paddw m12, m10 ; 9 * (q0 - p0)

    psubw m10, m5, m2 ; q1 - p1
    psllw m8, m10, 1; 2 * ( q1 - p1 )
    paddw m10, m8; 3 * ( q1 - p1 )
    psubw m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw m12, m13; + 8
    psraw m12, 4; >> 4 , delta0
    PABSW m13, m12; abs(delta0)


    psllw m10, m9, 2; 8 * tc
    paddw m10, m9; 10 * tc
    pcmpgtw m10, m13
    pand m11, m10

    psraw m9, 1;   tc * 2 -> tc
    psraw m14, 1; -tc * 2 -> -tc

    pmaxsw m12, m14
    pminsw m12, m9;  av_clip(delta0, -tc, tc)

    psraw m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw m14, m9, [pw_m1]; -tc / 2
%else
    pmullw m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw m15, m14
    pminsw m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw  m15, m2; p1'

    ;beta calculations
    movd m10, betad
    SPLATW m10, m10, 0

    movd m13, r7d; 1dp0 + 1dp3
    movd m8, r8d; 0dp0 + 0dp3
    punpcklwd m8, m8
    punpcklwd m13, m13
    shufps m13, m8, 0;
    pcmpgtw m8, m10, m13
    pand m8, m11
    ;end beta calculations
    MASKED_COPY2 m2, m15, m8; write p1'

    pavgw m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw m8, 1;  ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw m8, m14
    pminsw m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw  m8, m5; q1'

    movd m13, r9d;
    movd m15, r10d;
    punpcklwd m15, m15
    punpcklwd m13, m13
    shufps m13, m15, 0; dq0 + dq3

    pcmpgtw m10, m13; compare to ((beta+(beta>>1))>>3)
    pand m10, m11
    MASKED_COPY2 m5, m8, m10; write q1'

    paddw m15, m3, m12 ; p0 + delta0
    MASKED_COPY m3, m15

    psubw m8, m4, m12 ; q0 - delta0
    MASKED_COPY m4, m8
%endmacro
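
; Note (editor): this follows the HEVC spec's two-tier luma filter: the d0/d3
; activity sums are tested against beta to enable filtering at all, then the
; beta_2, beta_3 and tc25 comparisons select strong filtering (which rewrites
; p2..q2) over the weak path (delta0 applied to p0/q0, plus optional p1/q1
; corrections gated by the per-side thresholds).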

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub pixq, 2
    lea r3strideq, [3*strideq]
    mov pix0q, pixq
    add pixq, r3strideq
    TRANSPOSE4x8B_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub pixq, 4
    lea r3strideq, [3*strideq]
    mov pix0q, pixq
    add pixq, r3strideq
    TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub pixq, 4
    lea r3strideq, [3*strideq]
    mov pix0q, pixq
    add pixq, r3strideq
    TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov pix0q, pixq
    sub pix0q, strideq
    sub pix0q, strideq
    movq m0, [pix0q]; p1
    movq m1, [pix0q+strideq]; p0
    movq m2, [pixq]; q0
    movq m3, [pixq+strideq]; q1
    pxor m5, m5; zeros reg
    punpcklbw m0, m5
    punpcklbw m1, m5
    punpcklbw m2, m5
    punpcklbw m3, m5
    CHROMA_DEBLOCK_BODY 8
    packuswb m1, m2
    movh   [pix0q+strideq], m1
    movhps [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov pix0q, pixq
    sub pix0q, strideq
    sub pix0q, strideq
    movu m0, [pix0q]; p1
    movu m1, [pix0q+strideq]; p0
    movu m2, [pixq]; q0
    movu m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor m5, m5; zeros reg
    CLIPW m1, m5, [pw_pixel_max_10]
    CLIPW m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov pix0q, pixq
    sub pix0q, strideq
    sub pix0q, strideq
    movu m0, [pix0q]; p1
    movu m1, [pix0q+strideq]; p0
    movu m2, [pixq]; q0
    movu m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor m5, m5; zeros reg
    CLIPW m1, m5, [pw_pixel_max_12]
    CLIPW m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub pixq, 4
    lea pix0q, [3 * r1]
    mov src3strideq, pixq
    add pixq, pix0q
    TRANSPOSE8x8B_LOAD PASS8ROWS(src3strideq, pixq, r1, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub pixq, 8
    lea pix0q, [3 * strideq]
    mov src3strideq, pixq
    add pixq, pix0q
    TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub pixq, 8
    lea pix0q, [3 * strideq]
    mov src3strideq, pixq
    add pixq, pix0q
    TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea src3strideq, [3 * strideq]
    mov pix0q, pixq
    sub pix0q, src3strideq
    sub pix0q, strideq
    movq m0, [pix0q]; p3
    movq m1, [pix0q + strideq]; p2
    movq m2, [pix0q + 2 * strideq]; p1
    movq m3, [pix0q + src3strideq]; p0
    movq m4, [pixq]; q0
    movq m5, [pixq + strideq]; q1
    movq m6, [pixq + 2 * strideq]; q2
    movq m7, [pixq + src3strideq]; q3
    pxor m8, m8
    punpcklbw m0, m8
    punpcklbw m1, m8
    punpcklbw m2, m8
    punpcklbw m3, m8
    punpcklbw m4, m8
    punpcklbw m5, m8
    punpcklbw m6, m8
    punpcklbw m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb m1, m2
    packuswb m3, m4
    packuswb m5, m6
    movh   [pix0q + strideq], m1
    movhps [pix0q + 2 * strideq], m1
    movh   [pix0q + src3strideq], m3
    movhps [pixq               ], m3
    movh   [pixq + strideq], m5
    movhps [pixq + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea src3strideq, [3 * strideq]
    mov pix0q, pixq
    sub pix0q, src3strideq
    sub pix0q, strideq
    movdqu m0, [pix0q]; p3
    movdqu m1, [pix0q + strideq]; p2
    movdqu m2, [pix0q + 2 * strideq]; p1
    movdqu m3, [pix0q + src3strideq]; p0
    movdqu m4, [pixq]; q0
    movdqu m5, [pixq + strideq]; q1
    movdqu m6, [pixq + 2 * strideq]; q2
    movdqu m7, [pixq + src3strideq]; q3
    LUMA_DEBLOCK_BODY 10, h
.store:
    pxor m8, m8; zeros reg
    CLIPW m1, m8, [pw_pixel_max_10]
    CLIPW m2, m8, [pw_pixel_max_10]
    CLIPW m3, m8, [pw_pixel_max_10]
    CLIPW m4, m8, [pw_pixel_max_10]
    CLIPW m5, m8, [pw_pixel_max_10]
    CLIPW m6, m8, [pw_pixel_max_10]
    movdqu [pix0q + strideq], m1;  p2
    movdqu [pix0q + 2 * strideq], m2;  p1
    movdqu [pix0q + src3strideq], m3;  p0
    movdqu [pixq               ], m4;  q0
    movdqu [pixq + strideq], m5;  q1
    movdqu [pixq + 2 * strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea src3strideq, [3 * strideq]
    mov pix0q, pixq
    sub pix0q, src3strideq
    sub pix0q, strideq
    movdqu m0, [pix0q]; p3
    movdqu m1, [pix0q + strideq]; p2
    movdqu m2, [pix0q + 2 * strideq]; p1
    movdqu m3, [pix0q + src3strideq]; p0
    movdqu m4, [pixq]; q0
    movdqu m5, [pixq + strideq]; q1
    movdqu m6, [pixq + 2 * strideq]; q2
    movdqu m7, [pixq + src3strideq]; q3
    LUMA_DEBLOCK_BODY 12, h
.store:
    pxor m8, m8; zeros reg
    CLIPW m1, m8, [pw_pixel_max_12]
    CLIPW m2, m8, [pw_pixel_max_12]
    CLIPW m3, m8, [pw_pixel_max_12]
    CLIPW m4, m8, [pw_pixel_max_12]
    CLIPW m5, m8, [pw_pixel_max_12]
    CLIPW m6, m8, [pw_pixel_max_12]
    movdqu [pix0q + strideq], m1;  p2
    movdqu [pix0q + 2 * strideq], m2;  p1
    movdqu [pix0q + src3strideq], m3;  p0
    movdqu [pixq               ], m4;  q0
    movdqu [pixq + strideq], m5;  q1
    movdqu [pixq + 2 * strideq], m6;  q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif
853
externals/ffmpeg/libavcodec/x86/hevc_idct.asm
vendored
Executable file
@@ -0,0 +1,853 @@
;*******************************************************************************
|
||||
;* SIMD-optimized IDCT functions for HEVC decoding
|
||||
;* Copyright (c) 2014 Pierre-Edouard LEPERE
|
||||
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pd_64:   times 4 dd 64
pd_2048: times 4 dd 2048
pd_512:  times 4 dd 512

; 4x4 transform coeffs
cextern pw_64
pw_64_m64: times 4 dw 64, -64
pw_83_36:  times 4 dw 83, 36
pw_36_m83: times 4 dw 36, -83

; 8x8 transform coeffs
pw_89_75:   times 4 dw 89, 75
pw_50_18:   times 4 dw 50, 18

pw_75_m18:  times 4 dw 75, -18
pw_m89_m50: times 4 dw -89, -50

pw_50_m89:  times 4 dw 50, -89
pw_18_75:   times 4 dw 18, 75

pw_18_m50:  times 4 dw 18, -50
pw_75_m89:  times 4 dw 75, -89

; 16x16 transformation coeffs
trans_coeffs16: times 4 dw 90, 87
        times 4 dw 80, 70
        times 4 dw 57, 43
        times 4 dw 25, 9

        times 4 dw 87, 57
        times 4 dw 9, -43
        times 4 dw -80, -90
        times 4 dw -70, -25

        times 4 dw 80, 9
        times 4 dw -70, -87
        times 4 dw -25, 57
        times 4 dw 90, 43

        times 4 dw 70, -43
        times 4 dw -87, 9
        times 4 dw 90, 25
        times 4 dw -80, -57

        times 4 dw 57, -80
        times 4 dw -25, 90
        times 4 dw -9, -87
        times 4 dw 43, 70

        times 4 dw 43, -90
        times 4 dw 57, 25
        times 4 dw -87, 70
        times 4 dw 9, -80

        times 4 dw 25, -70
        times 4 dw 90, -80
        times 4 dw 43, 9
        times 4 dw -57, 87

        times 4 dw 9, -25
        times 4 dw 43, -57
        times 4 dw 70, -80
        times 4 dw 87, -90

; 32x32 transform coeffs
trans_coeff32: times 8 dw 90
        times 4 dw 88, 85
        times 4 dw 82, 78
        times 4 dw 73, 67
        times 4 dw 61, 54
        times 4 dw 46, 38
        times 4 dw 31, 22
        times 4 dw 13, 4

        times 4 dw 90, 82
        times 4 dw 67, 46
        times 4 dw 22, -4
        times 4 dw -31, -54
        times 4 dw -73, -85
        times 4 dw -90, -88
        times 4 dw -78, -61
        times 4 dw -38, -13

        times 4 dw 88, 67
        times 4 dw 31, -13
        times 4 dw -54, -82
        times 4 dw -90, -78
        times 4 dw -46, -4
        times 4 dw 38, 73
        times 4 dw 90, 85
        times 4 dw 61, 22

        times 4 dw 85, 46
        times 4 dw -13, -67
        times 4 dw -90, -73
        times 4 dw -22, 38
        times 4 dw 82, 88
        times 4 dw 54, -4
        times 4 dw -61, -90
        times 4 dw -78, -31

        times 4 dw 82, 22
        times 4 dw -54, -90
        times 4 dw -61, 13
        times 4 dw 78, 85
        times 4 dw 31, -46
        times 4 dw -90, -67
        times 4 dw 4, 73
        times 4 dw 88, 38

        times 4 dw 78, -4
        times 4 dw -82, -73
        times 4 dw 13, 85
        times 4 dw 67, -22
        times 4 dw -88, -61
        times 4 dw 31, 90
        times 4 dw 54, -38
        times 4 dw -90, -46

        times 4 dw 73, -31
        times 4 dw -90, -22
        times 4 dw 78, 67
        times 4 dw -38, -90
        times 4 dw -13, 82
        times 4 dw 61, -46
        times 4 dw -88, -4
        times 4 dw 85, 54

        times 4 dw 67, -54
        times 4 dw -78, 38
        times 4 dw 85, -22
        times 4 dw -90, 4
        times 4 dw 90, 13
        times 4 dw -88, -31
        times 4 dw 82, 46
        times 4 dw -73, -61

        times 4 dw 61, -73
        times 4 dw -46, 82
        times 4 dw 31, -88
        times 4 dw -13, 90
        times 4 dw -4, -90
        times 4 dw 22, 85
        times 4 dw -38, -78
        times 4 dw 54, 67

        times 4 dw 54, -85
        times 4 dw -4, 88
        times 4 dw -46, -61
        times 4 dw 82, 13
        times 4 dw -90, 38
        times 4 dw 67, -78
        times 4 dw -22, 90
        times 4 dw -31, -73

        times 4 dw 46, -90
        times 4 dw 38, 54
        times 4 dw -90, 31
        times 4 dw 61, -88
        times 4 dw 22, 67
        times 4 dw -85, 13
        times 4 dw 73, -82
        times 4 dw 4, 78

        times 4 dw 38, -88
        times 4 dw 73, -4
        times 4 dw -67, 90
        times 4 dw -46, -31
        times 4 dw 85, -78
        times 4 dw 13, 61
        times 4 dw -90, 54
        times 4 dw 22, -82

        times 4 dw 31, -78
        times 4 dw 90, -61
        times 4 dw 4, 54
        times 4 dw -88, 82
        times 4 dw -38, -22
        times 4 dw 73, -90
        times 4 dw 67, -13
        times 4 dw -46, 85

        times 4 dw 22, -61
        times 4 dw 85, -90
        times 4 dw 73, -38
        times 4 dw -4, 46
        times 4 dw -78, 90
        times 4 dw -82, 54
        times 4 dw -13, -31
        times 4 dw 67, -88

        times 4 dw 13, -38
        times 4 dw 61, -78
        times 4 dw 88, -90
        times 4 dw 85, -73
        times 4 dw 54, -31
        times 4 dw 4, 22
        times 4 dw -46, 67
        times 4 dw -82, 90

        times 4 dw 4, -13
        times 4 dw 22, -31
        times 4 dw 38, -46
        times 4 dw 54, -61
        times 4 dw 67, -73
        times 4 dw 78, -82
        times 4 dw 85, -88
        times 4 dw 90, -90

SECTION .text

; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
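;
; Editorial note (a sketch, not upstream text): the add/sar pair below folds
; HEVC's two DC rounding steps into one. In scalar terms,
;   dc = (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
; which is arithmetically the same as the usual two-step formulation
;   dc = (((coeffs[0] + 1) >> 1) + (1 << (13 - bitdepth))) >> (14 - bitdepth)
; The splatted dc value is then stored over the whole HxW block.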
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    movsx tmpd, word [coeffq]
    add   tmpd, (1 << (14 - %3)) + 1
    sar   tmpd, (15 - %3)
    movd  xm0, tmpd
    SPLATW m0, xm0
    DEFINE_ARGS coeff, cnt
    mov cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    add coeffq, mmsize*8
    mova [coeffq+mmsize*-4], m0
    mova [coeffq+mmsize*-3], m0
    mova [coeffq+mmsize*-2], m0
    mova [coeffq+mmsize*-1], m0
    dec cntd
    jg .loop
    RET
%endmacro

; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx tmpd, word [coeffq]
    add   tmpd, (1 << (14 - %2)) + 1
    sar   tmpd, (15 - %2)
    movd  m0, tmpd
    SPLATW m0, xm0
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro

; IDCT 4x4, expects input in m0, m1
; %1 - shift
; %2 - 1/0 - SCALE and Transpose or not
; %3 - 1/0 add constant or not
%macro TR_4x4 3
    ; interleaves src0 with src2 to m0
    ;         and src1 with src3 to m1
    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
    ; src1: 10 11 12 13 -->
    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
    ; src3: 30 31 32 33

    SBUTTERFLY wd, 0, 1, 2

    pmaddwd m2, m0, [pw_64]    ; e0
    pmaddwd m3, m1, [pw_83_36] ; o0
    pmaddwd m0, [pw_64_m64]    ; e1
    pmaddwd m1, [pw_36_m83]    ; o1

%if %3 == 1
    %assign %%add 1 << (%1 - 1)
    mova  m4, [pd_ %+ %%add]
    paddd m2, m4
    paddd m0, m4
%endif

    SUMSUB_BADC d, 3, 2, 1, 0, 4

%if %2 == 1
    psrad m3, %1 ; e0 + o0
    psrad m1, %1 ; e1 + o1
    psrad m2, %1 ; e0 - o0
    psrad m0, %1 ; e1 - o1
    ;clip16
    packssdw m3, m1
    packssdw m0, m2
    ; Transpose
    SBUTTERFLY wd, 3, 0, 1
    SBUTTERFLY wd, 3, 0, 1
    SWAP 3, 1, 0
%else
    SWAP 3, 2, 0
%endif
%endmacro
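
; Editorial sketch of the 4-point butterfly above (not upstream text). With
; one input column s0..s3, the pmaddwd pairs compute, in scalar terms:
;   e0 = 64*s0 + 64*s2        o0 = 83*s1 + 36*s3
;   e1 = 64*s0 - 64*s2        o1 = 36*s1 - 83*s3
; and the SUMSUB step yields the outputs e0+o0, e1+o1, e1-o1, e0-o0, which
; are then rounded/shifted (and transposed when %2 == 1).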
%macro DEFINE_BIAS 1
    %assign shift (20 - %1)
    %assign c_add (1 << (shift - 1))
    %define arr_add pd_ %+ c_add
%endmacro

; %1 - bit_depth
; %2 - register the add constant is loaded to
; shift = 20 - bit_depth
%macro LOAD_BIAS 2
    DEFINE_BIAS %1
    mova %2, [arr_add]
%endmacro

; %1, %2 - registers to load packed 16 bit values to
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
%macro LOAD_BLOCK 7
    movq   %1, [r0 + %3 + %7]
    movhps %1, [r0 + %5 + %7]
    movq   %2, [r0 + %4 + %7]
    movhps %2, [r0 + %6 + %7]
%endmacro

; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_4x4 1
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
    mova m0, [coeffsq]
    mova m1, [coeffsq + 16]

    TR_4x4 7, 1, 1
    TR_4x4 20 - %1, 1, 1

    mova [coeffsq], m0
    mova [coeffsq + 16], m1
    RET
%endmacro
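
; Editorial note (a sketch, not upstream text): the 4x4 IDCT runs the
; butterfly twice, once per dimension. Pass 1 uses shift 7; pass 2 uses
; shift 20 - bitdepth (12 for 8-bit, 10 for 10-bit), matching the HEVC
; inverse-transform scaling. The larger transforms below follow the same
; two-pass pattern with a transpose in between.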
; scale, pack (clip16) and store the residuals      0 e8[0] + o8[0] --> + %1
; 4 at one time (4 columns)                         1 e8[1] + o8[1]
; from %5: e8/16 + o8/16, with %1 offset            ...
; and  %3: e8/16 - o8/16, with %2 offset            6 e8[1] - o8[1]
; %4 - shift                                        7 e8[0] - o8[0] --> + %2
%macro STORE_8 7
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq     [coeffsq + %1], %5
    movhps   [coeffsq + %2], %5
%endmacro

; %1 - horizontal offset
; %2 - shift
; %3, %4 - transform coeffs
; %5 - vertical offset for e8 + o8
; %6 - vertical offset for e8 - o8
; %7 - register with e8 inside
; %8 - block_size
; %9 - register to store e8 + o8
; %10 - register to store e8 - o8
%macro E8_O8 10
    pmaddwd m6, m4, %3
    pmaddwd m7, m5, %4

    paddd m6, m7
    paddd m7, m6, %7 ; o8 + e8
    psubd %7, m6     ; e8 - o8
%if %8 == 8
    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
%else
    SWAP m7, %9
    SWAP %7, %10
%endif
%endmacro

; 8x4 residuals are processed and stored
; %1 - horizontal offset
; %2 - shift
; %3 - offset of the even row
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
; %5 - offset of the odd row
; %6 - block size
; %7 - 1/0 add a constant in TR_4x4 or not
; the constant is added for the 8x8 transform but not for 16x16 and 32x32
%macro TR_8x4 7
    ; load 4 columns of even rows
    LOAD_BLOCK m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1

    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only

    ; load 4 columns of odd rows
    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1

    ; 00 01 02 03
    ; 10 11 12 13     m4: 10 30 11 31 12 32 13 33
    ; ...         -->
    ;                 m5: 50 70 51 71 52 72 53 73
    ; 70 71 72 73
    SBUTTERFLY wd, 4, 5, 6

    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8,  m15
    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9,  m14
    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
%endmacro
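
; Editorial sketch (not upstream text): for each output column the odd part
; of the 8-point transform is, in scalar terms,
;   o8[0] = 89*s1 + 75*s3 + 50*s5 + 18*s7
;   o8[1] = 75*s1 - 18*s3 - 89*s5 - 50*s7
;   o8[2] = 50*s1 - 89*s3 + 18*s5 + 75*s7
;   o8[3] = 18*s1 - 50*s3 + 75*s5 - 89*s7
; which is exactly what the four E8_O8 invocations above accumulate with
; pmaddwd on the interleaved (s1,s3) and (s5,s7) word pairs.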
%macro STORE_PACKED 7
    movq   [r0 + %3 + %7], %1
    movhps [r0 + %4 + %7], %1
    movq   [r0 + %5 + %7], %2
    movhps [r0 + %6 + %7], %2
%endmacro

; transpose 4x4 block packed
; in %1 and %2 registers
; %3 - temporary register
%macro TRANSPOSE_4x4 3
    SBUTTERFLY wd, %1, %2, %3
    SBUTTERFLY dq, %1, %2, %3
%endmacro

; %1 - horizontal offset of the block i
; %2 - vertical offset of the block i
; %3 - width in bytes
; %4 - vertical offset for the block j
; %5 - horizontal offset for the block j
%macro SWAP_BLOCKS 5
    ; M_j
    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
    TRANSPOSE_4x4 4, 5, 6

    ; M_i
    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; transpose and store M_i
    SWAP m6, m4
    SWAP m7, m5
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
%endmacro

; %1 - horizontal offset
; %2 - vertical offset of the block
; %3 - width in bytes
%macro TRANSPOSE_BLOCK 3
    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
%endmacro

%macro TRANSPOSE_8x8 0
cglobal hevc_idct_transpose_8x8, 0, 0, 0
    ; M1 M2 ^T = M1^t M3^t
    ; M3 M4      M2^t M4^t

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 16

    ; M2 and M3
    SWAP_BLOCKS 0, 64, 16, 0, 8

    ; M4
    TRANSPOSE_BLOCK 8, 64, 16

    ret
%endmacro

; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_8x8 1
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
    TR_8x4 0, 7, 32, 1, 16, 8, 1
    TR_8x4 8, 7, 32, 1, 16, 8, 1

    call hevc_idct_transpose_8x8_ %+ cpuname

    DEFINE_BIAS %1
    TR_8x4 0, shift, 32, 1, 16, 8, 1
    TR_8x4 8, shift, 32, 1, 16, 8, 1

    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
%endmacro
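
; Editorial note (a sketch, not upstream text): the 8x8 transpose works on
; 4x4 sub-blocks, using the identity
;   [ M1 M2 ]^T = [ M1^T M3^T ]
;   [ M3 M4 ]     [ M2^T M4^T ]
; so the diagonal blocks are transposed in place and each off-diagonal pair
; is transposed and swapped; the 16x16 and 32x32 transposes below apply the
; same scheme at a larger scale.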
; store intermediate e32 coeffs on stack
; as 16x4 matrix
; from m10: e8 + o8, with %6 offset
; and  %3:  e8 - o8, with %7 offset
; %4 - shift, unused here
%macro STORE_16 7
    mova [rsp + %6], %5
    mova [rsp + %7], %3
%endmacro

; %1, %2 - transform constants
; %3, %4 - regs with interleaved coeffs
; %5 - 1/0 SWAP or add
; %6, %7 - registers for intermediate sums
; %8 - accumulator register
%macro ADD_ROWS 8
    pmaddwd %6, %3, %1
    pmaddwd %7, %4, %2
    paddd   %6, %7
%if %5 == 1
    SWAP %6, %8
%else
    paddd %8, %6
%endif
%endmacro

; %1 - transform coeffs
; %2, %3 - offsets for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - add
; %6 - block_size
; %7 - register with e16
; %8, %9 - stack offsets for storing e+o/e-o
%macro E16_O16 9
    ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m5, m6, m7
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7

%if %6 == 8
    paddd %7, %5
%endif

    paddd m4, m7, %7 ; o16 + e16
    psubd %7, m7     ; e16 - o16
    STORE_%6 %2, %3, %7, %4, m4, %8, %9
%endmacro

%macro TR_16x4 10
    ; produce 8x4 matrix of e16 coeffs
    ; for 4 first rows and store it on stack (128 bytes)
    TR_8x4 %1, 7, %4, %5, %6, %8, 0

    ; load 8 even rows
    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1

    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4

    E16_O16 trans_coeffs16, 0 + %1, 15 * %6 + %1, %2, %3, %7, m8, 0, 15 * 16
    mova m8, %3
    E16_O16 trans_coeffs16 + 64, %6 + %1, 14 * %6 + %1, %2, m8, %7, m9, 16, 14 * 16
    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1, 9 * %6 + %1, %2, m8, %7, m14, 6 * 16, 9 * 16
    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1, 8 * %6 + %1, %2, m8, %7, m15, 7 * 16, 8 * 16
%endmacro
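
; Editorial sketch (not upstream text): a 16-point column splits into an even
; half, handled recursively by the TR_8x4 call above, and an odd half
;   o16[k] = sum over odd n of coeff16[k][n] * s[n],  n = 1, 3, ..., 15
; accumulated from trans_coeffs16 by the E16_O16 calls; the results combine
; as d[k] = e16[k] + o16[k] and d[15-k] = e16[k] - o16[k].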
%macro TRANSPOSE_16x16 0
cglobal hevc_idct_transpose_16x16, 0, 0, 0
    ; M1  M2  M3  M4 ^T      m1 m5 m9  m13     M_i^T = m_i
    ; M5  M6  M7  M8   -->   m2 m6 m10 m14
    ; M9  M10 M11 M12        m3 m7 m11 m15
    ; M13 M14 M15 M16        m4 m8 m12 m16

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 32

    ; M5, M2
    SWAP_BLOCKS 0, 128, 32, 0, 8
    ; M9, M3
    SWAP_BLOCKS 0, 256, 32, 0, 16
    ; M13, M4
    SWAP_BLOCKS 0, 384, 32, 0, 24

    ; M6
    TRANSPOSE_BLOCK 8, 128, 32

    ; M10, M7
    SWAP_BLOCKS 8, 256, 32, 128, 16
    ; M14, M8
    SWAP_BLOCKS 8, 384, 32, 128, 24

    ; M11
    TRANSPOSE_BLOCK 16, 256, 32

    ; M15, M12
    SWAP_BLOCKS 16, 384, 32, 256, 24

    ; M16
    TRANSPOSE_BLOCK 24, 384, 32

    ret
%endmacro

; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_16x16 1
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
    mov r1d, 3
.loop16:
    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
    dec r1d
    jge .loop16

    call hevc_idct_transpose_16x16_ %+ cpuname

    DEFINE_BIAS %1
    mov r1d, 3
.loop16_2:
    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
    dec r1d
    jge .loop16_2

    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
%endmacro
; scale, pack (clip16) and store the residuals      0  e32[0] + o32[0] --> %1
; 4 at one time (4 columns)                         1  e32[1] + o32[1]
; %1 - address to store e32 + o32
; %2 - address to store e32 - o32
; %5 - reg with e32 + o32                           ...
; %3 - reg with e32 - o32                           30 e32[1] - o32[1]
; %4 - shift                                        31 e32[0] - o32[0] --> %2
%macro STORE_32 5
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq     [%1], %5
    movhps   [%2], %5
%endmacro

; %1 - transform coeffs
; %2, %3 - offsets for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - stack offset of e32
%macro E32_O32 5
    ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m8, m9, m10
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10

    paddd m11, m14, [rsp + %5]
    paddd m12, m10, m11 ; o32 + e32
    psubd m11, m10      ; e32 - o32
    STORE_32 %2, %3, m11, %4, m12
%endmacro

; %1 - horizontal offset
; %2 - bitdepth
; %3 - 1/0: first pass (shift 7, round by pd_64) or second pass (bias from bitdepth)
%macro TR_32x4 3
    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0

    LOAD_BLOCK m0, m1, 64, 3 * 64, 5 * 64, 7 * 64, %1
    LOAD_BLOCK m2, m3, 9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1

    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8

%if %3 == 1
    %assign shift 7
    mova m14, [pd_64]
%else
    LOAD_BIAS %2, m14
%endif

    lea r2, [trans_coeff32 + 15 * 128]
    lea r3, [coeffsq + %1]
    lea r4, [r3 + 16 * 64]
    mov r5d, 15 * 16
%%loop:
    E32_O32 r2, r3 + r5 * 4, r4, shift, r5
    sub r2, 128
    add r4, 64
    sub r5d, 16
    jge %%loop
%endmacro
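
; Editorial sketch (not upstream text): a 32-point column again splits into
; an even half, computed by the TR_16x4 call above and parked on the stack,
; and an odd half built from the 16 odd rows against trans_coeff32:
;   o32[k] = sum over odd n of coeff32[k][n] * s[n],  n = 1, 3, ..., 31
; the %%loop then emits d[k] = e32[k] + o32[k] and d[31-k] = e32[k] - o32[k].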
%macro TRANSPOSE_32x32 0
cglobal hevc_idct_transpose_32x32, 0, 0, 0
    ; M0  M1 ... M7
    ; M8          M15
    ;
    ; ...
    ;
    ; M56         M63

    TRANSPOSE_BLOCK 0, 0, 64 ; M0
    mov r1d, 7
    mov r2d, 7 * 256
.loop_transpose:
    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
    sub r2d, 256
    dec r1d
    jg .loop_transpose

    TRANSPOSE_BLOCK 8, 256, 64 ; M9
    mov r1d, 6
    mov r2d, 512
    mov r3d, 16
.loop_transpose2:
    SWAP_BLOCKS 8, r2, 64, 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose2

    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
    mov r1d, 5
    mov r2d, 768
    mov r3d, 24
.loop_transpose3:
    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose3

    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
    mov r1d, 4
    mov r2d, 1024
    mov r3d, 32
.loop_transpose4:
    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose4

    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
    mov r1d, 3
    mov r2d, 1280
    mov r3d, 40
.loop_transpose5:
    SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose5

    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8

    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8

    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63

    ret
%endmacro
; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_32x32 1
cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
    mov r1d, 7
.loop32:
    TR_32x4 8 * r1, %1, 1
    dec r1d
    jge .loop32

    call hevc_idct_transpose_32x32_ %+ cpuname

    mov r1d, 7
.loop32_2:
    TR_32x4 8 * r1, %1, 0
    dec r1d
    jge .loop32_2

    TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
%endmacro

%macro INIT_IDCT_DC 1
INIT_MMX mmxext
IDCT_DC_NL 4, %1
IDCT_DC    8, 2, %1

INIT_XMM sse2
IDCT_DC_NL 8, %1
IDCT_DC    16, 4, %1
IDCT_DC    32, 16, %1

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16, 2, %1
IDCT_DC    32, 8, %1
%endif ;HAVE_AVX2_EXTERNAL
%endmacro

%macro INIT_IDCT 2
INIT_XMM %2
%if %1 == 8
    TRANSPOSE_8x8
%if ARCH_X86_64
    TRANSPOSE_16x16
    TRANSPOSE_32x32
%endif
%endif
%if ARCH_X86_64
    IDCT_32x32 %1
    IDCT_16x16 %1
%endif
    IDCT_8x8 %1
    IDCT_4x4 %1
%endmacro

INIT_IDCT_DC 8
INIT_IDCT_DC 10
INIT_IDCT_DC 12
INIT_IDCT 8, sse2
INIT_IDCT 8, avx
INIT_IDCT 10, sse2
INIT_IDCT 10, avx
;INIT_IDCT 12, sse2
;INIT_IDCT 12, avx
1672
externals/ffmpeg/libavcodec/x86/hevc_mc.asm
vendored
Executable file
File diff suppressed because it is too large
340
externals/ffmpeg/libavcodec/x86/hevc_sao.asm
vendored
Executable file
@@ -0,0 +1,340 @@
;******************************************************************************
;* SIMD optimized SAO functions for HEVC 8bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
pb_eo:           db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pb_1
cextern pb_2

SECTION .text

;******************************************************************************
;SAO Band Filter
;******************************************************************************

%macro HEVC_SAO_BAND_FILTER_INIT 0
    and  leftq, 31
    movd xm0, leftd
    add  leftq, 1
    and  leftq, 31
    movd xm1, leftd
    add  leftq, 1
    and  leftq, 31
    movd xm2, leftd
    add  leftq, 1
    and  leftq, 31
    movd xm3, leftd

    SPLATW m0, xm0
    SPLATW m1, xm1
    SPLATW m2, xm2
    SPLATW m3, xm3
%if mmsize > 16
    SPLATW m4, [offsetq + 2]
    SPLATW m5, [offsetq + 4]
    SPLATW m6, [offsetq + 6]
    SPLATW m7, [offsetq + 8]
%else
    movq m7, [offsetq + 2]
    SPLATW m4, m7, 0
    SPLATW m5, m7, 1
    SPLATW m6, m7, 2
    SPLATW m7, m7, 3
%endif

%if ARCH_X86_64
    pxor m14, m14

%else ; ARCH_X86_32
    mova [rsp+mmsize*0], m0
    mova [rsp+mmsize*1], m1
    mova [rsp+mmsize*2], m2
    mova [rsp+mmsize*3], m3
    mova [rsp+mmsize*4], m4
    mova [rsp+mmsize*5], m5
    mova [rsp+mmsize*6], m6
    pxor m0, m0
    %assign MMSIZE mmsize
    %define m14 m0
    %define m13 m1
    %define m9  m2
    %define m8  m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
    mov heightd, r7m
%endmacro
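
; Editorial sketch (not upstream text): SAO band offsets in scalar form, for
; 8-bit samples. Each sample picks band (v >> 3) out of 32; only the four
; consecutive bands starting at sao_left_class (mod 32) carry an offset:
;   band = v >> 3;
;   if (band == (left + k) & 31, for some k in 0..3) v += sao_offset_val[k + 1];
; m0..m3 hold the four band indices, m4..m7 the corresponding offsets.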
%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
    psraw %1, %2, 3
%if ARCH_X86_64
    pcmpeqw m10, %1, m0
    pcmpeqw m11, %1, m1
    pcmpeqw m12, %1, m2
    pcmpeqw %1, m3
    pand m10, m4
    pand m11, m5
    pand m12, m6
    pand %1, m7
    por m10, m11
    por m12, %1
    por m10, m12
    paddw %2, m10
%else ; ARCH_X86_32
    pcmpeqw m4, %1, [rsp+MMSIZE*0]
    pcmpeqw m5, %1, [rsp+MMSIZE*1]
    pcmpeqw m6, %1, [rsp+MMSIZE*2]
    pcmpeqw %1, [rsp+MMSIZE*3]
    pand m4, [rsp+MMSIZE*4]
    pand m5, [rsp+MMSIZE*5]
    pand m6, [rsp+MMSIZE*6]
    pand %1, m7
    por m4, m5
    por m6, %1
    por m4, m6
    paddw %2, m4
%endif ; ARCH
%endmacro
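
; Editorial note (a sketch, not upstream text): the compare/and/or ladder
; above is a branch-free select. pcmpeqw yields an all-ones word mask where
; the sample's band matches, pand turns each mask into "offset or 0", and the
; por chain merges the four candidates (at most one is non-zero) before paddw.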
;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
%macro HEVC_SAO_BAND_FILTER 2
cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
    HEVC_SAO_BAND_FILTER_INIT

align 16
.loop:
%if %1 == 8
    movq      m8, [srcq]
    punpcklbw m8, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    packuswb  m8, m14
    movq      [dstq], m8
%endif ; %1 == 8

%assign i 0
%rep %2
    mova      m13, [srcq + i]
    punpcklbw m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    punpckhbw m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb  m8, m13
    mova      [dstq + i], m8
%assign i i+mmsize
%endrep

%if %1 == 48
INIT_XMM cpuname

    mova      m13, [srcq + i]
    punpcklbw m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    punpckhbw m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb  m8, m13
    mova      [dstq + i], m8
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif ; %1 == 48

    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    REP_RET
%endmacro
%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER 8,  0
HEVC_SAO_BAND_FILTER 16, 1
HEVC_SAO_BAND_FILTER 32, 2
HEVC_SAO_BAND_FILTER 48, 2
HEVC_SAO_BAND_FILTER 64, 4
%endmacro

INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_BAND_FILTER 8,  0
HEVC_SAO_BAND_FILTER 16, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 32, 1
HEVC_SAO_BAND_FILTER 48, 1
HEVC_SAO_BAND_FILTER 64, 2
%endif

;******************************************************************************
;SAO Edge Filter
;******************************************************************************

%define MAX_PB_SIZE  64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE

%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
    movsxd eoq, dword eom
%elif ARCH_X86_64
    movsxd eoq, eod
%else
    mov eoq, r4m
%endif
    lea tmp2q, [pb_eo]
    movsx a_strideq, byte [tmp2q+eoq*4+1]
    movsx b_strideq, byte [tmp2q+eoq*4+3]
    imul a_strideq, EDGE_SRCSTRIDE
    imul b_strideq, EDGE_SRCSTRIDE
    movsx tmpq, byte [tmp2q+eoq*4]
    add a_strideq, tmpq
    movsx tmpq, byte [tmp2q+eoq*4+2]
    add b_strideq, tmpq
%endmacro
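
; Editorial note (a sketch, not upstream text): pb_eo stores, per edge-offset
; class, the (dx, dy) displacements of the two neighbours a and b:
;   eo 0: a = (-1, 0), b = (1, 0)     eo 1: a = (0, -1), b = (0, 1)
;   eo 2: a = (-1,-1), b = (1, 1)     eo 3: a = (1, -1), b = (-1, 1)
; a_strideq/b_strideq end up as dy * EDGE_SRCSTRIDE + dx, i.e. the byte
; offset from the current sample to each neighbour.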
%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
    pminub  m4, m1, m2
    pminub  m5, m1, m3
    pcmpeqb m2, m4
    pcmpeqb m3, m5
    pcmpeqb m4, m1
    pcmpeqb m5, m1
    psubb   m4, m2
    psubb   m5, m3
    paddb   m4, m6
    paddb   m4, m5

    pshufb  m2, m0, m4
%if %1 > 8
    punpckhbw m5, m7, m1
    punpckhbw m4, m2, m7
    punpcklbw m3, m7, m1
    punpcklbw m2, m7
    pmaddubsw m5, m4
    pmaddubsw m3, m2
    packuswb  m3, m5
%else
    punpcklbw m3, m7, m1
    punpcklbw m2, m7
    pmaddubsw m3, m2
    packuswb  m3, m3
%endif
%endmacro
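
; Editorial sketch (not upstream text): scalar equivalent of the
; classification above, with c the current sample and a/b its two neighbours:
;   edge_idx = 2 + sign(c - a) + sign(c - b);   /* 0..4 */
;   dst = clip_uint8(c + offset_for[edge_idx]);
; The pminub/pcmpeqb pairs build sign() without branches, pb_2 (m6) is the
; +2 bias, and pshufb picks the per-pixel offset from the pre-shuffled
; offset table held in m0.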
;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
;                                             int eo, int width, int height);
%macro HEVC_SAO_EDGE_FILTER 2-3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov heightd, r6m

%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
%define eoq     srcq
%define tmpq    heightq
%define tmp2q   dststrideq
%define offsetq heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov srcq, srcm
    mov offsetq, r3m
    mov dststrideq, dststridem
%endif ; ARCH

%if mmsize > 16
    vbroadcasti128 m0, [offsetq]
%else
    movu m0, [offsetq]
%endif
    mova m1, [pb_edge_shuffle]
    packsswb m0, m0
    mova m7, [pb_1]
    pshufb m0, m1
    mova m6, [pb_2]
%if ARCH_X86_32
    mov heightd, r6m
%endif

align 16
.loop:

%if %1 == 8
    movq m1, [srcq]
    movq m2, [srcq + a_strideq]
    movq m3, [srcq + b_strideq]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    movq [dstq], m3
%endif

%assign i 0
%rep %2
    mova m1, [srcq + i]
    movu m2, [srcq + a_strideq + i]
    movu m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mov%3 [dstq + i], m3
%assign i i+mmsize
%endrep

%if %1 == 48
INIT_XMM cpuname

    mova m1, [srcq + i]
    movu m2, [srcq + a_strideq + i]
    movu m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mova [dstq + i], m3
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif

    add dstq, dststrideq
    add srcq, EDGE_SRCSTRIDE
    dec heightd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
HEVC_SAO_EDGE_FILTER 8,  0
HEVC_SAO_EDGE_FILTER 16, 1, a
HEVC_SAO_EDGE_FILTER 32, 2, a
HEVC_SAO_EDGE_FILTER 48, 2, a
HEVC_SAO_EDGE_FILTER 64, 4, a

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 32, 1, a
HEVC_SAO_EDGE_FILTER 48, 1, u
HEVC_SAO_EDGE_FILTER 64, 2, a
%endif
370
externals/ffmpeg/libavcodec/x86/hevc_sao_10bit.asm
vendored
Executable file
@@ -0,0 +1,370 @@
;******************************************************************************
;* SIMD optimized SAO functions for HEVC 10/12bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pw_m2:     times 16 dw -2
pw_mask10: times 16 dw 0x03FF
pw_mask12: times 16 dw 0x0FFF
pb_eo:     db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pw_m1
cextern pw_1
cextern pw_2

SECTION .text

;******************************************************************************
;SAO Band Filter
;******************************************************************************

%macro HEVC_SAO_BAND_FILTER_INIT 1
    and  leftq, 31
    movd xm0, leftd
    add  leftq, 1
    and  leftq, 31
    movd xm1, leftd
    add  leftq, 1
    and  leftq, 31
    movd xm2, leftd
    add  leftq, 1
    and  leftq, 31
    movd xm3, leftd

    SPLATW m0, xm0
    SPLATW m1, xm1
    SPLATW m2, xm2
    SPLATW m3, xm3
%if mmsize > 16
    SPLATW m4, [offsetq + 2]
    SPLATW m5, [offsetq + 4]
    SPLATW m6, [offsetq + 6]
    SPLATW m7, [offsetq + 8]
%else
    movq m7, [offsetq + 2]
    SPLATW m4, m7, 0
    SPLATW m5, m7, 1
    SPLATW m6, m7, 2
    SPLATW m7, m7, 3
%endif

%if ARCH_X86_64
    mova m13, [pw_mask %+ %1]
    pxor m14, m14

%else ; ARCH_X86_32
    mova [rsp+mmsize*0], m0
    mova [rsp+mmsize*1], m1
    mova [rsp+mmsize*2], m2
    mova [rsp+mmsize*3], m3
    mova [rsp+mmsize*4], m4
    mova [rsp+mmsize*5], m5
    mova [rsp+mmsize*6], m6
    mova m1, [pw_mask %+ %1]
    pxor m0, m0
    %define m14 m0
    %define m13 m1
    %define m9  m2
    %define m8  m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
    mov heightd, r7m
%endmacro
;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
%macro HEVC_SAO_BAND_FILTER 3
cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
    HEVC_SAO_BAND_FILTER_INIT %1

align 16
.loop:

%assign i 0
%assign j 0
%rep %3
%assign k 8+(j&1)
%assign l 9-(j&1)
    mova  m %+ k, [srcq + i]
    psraw m %+ l, m %+ k, %1-5
%if ARCH_X86_64
    pcmpeqw m10, m %+ l, m0
    pcmpeqw m11, m %+ l, m1
    pcmpeqw m12, m %+ l, m2
    pcmpeqw m %+ l, m3
    pand m10, m4
    pand m11, m5
    pand m12, m6
    pand m %+ l, m7
    por m10, m11
    por m12, m %+ l
    por m10, m12
    paddw m %+ k, m10
%else ; ARCH_X86_32
    pcmpeqw m4, m %+ l, [rsp+mmsize*0]
    pcmpeqw m5, m %+ l, [rsp+mmsize*1]
    pcmpeqw m6, m %+ l, [rsp+mmsize*2]
    pcmpeqw m %+ l, [rsp+mmsize*3]
    pand m4, [rsp+mmsize*4]
    pand m5, [rsp+mmsize*5]
    pand m6, [rsp+mmsize*6]
    pand m %+ l, m7
    por m4, m5
    por m6, m %+ l
    por m4, m6
    paddw m %+ k, m4
%endif ; ARCH
    CLIPW m %+ k, m14, m13
    mova [dstq + i], m %+ k
%assign i i+mmsize
%assign j j+1
%endrep

    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd
    jg .loop
    REP_RET
%endmacro
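
; Editorial note (a sketch, not upstream text): samples here are already
; 16-bit, so the band index is v >> (bitdepth - 5) (the %1-5 shift) and the
; result is clipped to [0, (1 << bitdepth) - 1] via CLIPW against
; pw_mask10/pw_mask12. The k/l %assign pair simply ping-pongs between m8/m9
; on alternate iterations so two unrolled iterations do not share a scratch
; register.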
%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER 10,  8, 1
HEVC_SAO_BAND_FILTER 10, 16, 2
HEVC_SAO_BAND_FILTER 10, 32, 4
HEVC_SAO_BAND_FILTER 10, 48, 6
HEVC_SAO_BAND_FILTER 10, 64, 8

HEVC_SAO_BAND_FILTER 12,  8, 1
HEVC_SAO_BAND_FILTER 12, 16, 2
HEVC_SAO_BAND_FILTER 12, 32, 4
HEVC_SAO_BAND_FILTER 12, 48, 6
HEVC_SAO_BAND_FILTER 12, 64, 8
%endmacro

INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_BAND_FILTER 10,  8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 10, 16, 1
HEVC_SAO_BAND_FILTER 10, 32, 2
HEVC_SAO_BAND_FILTER 10, 48, 3
HEVC_SAO_BAND_FILTER 10, 64, 4

INIT_XMM avx2
HEVC_SAO_BAND_FILTER 12,  8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 12, 16, 1
HEVC_SAO_BAND_FILTER 12, 32, 2
HEVC_SAO_BAND_FILTER 12, 48, 3
HEVC_SAO_BAND_FILTER 12, 64, 4
%endif

;******************************************************************************
;SAO Edge Filter
;******************************************************************************

%define MAX_PB_SIZE  64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE

%macro PMINUW 4
%if cpuflag(sse4)
    pminuw  %1, %2, %3
%else
    psubusw %4, %2, %3
    psubw   %1, %2, %4
%endif
%endmacro
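
; Editorial note (a sketch, not upstream text): pminuw is SSE4.1-only, so
; the SSE2 path uses the identity min(a, b) = a - max(a - b, 0); psubusw
; computes the unsigned saturating difference max(a - b, 0), which psubw
; then subtracts back out.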
%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
    movsxd eoq, dword eom
%elif ARCH_X86_64
    movsxd eoq, eod
%else
    mov eoq, r4m
%endif
    lea tmp2q, [pb_eo]
    movsx a_strideq, byte [tmp2q+eoq*4+1]
    movsx b_strideq, byte [tmp2q+eoq*4+3]
    imul a_strideq, EDGE_SRCSTRIDE >> 1
    imul b_strideq, EDGE_SRCSTRIDE >> 1
    movsx tmpq, byte [tmp2q+eoq*4]
    add a_strideq, tmpq
    movsx tmpq, byte [tmp2q+eoq*4+2]
    add b_strideq, tmpq
%endmacro
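
; Editorial note (a sketch, not upstream text): unlike the 8-bit version, the
; neighbour offsets are first computed in samples (EDGE_SRCSTRIDE >> 1 rows)
; and the callers below then double a_strideq/b_strideq to get byte offsets,
; since every sample is two bytes wide at 10/12 bit.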
;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
;                                                   int eo, int width, int height);
%macro HEVC_SAO_EDGE_FILTER 3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov heightd, r6m
    add a_strideq, a_strideq
    add b_strideq, b_strideq

%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
%define eoq     srcq
%define tmpq    heightq
%define tmp2q   dststrideq
%define offsetq heightq
%define m8  m1
%define m9  m2
%define m10 m3
%define m11 m4
%define m12 m5
    HEVC_SAO_EDGE_FILTER_INIT
    mov srcq, srcm
    mov offsetq, r3m
    mov dststrideq, dststridem
    add a_strideq, a_strideq
    add b_strideq, b_strideq

%endif ; ARCH

%if mmsize > 16
    SPLATW m8,  [offsetq+2]
    SPLATW m9,  [offsetq+4]
    SPLATW m10, [offsetq+0]
    SPLATW m11, [offsetq+6]
    SPLATW m12, [offsetq+8]
%else
    movq m10, [offsetq+0]
    movd m12, [offsetq+6]
    SPLATW m8,  xm10, 1
    SPLATW m9,  xm10, 2
    SPLATW m10, xm10, 0
    SPLATW m11, xm12, 0
    SPLATW m12, xm12, 1
%endif
    pxor m0, m0
%if ARCH_X86_64
    mova m13, [pw_m1]
    mova m14, [pw_1]
    mova m15, [pw_2]
%else
    mov heightd, r6m
    mova [rsp+mmsize*0], m8
    mova [rsp+mmsize*1], m9
    mova [rsp+mmsize*2], m10
    mova [rsp+mmsize*3], m11
    mova [rsp+mmsize*4], m12
%endif

align 16
.loop:

%assign i 0
%rep %3
    mova m1, [srcq + i]
    movu m2, [srcq+a_strideq + i]
    movu m3, [srcq+b_strideq + i]
    PMINUW m4, m1, m2, m6
    PMINUW m5, m1, m3, m7
    pcmpeqw m2, m4
    pcmpeqw m3, m5
    pcmpeqw m4, m1
    pcmpeqw m5, m1
    psubw m4, m2
    psubw m5, m3

    paddw m4, m5
    pcmpeqw m2, m4, [pw_m2]
%if ARCH_X86_64
    pcmpeqw m3, m4, m13
    pcmpeqw m5, m4, m0
    pcmpeqw m6, m4, m14
    pcmpeqw m7, m4, m15
    pand m2, m8
    pand m3, m9
    pand m5, m10
    pand m6, m11
    pand m7, m12
%else
    pcmpeqw m3, m4, [pw_m1]
    pcmpeqw m5, m4, m0
    pcmpeqw m6, m4, [pw_1]
    pcmpeqw m7, m4, [pw_2]
    pand m2, [rsp+mmsize*0]
    pand m3, [rsp+mmsize*1]
    pand m5, [rsp+mmsize*2]
    pand m6, [rsp+mmsize*3]
    pand m7, [rsp+mmsize*4]
%endif
    paddw m2, m3
    paddw m5, m6
    paddw m2, m7
    paddw m2, m1
    paddw m2, m5
    CLIPW m2, m0, [pw_mask %+ %1]
    mova [dstq + i], m2
%assign i i+mmsize
%endrep

    add dstq, dststrideq
    add srcq, EDGE_SRCSTRIDE
    dec heightd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
HEVC_SAO_EDGE_FILTER 10,  8, 1
HEVC_SAO_EDGE_FILTER 10, 16, 2
HEVC_SAO_EDGE_FILTER 10, 32, 4
HEVC_SAO_EDGE_FILTER 10, 48, 6
HEVC_SAO_EDGE_FILTER 10, 64, 8

HEVC_SAO_EDGE_FILTER 12,  8, 1
HEVC_SAO_EDGE_FILTER 12, 16, 2
HEVC_SAO_EDGE_FILTER 12, 32, 4
HEVC_SAO_EDGE_FILTER 12, 48, 6
HEVC_SAO_EDGE_FILTER 12, 64, 8

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 10,  8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 10, 16, 1
HEVC_SAO_EDGE_FILTER 10, 32, 2
HEVC_SAO_EDGE_FILTER 10, 48, 3
HEVC_SAO_EDGE_FILTER 10, 64, 4

INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 12,  8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 12, 16, 1
HEVC_SAO_EDGE_FILTER 12, 32, 2
HEVC_SAO_EDGE_FILTER 12, 48, 3
HEVC_SAO_EDGE_FILTER 12, 64, 4
%endif
259
externals/ffmpeg/libavcodec/x86/hevcdsp.h
vendored
Executable file
@@ -0,0 +1,259 @@
/*
 * HEVC video decoder
 *
 * Copyright (C) 2012 - 2013 Guillaume Martres
 * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
 *
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_HEVCDSP_H
#define AVCODEC_X86_HEVCDSP_H

#include <stddef.h>
#include <stdint.h>


#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
    dst[idx1][idx2][idx3]           = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
    dst ## _bi[idx1][idx2][idx3]    = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
    dst ## _uni[idx1][idx2][idx3]   = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
    dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
    dst ## _bi_w[idx1][idx2][idx3]  = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
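
/* Editorial note (illustrative, not part of the upstream header): PEL_LINK
 * fills all five variants of one function-pointer slot at once. A
 * hypothetical use from an init function, assuming HEVCDSPContext fields
 * named as in libavcodec, would expand roughly as:
 *
 *   PEL_LINK(c->put_hevc_epel, 1, 0, 0, epel_h4, 8, sse4);
 *   // c->put_hevc_epel[1][0][0]    = ff_hevc_put_hevc_epel_h4_8_sse4;
 *   // c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_epel_h4_8_sse4;
 *   // ... and the _uni, _uni_w and _bi_w variants likewise.
 */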
#define PEL_PROTOTYPE(name, D, opt) \
void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)


///////////////////////////////////////////////////////////////////////////////
// MC functions
///////////////////////////////////////////////////////////////////////////////

#define EPEL_PROTOTYPES(fname, bitd, opt) \
        PEL_PROTOTYPE(fname##4,  bitd, opt); \
        PEL_PROTOTYPE(fname##6,  bitd, opt); \
        PEL_PROTOTYPE(fname##8,  bitd, opt); \
        PEL_PROTOTYPE(fname##12, bitd, opt); \
        PEL_PROTOTYPE(fname##16, bitd, opt); \
        PEL_PROTOTYPE(fname##24, bitd, opt); \
        PEL_PROTOTYPE(fname##32, bitd, opt); \
        PEL_PROTOTYPE(fname##48, bitd, opt); \
        PEL_PROTOTYPE(fname##64, bitd, opt)

#define QPEL_PROTOTYPES(fname, bitd, opt) \
        PEL_PROTOTYPE(fname##4,  bitd, opt); \
        PEL_PROTOTYPE(fname##8,  bitd, opt); \
        PEL_PROTOTYPE(fname##12, bitd, opt); \
        PEL_PROTOTYPE(fname##16, bitd, opt); \
        PEL_PROTOTYPE(fname##24, bitd, opt); \
        PEL_PROTOTYPE(fname##32, bitd, opt); \
        PEL_PROTOTYPE(fname##48, bitd, opt); \
        PEL_PROTOTYPE(fname##64, bitd, opt)

#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)

#define WEIGHTING_PROTOTYPES(bitd, opt) \
        WEIGHTING_PROTOTYPE(2,  bitd, opt); \
        WEIGHTING_PROTOTYPE(4,  bitd, opt); \
        WEIGHTING_PROTOTYPE(6,  bitd, opt); \
        WEIGHTING_PROTOTYPE(8,  bitd, opt); \
        WEIGHTING_PROTOTYPE(12, bitd, opt); \
        WEIGHTING_PROTOTYPE(16, bitd, opt); \
        WEIGHTING_PROTOTYPE(24, bitd, opt); \
        WEIGHTING_PROTOTYPE(32, bitd, opt); \
        WEIGHTING_PROTOTYPE(48, bitd, opt); \
        WEIGHTING_PROTOTYPE(64, bitd, opt)


///////////////////////////////////////////////////////////////////////////////
// QPEL_PIXELS EPEL_PIXELS
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(pel_pixels , 8, sse4);
EPEL_PROTOTYPES(pel_pixels , 10, sse4);
EPEL_PROTOTYPES(pel_pixels , 12, sse4);

void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);

void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);


void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);  // used for 10bit
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); // used for 10bit


void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);

void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);

///////////////////////////////////////////////////////////////////////////////
// EPEL
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(epel_h , 8, sse4);
EPEL_PROTOTYPES(epel_h , 10, sse4);
EPEL_PROTOTYPES(epel_h , 12, sse4);

EPEL_PROTOTYPES(epel_v , 8, sse4);
EPEL_PROTOTYPES(epel_v , 10, sse4);
EPEL_PROTOTYPES(epel_v , 12, sse4);

EPEL_PROTOTYPES(epel_hv , 8, sse4);
EPEL_PROTOTYPES(epel_hv , 10, sse4);
EPEL_PROTOTYPES(epel_hv , 12, sse4);

PEL_PROTOTYPE(epel_h16, 8, avx2);
PEL_PROTOTYPE(epel_h24, 8, avx2);
PEL_PROTOTYPE(epel_h32, 8, avx2);
PEL_PROTOTYPE(epel_h48, 8, avx2);
PEL_PROTOTYPE(epel_h64, 8, avx2);

PEL_PROTOTYPE(epel_h16,10, avx2);
PEL_PROTOTYPE(epel_h24,10, avx2);
PEL_PROTOTYPE(epel_h32,10, avx2);
PEL_PROTOTYPE(epel_h48,10, avx2);
PEL_PROTOTYPE(epel_h64,10, avx2);

PEL_PROTOTYPE(epel_v16, 8, avx2);
PEL_PROTOTYPE(epel_v24, 8, avx2);
PEL_PROTOTYPE(epel_v32, 8, avx2);
PEL_PROTOTYPE(epel_v48, 8, avx2);
PEL_PROTOTYPE(epel_v64, 8, avx2);

PEL_PROTOTYPE(epel_v16,10, avx2);
PEL_PROTOTYPE(epel_v24,10, avx2);
PEL_PROTOTYPE(epel_v32,10, avx2);
PEL_PROTOTYPE(epel_v48,10, avx2);
PEL_PROTOTYPE(epel_v64,10, avx2);

PEL_PROTOTYPE(epel_hv16, 8, avx2);
PEL_PROTOTYPE(epel_hv24, 8, avx2);
PEL_PROTOTYPE(epel_hv32, 8, avx2);
PEL_PROTOTYPE(epel_hv48, 8, avx2);
PEL_PROTOTYPE(epel_hv64, 8, avx2);

PEL_PROTOTYPE(epel_hv16,10, avx2);
PEL_PROTOTYPE(epel_hv24,10, avx2);
PEL_PROTOTYPE(epel_hv32,10, avx2);
PEL_PROTOTYPE(epel_hv48,10, avx2);
PEL_PROTOTYPE(epel_hv64,10, avx2);

///////////////////////////////////////////////////////////////////////////////
// QPEL
///////////////////////////////////////////////////////////////////////////////
QPEL_PROTOTYPES(qpel_h , 8, sse4);
QPEL_PROTOTYPES(qpel_h , 10, sse4);
QPEL_PROTOTYPES(qpel_h , 12, sse4);

QPEL_PROTOTYPES(qpel_v, 8, sse4);
QPEL_PROTOTYPES(qpel_v, 10, sse4);
QPEL_PROTOTYPES(qpel_v, 12, sse4);

QPEL_PROTOTYPES(qpel_hv, 8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 12, sse4);

PEL_PROTOTYPE(qpel_h16, 8, avx2);
PEL_PROTOTYPE(qpel_h24, 8, avx2);
PEL_PROTOTYPE(qpel_h32, 8, avx2);
PEL_PROTOTYPE(qpel_h48, 8, avx2);
PEL_PROTOTYPE(qpel_h64, 8, avx2);

PEL_PROTOTYPE(qpel_h16,10, avx2);
PEL_PROTOTYPE(qpel_h24,10, avx2);
PEL_PROTOTYPE(qpel_h32,10, avx2);
PEL_PROTOTYPE(qpel_h48,10, avx2);
PEL_PROTOTYPE(qpel_h64,10, avx2);

PEL_PROTOTYPE(qpel_v16, 8, avx2);
PEL_PROTOTYPE(qpel_v24, 8, avx2);
PEL_PROTOTYPE(qpel_v32, 8, avx2);
PEL_PROTOTYPE(qpel_v48, 8, avx2);
PEL_PROTOTYPE(qpel_v64, 8, avx2);

PEL_PROTOTYPE(qpel_v16,10, avx2);
PEL_PROTOTYPE(qpel_v24,10, avx2);
PEL_PROTOTYPE(qpel_v32,10, avx2);
PEL_PROTOTYPE(qpel_v48,10, avx2);
PEL_PROTOTYPE(qpel_v64,10, avx2);

PEL_PROTOTYPE(qpel_hv16, 8, avx2);
PEL_PROTOTYPE(qpel_hv24, 8, avx2);
PEL_PROTOTYPE(qpel_hv32, 8, avx2);
PEL_PROTOTYPE(qpel_hv48, 8, avx2);
PEL_PROTOTYPE(qpel_hv64, 8, avx2);

PEL_PROTOTYPE(qpel_hv16,10, avx2);
PEL_PROTOTYPE(qpel_hv24,10, avx2);
PEL_PROTOTYPE(qpel_hv32,10, avx2);
PEL_PROTOTYPE(qpel_hv48,10, avx2);
PEL_PROTOTYPE(qpel_hv64,10, avx2);

WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
WEIGHTING_PROTOTYPES(12, sse4);

///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////

void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

#endif // AVCODEC_X86_HEVCDSP_H
1151
externals/ffmpeg/libavcodec/x86/hevcdsp_init.c
vendored
Executable file
File diff suppressed because it is too large
591
externals/ffmpeg/libavcodec/x86/hpeldsp.asm
vendored
Executable file
@@ -0,0 +1,591 @@
;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea r4, [r2*2]
.loop:
    movu m0, [r1+1]
    movu m1, [r1+r2+1]
%if cpuflag(sse2)
    movu m2, [r1]
    movu m3, [r1+r2]
    pavgb m0, m2
    pavgb m1, m3
%else
    PAVGB m0, [r1]
    PAVGB m1, [r1+r2]
%endif
    mova [r0], m0
    mova [r0+r2], m1
    add r1, r4
    add r0, r4
    movu m0, [r1+1]
    movu m1, [r1+r2+1]
%if cpuflag(sse2)
    movu m2, [r1]
    movu m3, [r1+r2]
    pavgb m0, m2
    pavgb m1, m3
%else
    PAVGB m0, [r1]
    PAVGB m1, [r1+r2]
%endif
    add r1, r4
    mova [r0], m0
    mova [r0+r2], m1
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
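
PUT_PIXELS8_X2 stores, per output sample, the rounding-up average of two horizontally adjacent source bytes; pavgb (and the PAVGB fallback) computes (a + b + 1) >> 1. A scalar model under that reading, with an illustrative name:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of put_pixels8_x2: block[x] = (pixels[x] + pixels[x+1] + 1) >> 1
 * for each of h rows; the asm above merely unrolls four rows per iteration. */
static void put_pixels8_x2_ref(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 8; x++)
            block[x] = (pixels[x] + pixels[x + 1] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}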


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea r4, [r2*2]
.loop:
    mova m0, [r1]
    mova m1, [r1+r2]
    mova m2, [r1+8]
    mova m3, [r1+r2+8]
    PAVGB m0, [r1+1]
    PAVGB m1, [r1+r2+1]
    PAVGB m2, [r1+9]
    PAVGB m3, [r1+r2+9]
    mova [r0], m0
    mova [r0+r2], m1
    mova [r0+8], m2
    mova [r0+r2+8], m3
    add r1, r4
    add r0, r4
    mova m0, [r1]
    mova m1, [r1+r2]
    mova m2, [r1+8]
    mova m3, [r1+r2+8]
    PAVGB m0, [r1+1]
    PAVGB m1, [r1+r2+1]
    PAVGB m2, [r1+9]
    PAVGB m3, [r1+r2+9]
    add r1, r4
    mova [r0], m0
    mova [r0+r2], m1
    mova [r0+8], m2
    mova [r0+r2+8], m3
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova m6, [pb_1]
    lea r4, [r2*2]
.loop:
    mova m0, [r1]
    mova m2, [r1+r2]
    mova m1, [r1+1]
    mova m3, [r1+r2+1]
    add r1, r4
    psubusb m0, m6
    psubusb m2, m6
    PAVGB m0, m1
    PAVGB m2, m3
    mova [r0], m0
    mova [r0+r2], m2
    mova m0, [r1]
    mova m1, [r1+1]
    mova m2, [r1+r2]
    mova m3, [r1+r2+1]
    add r0, r4
    add r1, r4
    psubusb m0, m6
    psubusb m2, m6
    PAVGB m0, m1
    PAVGB m2, m3
    mova [r0], m0
    mova [r0+r2], m2
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
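
PUT_NO_RND_PIXELS8_X2 wants the truncating average (a + b) >> 1 but only has the rounding pavgb, so it first applies a saturating subtract of 1 (psubusb with pb_1) to one operand: (max(a - 1, 0) + b + 1) >> 1 equals (a + b) >> 1 whenever a > 0, and only the a == 0 corner rounds differently, which this kernel tolerates. The intended scalar result, for reference:

#include <stdint.h>

/* Truncating ("no rounding") average that the psubusb/PAVGB pair approximates. */
static inline uint8_t avg_no_rnd(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b) >> 1);
}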


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea r4, [r2*2]
    movu m0, [r1]
    sub r0, r2
.loop:
    movu m1, [r1+r2]
    movu m2, [r1+r4]
    add r1, r4
    PAVGB m0, m1
    PAVGB m1, m2
    mova [r0+r2], m0
    mova [r0+r4], m1
    movu m1, [r1+r2]
    movu m0, [r1+r4]
    add r0, r4
    add r1, r4
    PAVGB m2, m1
    PAVGB m1, m0
    mova [r0+r2], m2
    mova [r0+r4], m1
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova m6, [pb_1]
    lea r4, [r2+r2]
    mova m0, [r1]
    sub r0, r2
.loop:
    mova m1, [r1+r2]
    mova m2, [r1+r4]
    add r1, r4
    psubusb m1, m6
    PAVGB m0, m1
    PAVGB m1, m2
    mova [r0+r2], m0
    mova [r0+r4], m1
    mova m1, [r1+r2]
    mova m0, [r1+r4]
    add r0, r4
    add r1, r4
    psubusb m1, m6
    PAVGB m2, m1
    PAVGB m1, m0
    mova [r0+r2], m2
    mova [r0+r4], m1
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2


; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea r4, [r2*2]
.loop:
    mova m0, [r0]
    mova m1, [r0+r2]
    PAVGB m0, [r1]
    PAVGB m1, [r1+r2]
    mova [r0], m0
    mova [r0+r2], m1
    add r1, r4
    add r0, r4
    mova m0, [r0]
    mova m1, [r0+r2]
    PAVGB m0, [r1]
    PAVGB m1, [r1+r2]
    add r1, r4
    mova [r0], m0
    mova [r0+r2], m1
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea r4, [r2*2]
%if notcpuflag(mmxext)
    pcmpeqd m5, m5
    paddb m5, m5
%endif
.loop:
    movu m0, [r1]
    movu m2, [r1+r2]
%if cpuflag(sse2)
    movu m1, [r1+1]
    movu m3, [r1+r2+1]
    pavgb m0, m1
    pavgb m2, m3
%else
    PAVGB m0, [r1+1], m3, m5
    PAVGB m2, [r1+r2+1], m4, m5
%endif
    PAVGB m0, [r0], m3, m5
    PAVGB m2, [r0+r2], m4, m5
    add r1, r4
    mova [r0], m0
    mova [r0+r2], m2
    movu m0, [r1]
    movu m2, [r1+r2]
%if cpuflag(sse2)
    movu m1, [r1+1]
    movu m3, [r1+r2+1]
    pavgb m0, m1
    pavgb m2, m3
%else
    PAVGB m0, [r1+1], m3, m5
    PAVGB m2, [r1+r2+1], m4, m5
%endif
    add r0, r4
    add r1, r4
    PAVGB m0, [r0], m3, m5
    PAVGB m2, [r0+r2], m4, m5
    mova [r0], m0
    mova [r0+r2], m2
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmx
AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea r4, [r2*2]
    movu m0, [r1]
    sub r0, r2
.loop:
    movu m1, [r1+r2]
    movu m2, [r1+r4]
    add r1, r4
    PAVGB m0, m1
    PAVGB m1, m2
    PAVGB m0, [r0+r2]
    PAVGB m1, [r0+r4]
    mova [r0+r2], m0
    mova [r0+r4], m1
    movu m1, [r1+r2]
    movu m0, [r1+r4]
    PAVGB m2, m1
    PAVGB m1, m0
    add r0, r4
    add r1, r4
    PAVGB m2, [r0+r2]
    PAVGB m1, [r0+r4]
    mova [r0+r2], m2
    mova [r0+r4], m1
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output
%macro AVG_APPROX_PIXELS8_XY2 0
cglobal avg_approx_pixels8_xy2, 4,5
    mova m6, [pb_1]
    lea r4, [r2*2]
    mova m0, [r1]
    PAVGB m0, [r1+1]
.loop:
    mova m2, [r1+r4]
    mova m1, [r1+r2]
    psubusb m2, m6
    PAVGB m1, [r1+r2+1]
    PAVGB m2, [r1+r4+1]
    add r1, r4
    PAVGB m0, m1
    PAVGB m1, m2
    PAVGB m0, [r0]
    PAVGB m1, [r0+r2]
    mova [r0], m0
    mova [r0+r2], m1
    mova m1, [r1+r2]
    mova m0, [r1+r4]
    PAVGB m1, [r1+r2+1]
    PAVGB m0, [r1+r4+1]
    add r0, r4
    add r1, r4
    PAVGB m2, m1
    PAVGB m1, m0
    PAVGB m2, [r0]
    PAVGB m1, [r0+r2]
    mova [r0], m2
    mova [r0+r2], m1
    add r0, r4
    sub r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2


; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor m7, m7
    mova m6, [pw_2]
    movu m0, [r1]
    movu m4, [r1+1]
    mova m1, m0
    mova m5, m4
    punpcklbw m0, m7
    punpcklbw m4, m7
    punpckhbw m1, m7
    punpckhbw m5, m7
    paddusw m4, m0
    paddusw m5, m1
    xor r4, r4
    add r1, r2
.loop:
    movu m0, [r1+r4]
    movu m2, [r1+r4+1]
    mova m1, m0
    mova m3, m2
    punpcklbw m0, m7
    punpcklbw m2, m7
    punpckhbw m1, m7
    punpckhbw m3, m7
    paddusw m0, m2
    paddusw m1, m3
    paddusw m4, m6
    paddusw m5, m6
    paddusw m4, m0
    paddusw m5, m1
    psrlw m4, 2
    psrlw m5, 2
%ifidn %1, avg
    mova m3, [r0+r4]
    packuswb m4, m5
    PAVGB m4, m3
%else
    packuswb m4, m5
%endif
    mova [r0+r4], m4
    add r4, r2

    movu m2, [r1+r4]
    movu m4, [r1+r4+1]
    mova m3, m2
    mova m5, m4
    punpcklbw m2, m7
    punpcklbw m4, m7
    punpckhbw m3, m7
    punpckhbw m5, m7
    paddusw m4, m2
    paddusw m5, m3
    paddusw m0, m6
    paddusw m1, m6
    paddusw m0, m4
    paddusw m1, m5
    psrlw m0, 2
    psrlw m1, 2
%ifidn %1, avg
    mova m3, [r0+r4]
    packuswb m0, m1
    PAVGB m0, m3
%else
    packuswb m0, m1
%endif
    mova [r0+r4], m0
    add r4, r2
    sub r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_MMX 3dnow
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
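
SET_PIXELS_XY2 is the diagonal halfpel case: each output byte is the rounded average of a 2x2 source neighbourhood, computed in 16-bit lanes (punpck*bw widens, pw_2 adds the rounding bias, psrlw 2 divides by four). A scalar model of the put flavour, with an illustrative name:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the "put" instantiation of SET_PIXELS_XY2 for 8-wide blocks. */
static void put_pixels8_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 8; x++)
            block[x] = (pixels[x] + pixels[x + 1] +
                        pixels[x + line_size] + pixels[x + line_size + 1] + 2) >> 2;
        block  += line_size;
        pixels += line_size;
    }
}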

%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; sse2
cglobal %1_pixels16_xy2, 4,5,%2
    mova m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova m4, [pb_interleave8]
%endif
    mova m5, [pb_1]
    movu m0, [r1]
    movu m1, [r1+1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    xor r4, r4
    add r1, r2
.loop:
    movu m2, [r1+r4]
    movu m3, [r1+r4+1]
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    paddusw m0, m2
    paddusw m1, m3
    pmulhrsw m0, [pw_8192]
    pmulhrsw m1, [pw_8192]
%ifidn %1, avg
    mova m6, [r0+r4]
    packuswb m0, m1
    pshufb m0, m4
    pavgb m0, m6
%else
    packuswb m0, m1
    pshufb m0, m4
%endif
    mova [r0+r4], m0
    add r4, r2

    movu m0, [r1+r4]
    movu m1, [r1+r4+1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    paddusw m2, m0
    paddusw m3, m1
    pmulhrsw m2, [pw_8192]
    pmulhrsw m3, [pw_8192]
%ifidn %1, avg
    mova m6, [r0+r4]
    packuswb m2, m3
    pshufb m2, m4
    pavgb m2, m6
%else
    packuswb m2, m3
    pshufb m2, m4
%endif
    mova [r0+r4], m2
    add r4, r2
    sub r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7
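
One reading of the SSSE3 path above, inferred from the code rather than an authorial comment: pmaddubsw against pb_1 sums horizontal byte pairs into words, and pmulhrsw against pw_8192 turns each accumulated sum x into (x * 8192 + (1 << 14)) >> 15, which for these small sums is exactly the rounded (x + 2) >> 2; pshufb with pb_interleave8/16 then undoes the even/odd lane split after packuswb. The identity, spelled out:

/* For the sums produced here (four bytes plus bias, well below 1 << 12):
 *   ((x * 8192) + (1 << 14)) >> 15  ==  (x + 2) >> 2
 * because x * 8192 + 16384 == 8192 * (x + 2). */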
57
externals/ffmpeg/libavcodec/x86/hpeldsp.h
vendored
Executable file
@@ -0,0 +1,57 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_HPELDSP_H
#define AVCODEC_X86_HPELDSP_H

#include <stddef.h>
#include <stdint.h>

#include "libavcodec/hpeldsp.h"

void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);

void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);

void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);

void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);

#endif /* AVCODEC_X86_HPELDSP_H */
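
For orientation in the init code that follows: the HpelDSPContext tables are indexed first by block width (in these files, [0] holds the 16-pixel-wide kernels and [1] the 8-pixel-wide ones) and then by halfpel position (0 = full-pel copy, 1 = x2 horizontal, 2 = y2 vertical, 3 = xy2 diagonal), as the assignments below show. A hedged usage sketch; the wrapper name is illustrative:

#include <stddef.h>
#include <stdint.h>
#include "libavcodec/hpeldsp.h"

/* Call through the table an init function filled in: [1][3] selects the
 * 8-wide diagonal-halfpel "put" kernel. Illustrative wrapper only. */
static void put_block8_xy2(HpelDSPContext *c, uint8_t *dst,
                           const uint8_t *src, ptrdiff_t stride, int h)
{
    c->put_pixels_tab[1][3](dst, src, stride, h);
}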
313
externals/ffmpeg/libavcodec/x86/hpeldsp_init.c
vendored
Executable file
@@ -0,0 +1,313 @@
/*
 * SIMD-optimized halfpel functions
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"

void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);

#define avg_pixels8_mmx         ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx        ff_avg_pixels16_mmx
#define avg_pixels8_xy2_mmx     ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx    ff_avg_pixels16_xy2_mmx
#define put_pixels8_mmx         ff_put_pixels8_mmx
#define put_pixels16_mmx        ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx     ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx    ff_put_pixels16_xy2_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
#define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx

#if HAVE_INLINE_ASM

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static

#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef STATIC

#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)

CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "hpeldsp_rnd_template.c"

#undef DEF
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC

#include "rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

#if HAVE_MMX
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)

CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
#endif

#endif /* HAVE_INLINE_ASM */


#if HAVE_X86ASM

#define HPELDSP_AVG_PIXELS16(CPUEXT)                \
    CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_pixels16_y2        ## CPUEXT, ff_put_pixels8_y2        ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16           ## CPUEXT, ff_avg_pixels8           ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)

HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)

#endif /* HAVE_X86ASM */

#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
    if (HAVE_MMX_EXTERNAL)                                                  \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;

#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
#else
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)     \
    do {                                        \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
    } while (0)
#endif

static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
    SET_HPEL_FUNCS(put,        [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
    SET_HPEL_FUNCS(put,        [1],  8, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
    if (HAVE_MMX_EXTERNAL) {
        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
    }
#if HAVE_MMX_INLINE
    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
#endif
}

static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;

    c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;

    if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

    c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;

    if (!(flags & AV_CODEC_FLAG_BITEXACT)){
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
    }
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}

static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
{
#if HAVE_SSE2_EXTERNAL
    c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
    c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
    c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
    c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
#endif
}

av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags))
        hpeldsp_init_mmx(c, flags);

    if (EXTERNAL_AMD3DNOW(cpu_flags))
        hpeldsp_init_3dnow(c, flags);

    if (EXTERNAL_MMXEXT(cpu_flags))
        hpeldsp_init_mmxext(c, flags);

    if (EXTERNAL_SSE2_FAST(cpu_flags))
        hpeldsp_init_sse2_fast(c, flags);

    if (EXTERNAL_SSSE3(cpu_flags))
        hpeldsp_init_ssse3(c, flags);

    if (CONFIG_VP3_DECODER)
        ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
}
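
For context, a sketch of how this entry point is reached: the generic hpeldsp initializer installs portable C implementations first and then lets each architecture overwrite the entries it accelerates. This is an abbreviated model, not the verbatim generic code:

/* Abbreviated model of the generic dispatch into ff_hpeldsp_init_x86. */
void hpeldsp_init_model(HpelDSPContext *c, int flags)
{
    /* ... portable C implementations are installed here first ... */
    if (ARCH_X86)
        ff_hpeldsp_init_x86(c, flags); /* overrides what this CPU supports */
}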
202
externals/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
vendored
Executable file
@@ -0,0 +1,202 @@
/*
 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

// put_pixels
av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"FF_REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :FF_REG_a, "memory");
}

av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"FF_REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :FF_REG_a, "memory");
}

av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"FF_REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"),%%mm2\n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"),%%mm0\n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :FF_REG_a, "memory");
}

av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%2), %%mm3 \n\t"
        PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
        PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
        "movq %%mm0, (%2) \n\t"
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
        PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
        "movq %%mm0, 8(%2) \n\t"
        "add %3, %1 \n\t"
        "add %3, %2 \n\t"
        "subl $1, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :"memory");
}

av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"FF_REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"

        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm2, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"FF_REG_a", %1 \n\t"
        "add %%"FF_REG_a", %2 \n\t"

        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :FF_REG_a, "memory");
}
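
Each function body in this template is compiled twice from hpeldsp_init.c, once per DEF/SET_RND/PAVGB binding, so one source yields both the rounding and the no-rounding variants. Restating the first binding shown earlier:

/* Under: #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
 *   DEF(put, pixels8_x2)  expands to  put_no_rnd_pixels8_x2_mmx
 * and PAVGBP/PAVGB resolve to the *_MMX_NO_RND forms; the second inclusion
 * rebinds DEF without "_no_rnd_" and uses the rounding PAVGBP_MMX/PAVGB_MMX. */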
111
externals/ffmpeg/libavcodec/x86/hpeldsp_vp3.asm
vendored
Executable file
@@ -0,0 +1,111 @@
;******************************************************************************
;* SIMD-optimized halfpel functions for VP3
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
    lea r4, [r2*3]
    pcmpeqb m6, m6
.loop:
    mova m0, [r1]
    mova m2, [r1+r2]
    mova m1, [r1+1]
    mova m3, [r1+r2+1]
    pxor m0, m6
    pxor m2, m6
    pxor m1, m6
    pxor m3, m6
    PAVGB m0, m1
    PAVGB m2, m3
    pxor m0, m6
    pxor m2, m6
    mova [r0], m0
    mova [r0+r2], m2
    mova m0, [r1+r2*2]
    mova m1, [r1+r2*2+1]
    mova m2, [r1+r4]
    mova m3, [r1+r4+1]
    pxor m0, m6
    pxor m1, m6
    pxor m2, m6
    pxor m3, m6
    PAVGB m0, m1
    PAVGB m2, m3
    pxor m0, m6
    pxor m2, m6
    mova [r0+r2*2], m0
    mova [r0+r4], m2
    lea r1, [r1+r2*4]
    lea r0, [r0+r2*4]
    sub r3d, 4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT


; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
    lea r4, [r2*3]
    mova m0, [r1]
    pcmpeqb m6, m6
    add r1, r2
    pxor m0, m6
.loop:
    mova m1, [r1]
    mova m2, [r1+r2]
    pxor m1, m6
    pxor m2, m6
    PAVGB m0, m1
    PAVGB m1, m2
    pxor m0, m6
    pxor m1, m6
    mova [r0], m0
    mova [r0+r2], m1
    mova m1, [r1+r2*2]
    mova m0, [r1+r4]
    pxor m1, m6
    pxor m0, m6
    PAVGB m2, m1
    PAVGB m1, m0
    pxor m2, m6
    pxor m1, m6
    mova [r0+r2*2], m2
    mova [r0+r4], m1
    lea r1, [r1+r2*4]
    lea r0, [r0+r2*4]
    sub r3d, 4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT
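
Both *_exact kernels obtain a bit-exact truncating average from the rounding pavgb by complementing: in 8 bits, floor((a + b) / 2) == ~ceil((~a + ~b) / 2), hence the pxor against all-ones (pcmpeqb m6, m6) before and after each PAVGB. The identity as scalar code:

#include <stdint.h>

/* No-rounding average via complement, mirroring the pxor/PAVGB/pxor above. */
static inline uint8_t avg_floor_via_complement(uint8_t a, uint8_t b)
{
    uint8_t na = (uint8_t)~a, nb = (uint8_t)~b;
    return (uint8_t)~((na + nb + 1) >> 1); /* equals (a + b) >> 1 */
}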
56
externals/ffmpeg/libavcodec/x86/hpeldsp_vp3_init.c
vendored
Executable file
@@ -0,0 +1,56 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"

#include "hpeldsp.h"

void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);

av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
{
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        if (flags & AV_CODEC_FLAG_BITEXACT) {
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
        }
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        if (flags & AV_CODEC_FLAG_BITEXACT) {
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
        }
    }
}
164
externals/ffmpeg/libavcodec/x86/huffyuvdsp.asm
vendored
Executable file
@@ -0,0 +1,164 @@
;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%include "libavcodec/x86/huffyuvdsp_template.asm"

;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;------------------------------------------------------------------------------

%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
%if mmsize > 8
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
%endif
    INT16_LOOP a, add
%if mmsize > 8
.unaligned:
    INT16_LOOP u, add
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_INT16
%endif

INIT_XMM sse2
ADD_INT16

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl wq, 2
    movd m0, [leftq]
    lea dstq, [dstq + wq]
    lea srcq, [srcq + wq]
    LSHIFT m0, mmsize-4
    neg wq
.loop:
    movu m1, [srcq+wq]
    mova m2, m1
%if mmsize == 8
    punpckhdq m0, m0
%endif
    LSHIFT m1, 4
    paddb m1, m2
%if mmsize == 16
    pshufd m0, m0, q3333
    mova m2, m1
    LSHIFT m1, 8
    paddb m1, m2
%endif
    paddb m0, m1
    movu [dstq+wq], m0
    add wq, mmsize
    jl .loop
    movd m0, [dstq-4]
    movd [leftq], m0
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif
INIT_XMM sse2
LEFT_BGR32

; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add wd, wd
    movd mm6, maskd
    SPLATW mm6, mm6
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 16
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubw mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 16
    por mm4, mm1
    movq mm1, mm0 ; t
    psubw mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 4
    movq mm4, mm0
    paddw mm4, mm3 ; t-tl+l
    pand mm4, mm6
    movq mm5, mm3
    pmaxsw mm3, mm1
    pminsw mm5, mm1
    pminsw mm3, mm4
    pmaxsw mm3, mm5 ; median
    paddw mm3, mm2 ; +residual
    pand mm3, mm6
%if i==0
    movq mm7, mm3
    psllq mm7, 48
%else
    movq mm4, mm3
    psrlq mm7, 16
    psllq mm4, 48
    por mm7, mm4
%endif
%if i<3
    psrlq mm0, 16
    psrlq mm1, 16
    psrlq mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, word [dstq-2]
    mov [leftq], r2d
    movzx r2d, word [topq-2]
    mov [left_topq], r2d
    RET
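
add_hfyu_median_pred_int16 reconstructs each sample as residual plus the median of (left, top, left + top - topleft); the pmaxsw/pminsw triple above is a branchless median, as its "; median" comment notes. A scalar model matching the prototype the init code uses, with illustrative helper names:

#include <stdint.h>

static inline int mid_pred3(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; } /* now a <= b */
    if (b > c) b = c;                       /* b = min(b, c) */
    return a > b ? a : b;                   /* median of the three */
}

/* Scalar model of add_hfyu_median_pred_int16; mask limits the bit depth. */
static void add_median_pred_ref(uint16_t *dst, const uint16_t *top,
                                const uint16_t *diff, unsigned mask,
                                int w, int *left, int *left_top)
{
    int l = *left, tl = *left_top;
    for (int i = 0; i < w; i++) {
        int t    = top[i];
        int pred = mid_pred3(l, t, (int)((l + t - tl) & mask));
        l        = (int)((pred + diff[i]) & mask);
        tl       = t;
        dst[i]   = (uint16_t)l;
    }
    *left     = l;
    *left_top = tl;
}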
61
externals/ffmpeg/libavcodec/x86/huffyuvdsp_init.c
vendored
Executable file
@@ -0,0 +1,61 @@
/*
 * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"

void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_avx2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);

void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
                                     intptr_t w, uint8_t *left);
void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
                                      intptr_t w, uint8_t *left);
void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);

av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
{
    int cpu_flags = av_get_cpu_flags();
    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
        c->add_int16 = ff_add_int16_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->add_int16 = ff_add_int16_sse2;
        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->add_int16 = ff_add_int16_avx2;
    }
}
76
externals/ffmpeg/libavcodec/x86/huffyuvdsp_template.asm
vendored
Executable file
@@ -0,0 +1,76 @@
;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
    movd xm4, maskd
    SPLATW m4, xm4
    add wd, wd
    test wq, 2*mmsize - 1
    jz %%.tomainloop
    push tmpq
%%.wordloop:
    sub wq, 2
%ifidn %2, add
    mov tmpw, [srcq+wq]
    add tmpw, [dstq+wq]
%else
    mov tmpw, [src1q+wq]
    sub tmpw, [src2q+wq]
%endif
    and tmpw, maskw
    mov [dstq+wq], tmpw
    test wq, 2*mmsize - 1
    jnz %%.wordloop
    pop tmpq
%%.tomainloop:
%ifidn %2, add
    add srcq, wq
%else
    add src1q, wq
    add src2q, wq
%endif
    add dstq, wq
    neg wq
    jz %%.end
%%.loop:
%ifidn %2, add
    mov%1 m0, [srcq+wq]
    mov%1 m1, [dstq+wq]
    mov%1 m2, [srcq+wq+mmsize]
    mov%1 m3, [dstq+wq+mmsize]
%else
    mov%1 m0, [src1q+wq]
    mov%1 m1, [src2q+wq]
    mov%1 m2, [src1q+wq+mmsize]
    mov%1 m3, [src2q+wq+mmsize]
%endif
    p%2w m0, m1
    p%2w m2, m3
    pand m0, m4
    pand m2, m4
    mov%1 [dstq+wq] , m0
    mov%1 [dstq+wq+mmsize], m2
    add wq, 2*mmsize
    jl %%.loop
%%.end:
    RET
%endmacro
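
INT16_LOOP first burns off any tail that is not a multiple of 2*mmsize with a scalar word loop, then processes two registers per iteration from a negative offset; the and against the splatted mask keeps results inside the coded bit depth. A scalar model of the add binding (the sub binding differs only in the operator and its source operands):

#include <stdint.h>

/* Scalar model of ff_add_int16 as instantiated via INT16_LOOP a/u, add. */
static void add_int16_ref(uint16_t *dst, const uint16_t *src,
                          unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (uint16_t)((dst[i] + src[i]) & mask);
}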
105
externals/ffmpeg/libavcodec/x86/huffyuvencdsp.asm
vendored
Executable file
@@ -0,0 +1,105 @@
|
||||
;************************************************************************
|
||||
;* SIMD-optimized HuffYUV encoding functions
|
||||
;* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
;*
|
||||
;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
||||
;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
%include "libavcodec/x86/huffyuvdsp_template.asm"
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
|
||||
; unsigned mask, int w);
|
||||
;------------------------------------------------------------------------------

%macro DIFF_INT16 0
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
%if mmsize > 8
    test src1q, mmsize-1
    jnz .unaligned
    test src2q, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
%endif
    INT16_LOOP a, sub
%if mmsize > 8
.unaligned:
    INT16_LOOP u, sub
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
DIFF_INT16
%endif

INIT_XMM sse2
DIFF_INT16

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DIFF_INT16
%endif

INIT_MMX mmxext
cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
    add     wd, wd
    movd    mm7, maskd
    SPLATW  mm7, mm7
    movq    mm0, [src1q]
    movq    mm2, [src2q]
    psllq   mm0, 16
    psllq   mm2, 16
    movd    mm6, [left_topq]
    por     mm0, mm6
    movd    mm6, [leftq]
    por     mm2, mm6
    xor     maskq, maskq
.loop:
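    ; median predictor: mm2 = l + t - tl (masked); the pmaxsw/pminsw chain
    ; clamps it to median(l, t, l + t - tl), then dst = src2 - prediction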
    movq    mm1, [src1q + maskq]
    movq    mm3, [src2q + maskq]
    movq    mm4, mm2
    psubw   mm2, mm0
    paddw   mm2, mm1
    pand    mm2, mm7
    movq    mm5, mm4
    pmaxsw  mm4, mm1
    pminsw  mm1, mm5
    pminsw  mm4, mm2
    pmaxsw  mm4, mm1
    psubw   mm3, mm4
    pand    mm3, mm7
    movq    [dstq + maskq], mm3
    add     maskq, 8
    movq    mm0, [src1q + maskq - 2]
    movq    mm2, [src2q + maskq - 2]
    cmp     maskq, wq
    jb .loop
    movzx   maskd, word [src1q + wq - 2]
    mov     [left_topq], maskd
    movzx   maskd, word [src2q + wq - 2]
    mov     [leftq], maskd
    RET
60
externals/ffmpeg/libavcodec/x86/huffyuvencdsp_init.c
vendored
Executable file
@@ -0,0 +1,60 @@
/*
 * SIMD-optimized HuffYUV encoding functions
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvencdsp.h"

void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                        unsigned mask, int w);
void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                        unsigned mask, int w);
void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                        unsigned mask, int w);
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                                          unsigned mask, int w, int *left, int *left_top);

av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
{
    av_unused int cpu_flags = av_get_cpu_flags();
    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->diff_int16 = ff_diff_int16_mmx;
}
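
    /* the mmxext median predictor compares lanes with signed 16-bit min/max,
     * so it is only safe while the sample depth leaves the sign bit clear */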
    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth < 16) {
        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->diff_int16 = ff_diff_int16_sse2;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->diff_int16 = ff_diff_int16_avx2;
    }
}
183
externals/ffmpeg/libavcodec/x86/idctdsp.asm
vendored
Executable file
@@ -0,0 +1,183 @@
;******************************************************************************
;* SIMD-optimized IDCT-related routines
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_80

SECTION .text

;--------------------------------------------------------------------------
;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                                  ptrdiff_t line_size)
;--------------------------------------------------------------------------
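; packsswb saturates the 16-bit coefficients to signed bytes; adding the
; 0x80 bias from pb_80 then maps them into the unsigned pixel range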

%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
    mova     m1, [blockq+mmsize*0+%1]
    mova     m2, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m3, [blockq+mmsize*4+%1]
    mova     m4, [blockq+mmsize*6+%1]
%endif
    packsswb m1, [blockq+mmsize*1+%1]
    packsswb m2, [blockq+mmsize*3+%1]
%if mmsize == 8
    packsswb m3, [blockq+mmsize*5+%1]
    packsswb m4, [blockq+mmsize*7+%1]
%endif
    paddb    m1, m0
    paddb    m2, m0
%if mmsize == 8
    paddb    m3, m0
    paddb    m4, m0
    movq     [pixelsq+lsizeq*0], m1
    movq     [pixelsq+lsizeq*1], m2
    movq     [pixelsq+lsizeq*2], m3
    movq     [pixelsq+lsize3q ], m4
%else
    movq     [pixelsq+lsizeq*0], m1
    movhps   [pixelsq+lsizeq*1], m1
    movq     [pixelsq+lsizeq*2], m2
    movhps   [pixelsq+lsize3q ], m2
%endif
%endmacro

%macro PUT_SIGNED_PIXELS_CLAMPED 1
cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
    mova     m0, [pb_80]
    lea      lsize3q, [lsizeq*3]
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
    lea      pixelsq, [pixelsq+lsizeq*4]
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
    RET
%endmacro

INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3

;--------------------------------------------------------------------------
; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;--------------------------------------------------------------------------
; %1 = block offset
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova     m0, [blockq+mmsize*0+%1]
    mova     m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m2, [blockq+mmsize*4+%1]
    mova     m3, [blockq+mmsize*6+%1]
%endif
    packuswb m0, [blockq+mmsize*1+%1]
    packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb m2, [blockq+mmsize*5+%1]
    packuswb m3, [blockq+mmsize*7+%1]
    movq     [pixelsq], m0
    movq     [lsizeq+pixelsq], m1
    movq     [2*lsizeq+pixelsq], m2
    movq     [lsize3q+pixelsq], m3
%else
    movq     [pixelsq], m0
    movhps   [lsizeq+pixelsq], m0
    movq     [2*lsizeq+pixelsq], m1
    movhps   [lsize3q+pixelsq], m1
%endif
%endmacro

%macro PUT_PIXELS_CLAMPED 0
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
    RET
%endmacro

INIT_MMX mmx
PUT_PIXELS_CLAMPED
INIT_XMM sse2
PUT_PIXELS_CLAMPED

;--------------------------------------------------------------------------
; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;--------------------------------------------------------------------------
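; adds the IDCT residual to the existing pixels with signed saturation
; (paddsw) before packing back to unsigned bytes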
; %1 = block offset
%macro ADD_PIXELS_CLAMPED 1
    mova       m0, [blockq+mmsize*0+%1]
    mova       m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    mova       m5, [blockq+mmsize*2+%1]
    mova       m6, [blockq+mmsize*3+%1]
%endif
    movq       m2, [pixelsq]
    movq       m3, [pixelsq+lsizeq]
%if mmsize == 8
    mova       m7, m2
    punpcklbw  m2, m4
    punpckhbw  m7, m4
    paddsw     m0, m2
    paddsw     m1, m7
    mova       m7, m3
    punpcklbw  m3, m4
    punpckhbw  m7, m4
    paddsw     m5, m3
    paddsw     m6, m7
%else
    punpcklbw  m2, m4
    punpcklbw  m3, m4
    paddsw     m0, m2
    paddsw     m1, m3
%endif
    packuswb   m0, m1
%if mmsize == 8
    packuswb   m5, m6
    movq       [pixelsq], m0
    movq       [pixelsq+lsizeq], m5
%else
    movq       [pixelsq], m0
    movhps     [pixelsq+lsizeq], m0
%endif
%endmacro

%macro ADD_PIXELS_CLAMPED 0
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor       m4, m4
    ADD_PIXELS_CLAMPED 0
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
    RET
%endmacro

INIT_MMX mmx
ADD_PIXELS_CLAMPED
INIT_XMM sse2
ADD_PIXELS_CLAMPED
39
externals/ffmpeg/libavcodec/x86/idctdsp.h
vendored
Executable file
@@ -0,0 +1,39 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_IDCTDSP_H
#define AVCODEC_X86_IDCTDSP_H

#include <stddef.h>
#include <stdint.h>

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               ptrdiff_t line_size);
void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                ptrdiff_t line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               ptrdiff_t line_size);
void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                       ptrdiff_t line_size);

#endif /* AVCODEC_X86_IDCTDSP_H */
162
externals/ffmpeg/libavcodec/x86/idctdsp_init.c
vendored
Executable file
@@ -0,0 +1,162 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idctdsp.h"
#include "simple_idct.h"

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
                                              enum idct_permutation_type perm_type)
{
    int i;

    switch (perm_type) {
    case FF_IDCT_PERM_SIMPLE:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = simple_mmx_permutation[i];
        return 1;
    case FF_IDCT_PERM_SSE2:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
        return 1;
    }

    return 0;
}

av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

        if (!high_bit_depth &&
            avctx->lowres == 0 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
            c->idct_put  = ff_simple_idct_put_mmx;
            c->idct_add  = ff_simple_idct_add_mmx;
            c->idct      = ff_simple_idct_mmx;
            c->perm_type = FF_IDCT_PERM_SIMPLE;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;

        if (!high_bit_depth &&
            avctx->lowres == 0 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
            c->idct_put  = ff_simple_idct_put_sse2;
            c->idct_add  = ff_simple_idct_add_sse2;
            c->perm_type = FF_IDCT_PERM_SIMPLE;
        }

        if (ARCH_X86_64 &&
            !high_bit_depth &&
            avctx->lowres == 0 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
             avctx->idct_algo == FF_IDCT_SIMPLE)) {
            c->idct      = ff_simple_idct8_sse2;
            c->idct_put  = ff_simple_idct8_put_sse2;
            c->idct_add  = ff_simple_idct8_add_sse2;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }
    }

    if (ARCH_X86_64 && avctx->lowres == 0) {
        if (EXTERNAL_AVX(cpu_flags) &&
            !high_bit_depth &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
             avctx->idct_algo == FF_IDCT_SIMPLE)) {
            c->idct      = ff_simple_idct8_avx;
            c->idct_put  = ff_simple_idct8_put_avx;
            c->idct_add  = ff_simple_idct8_add_avx;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }

        if (avctx->bits_per_raw_sample == 10 &&
            avctx->codec_id != AV_CODEC_ID_MPEG4 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLE)) {
            if (EXTERNAL_SSE2(cpu_flags)) {
                c->idct_put  = ff_simple_idct10_put_sse2;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct10_sse2;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
            if (EXTERNAL_AVX(cpu_flags)) {
                c->idct_put  = ff_simple_idct10_put_avx;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct10_avx;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
        }

        if (avctx->bits_per_raw_sample == 12 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
            if (EXTERNAL_SSE2(cpu_flags)) {
                c->idct_put  = ff_simple_idct12_put_sse2;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct12_sse2;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
            if (EXTERNAL_AVX(cpu_flags)) {
                c->idct_put  = ff_simple_idct12_put_avx;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct12_avx;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
        }
    }
}
741
externals/ffmpeg/libavcodec/x86/imdct36.asm
vendored
Executable file
@@ -0,0 +1,741 @@
;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0,  0, ~0
ps_mask3: dd 0,  0,  0, ~0
ps_mask4: dd 0, ~0,  0,  0

ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097

ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000

ps_cosh:      dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
              dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
              dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
              dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
              dd 1.0, 0.70710678118654752439,  0.0,  0.0

ps_cosh_sse3: dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
              dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
              dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
              dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
              dd 1.0, -0.70710678118654752439,  0.0,  0.0

costabs:  times 4 dd  0.98480773
          times 4 dd  0.93969262
          times 4 dd  0.86602539
          times 4 dd -0.76604444
          times 4 dd -0.64278764
          times 4 dd  0.50000000
          times 4 dd -0.50000000
          times 4 dd -0.34202015
          times 4 dd -0.17364818
          times 4 dd  0.50190992
          times 4 dd  0.51763808
          times 4 dd  0.55168896
          times 4 dd  0.61038726
          times 4 dd  0.70710677
          times 4 dd  0.87172341
          times 4 dd  1.18310082
          times 4 dd  1.93185163
          times 4 dd  5.73685646

%define SBLIMIT 32
SECTION .text

%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %1, %2, %3
%else
    shufps %1, %2, %2, %3
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps %1, %2, %3, 0x4e
%else
    movlhps %1, %3
    movhlps %1, %2
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps %1, %1, %3, 0x99
%endif
%endmacro

%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

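; 2-point butterfly scaled by the ps_cosh constants; the SSE3 path folds the
; final add/subtract into addsubps using the sign-flipped ps_cosh_sse3 table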
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

%macro BUTTERF2 3
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xe1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xe1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

%macro STORE 4
%if cpuflag(sse4)
    movss     [%3       ], %1
    extractps dword [%3 +   %4], %1, 1
    extractps dword [%3 + 2*%4], %1, 2
    extractps dword [%3 + 3*%4], %1, 3
%else
    movhlps %2, %1
    movss   [%3       ], %1
    movss   [%3 + 2*%4], %2
    shufps  %1, %1, 0xb1
    movss   [%3 +   %4], %1
    movhlps %2, %1
    movss   [%3 + 3*%4], %2
%endif
%endmacro

%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88
%endmacro

%macro LOADA64 2
%if cpuflag(avx)
    movu    %1, [%2]
%else
    movlps  %1, [%2]
    movhps  %1, [%2 + 8]
%endif
%endmacro

%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, m6, [ps_mask]
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4, m4
    movlps  m4, [inq+64]
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps   m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps   m3, m3, m7
    addps   m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5    ; zero out high values
    subps   m6, m6, m4

    subps   m5, m0, m3

%if ARCH_X86_64
    SWAP    m5, m8
%endif

    mulps   m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps   m5, m8, [ps_val2]
%else
    mulps   m5, m5, [ps_val2]
%endif
    addps   m7, m7, m5

    mulps   m5, m6, [ps_val1]
    subps   m7, m7, m5

%if ARCH_X86_64
    SWAP    m5, m8
%else
    subps   m5, m0, m3
%endif

    subps   m5, m5, m6
    addps   m5, m5, m2

    shufps  m6, m4, m3, 0xe4
    subps   m6, m6, m2
    mulps   m6, m6, [ps_val3]

    addps   m4, m4, m1
    mulps   m4, m4, [ps_val4]

    shufps  m1, m1, m0, 0xe4
    addps   m1, m1, m2
    mulps   m1, m1, [ps_val5]

    mulps   m3, m3, [ps_val6]
    mulps   m0, m0, [ps_val7]
    addps   m0, m0, m3

    xorps   m2, m1, [ps_p1p1m1m1]
    subps   m2, m2, m4
    addps   m2, m2, m0

    addps   m3, m4, m0
    subps   m3, m3, m6
    xorps   m3, m3, [ps_p1p1m1m1]

    shufps  m0, m0, m4, 0xe4
    subps   m0, m0, m1
    addps   m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}

    BUTTERF  m0, m1, 0
    BUTTERF  m7, m2, 16
    BUTTERF  m3, m6, 32
    BUTTERF  m4, m1, 48
    BUTTERF2 m5, m1, 64

    ; permutes:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP m4, m3
    ; permutation done

    PSHUFD   m6, m2, 0xb1
    movss    m4, [bufq + 4*68]
    movss    m7, [bufq + 4*64]
    unpcklps m7, m7, m4
    mulps    m6, m6, [winq + 16*4]
    addps    m6, m6, m7
    movss    [outq + 64*SBLIMIT], m6
    shufps   m6, m6, m6, 0xb1
    movss    [outq + 68*SBLIMIT], m6

    mulps    m6, m3, [winq + 4*4]
    LOAD     m4, m7, bufq + 4*16, 16
    addps    m6, m6, m4
    STORE    m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps   m4, m0, m3, 0xb5
    mulps    m4, m4, [winq + 8*4]
    LOAD     m7, m6, bufq + 4*32, 16
    addps    m4, m4, m7
    STORE    m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps   m3, m3, m2, 0xb1
    mulps    m3, m3, [winq + 12*4]
    LOAD     m7, m6, bufq + 4*48, 16
    addps    m3, m3, m7
    STORE    m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps    m2, m2, [winq]
    LOAD     m6, m7, bufq, 16
    addps    m2, m2, m6
    STORE    m2, m7, outq, 4*SBLIMIT

    mulps    m4, m1, [winq + 20*4]
    STORE    m4, m7, bufq, 16

    mulps    m3, m5, [winq + 24*4]
    STORE    m3, m7, bufq + 4*16, 16

    shufps   m0, m0, m5, 0xb0
    mulps    m0, m0, [winq + 28*4]
    STORE    m0, m7, bufq + 4*32, 16

    shufps   m5, m5, m1, 0xb1
    mulps    m5, m5, [winq + 32*4]
    STORE    m5, m7, bufq + 4*48, 16

    shufps   m1, m1, m1, 0xb1
    mulps    m1, m1, [winq + 36*4]
    movss    [bufq + 4*64], m1
    shufps   m1, m1, 0xb1
    movss    [bufq + 4*68], m1
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
%endif

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

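; on x86-32 only 8 XMM registers exist, so "spilled" values round-trip
; through the tmp buffer; on x86-64 SPILL/UNSPILL just SWAP into m8-m15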
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%endif

%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    mova    m1, [inq+48]
    movu    m6, [inq+48 +   72]
    mova    m7, [inq+48 + 2*72]
    movu    m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps   m7, m3
    addps   m6, m1
    addps   m3, m0
    addps   m0, m5
    addps   m0, m7
    addps   m7, m6
    mova    [tmpq+4*12], m7
    SPILL 3, 12

    mova    m4, [inq+32]
    movu    m5, [inq+32 +   72]
    mova    m2, [inq+32 + 2*72]
    movu    m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL 1, 11

    addps   m3, m5, m2
    SPILL 3, 13

    addps   m7, m2
    addps   m5, m4
    addps   m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL 5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL 1, 10
    SPILL 0, 14

    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps   m2, m5
    addps   m5, m3
    addps   m7, m5
    addps   m3, m6
    addps   m6, m1
    SPILL 7, 8
    addps   m5, m6
    SPILL 6, 9
    addps   m6, m4, SPILLED(12)
    subps   m6, m2
    UNSPILL 7, 11
    SPILL 5, 11
    subps   m5, m1, m7
    mulps   m7, [costabs + 16*5]
    addps   m7, m1
    mulps   m0, m6, [costabs + 16*6]
    addps   m0, m5
    mova    [tmpq+4*24], m0
    addps   m6, m5
    mova    [tmpq+4*4], m6
    addps   m6, m4, m2
    mulps   m6, [costabs + 16*1]
    subps   m4, SPILLED(12)
    mulps   m4, [costabs + 16*8]
    addps   m2, SPILLED(12)
    mulps   m2, [costabs + 16*3]
    subps   m5, m7, m6
    subps   m5, m2
    addps   m6, m7
    addps   m6, m4
    addps   m7, m2
    subps   m7, m4
    mova    [tmpq+4*20], m7
    mova    m2, [tmpq+4*28]
    mova    [tmpq+4*28], m5
    UNSPILL 7, 13
    subps   m5, m7, m2
    mulps   m5, [costabs + 16*7]
    UNSPILL 1, 10
    mulps   m1, [costabs + 16*2]
    addps   m4, m3, m2
    mulps   m4, [costabs + 16*4]
    addps   m2, m7
    addps   m7, m3
    mulps   m7, [costabs]
    subps   m3, m2
    mulps   m3, [costabs + 16*2]
    addps   m2, m7, m5
    addps   m2, m1
    SPILL 2, 10
    addps   m7, m4
    subps   m7, m1
    SPILL 7, 12
    subps   m5, m4
    subps   m5, m1
    UNSPILL 0, 14
    SPILL 5, 13
    addps   m1, m0, SPILLED(15)
    subps   m1, SPILLED(8)
    mova    m4, [costabs + 16*5]
    mulps   m4, [tmpq]
    UNSPILL 2, 9
    addps   m4, m2
    subps   m2, [tmpq]
    mulps   m5, m1, [costabs + 16*6]
    addps   m5, m2
    SPILL 5, 9
    addps   m2, m1
    SPILL 2, 14
    UNSPILL 5, 15
    subps   m7, m5, m0
    addps   m5, SPILLED(8)
    mulps   m5, [costabs + 16*1]
    mulps   m7, [costabs + 16*8]
    addps   m0, SPILLED(8)
    mulps   m0, [costabs + 16*3]
    subps   m2, m4, m5
    subps   m2, m0
    SPILL 2, 15
    addps   m5, m4
    addps   m5, m7
    addps   m4, m0
    subps   m4, m7
    SPILL 4, 8
    mova    m7, [tmpq+4*16]
    mova    m2, [tmpq+4*12]
    addps   m0, m7, m2
    subps   m0, SPILLED(11)
    mulps   m0, [costabs + 16*2]
    addps   m4, m7, SPILLED(11)
    mulps   m4, [costabs]
    subps   m7, m2
    mulps   m7, [costabs + 16*7]
    addps   m2, SPILLED(11)
    mulps   m2, [costabs + 16*4]
    addps   m1, m7, [tmpq+4*8]
    addps   m1, m4
    addps   m4, m2
    subps   m4, [tmpq+4*8]
    SPILL 4, 11
    subps   m7, m2
    subps   m7, [tmpq+4*8]
    addps   m4, m6, SPILLED(10)
    subps   m6, SPILLED(10)
    addps   m2, m5, m1
    mulps   m2, [costabs + 16*9]
    subps   m5, m1
    mulps   m5, [costabs + 16*17]
    subps   m1, m4, m2
    addps   m4, m2
    mulps   m2, m1, [winq+4*36]
    addps   m2, [bufq+4*36]
    mova    [outq+1152], m2
    mulps   m1, [winq+4*32]
    addps   m1, [bufq+4*32]
    mova    [outq+1024], m1
    mulps   m1, m4, [winq+4*116]
    mova    [bufq+4*36], m1
    mulps   m4, [winq+4*112]
    mova    [bufq+4*32], m4
    addps   m2, m6, m5
    subps   m6, m5
    mulps   m1, m6, [winq+4*68]
    addps   m1, [bufq+4*68]
    mova    [outq+2176], m1
    mulps   m6, [winq]
    addps   m6, [bufq]
    mova    [outq], m6
    mulps   m1, m2, [winq+4*148]
    mova    [bufq+4*68], m1
    mulps   m2, [winq+4*80]
    mova    [bufq], m2
    addps   m5, m3, [tmpq+4*24]
    mova    m2, [tmpq+4*24]
    subps   m2, m3
    mova    m1, SPILLED(9)
    subps   m1, m0
    mulps   m1, [costabs + 16*10]
    addps   m0, SPILLED(9)
    mulps   m0, [costabs + 16*16]
    addps   m6, m5, m1
    subps   m5, m1
    mulps   m3, m5, [winq+4*40]
    addps   m3, [bufq+4*40]
    mova    [outq+1280], m3
    mulps   m5, [winq+4*28]
    addps   m5, [bufq+4*28]
    mova    [outq+896], m5
    mulps   m1, m6, [winq+4*120]
    mova    [bufq+4*40], m1
    mulps   m6, [winq+4*108]
    mova    [bufq+4*28], m6
    addps   m1, m2, m0
    subps   m2, m0
    mulps   m5, m2, [winq+4*64]
    addps   m5, [bufq+4*64]
    mova    [outq+2048], m5
    mulps   m2, [winq+4*4]
    addps   m2, [bufq+4*4]
    mova    [outq+128], m2
    mulps   m0, m1, [winq+4*144]
    mova    [bufq+4*64], m0
    mulps   m1, [winq+4*84]
    mova    [bufq+4*4], m1
    mova    m1, [tmpq+4*28]
    mova    m5, m1
    addps   m1, SPILLED(13)
    subps   m5, SPILLED(13)
    UNSPILL 3, 15
    addps   m2, m7, m3
    mulps   m2, [costabs + 16*11]
    subps   m3, m7
    mulps   m3, [costabs + 16*15]
    addps   m0, m2, m1
    subps   m1, m2
    SWAP m0, m2
    mulps   m6, m1, [winq+4*44]
    addps   m6, [bufq+4*44]
    mova    [outq+1408], m6
    mulps   m1, [winq+4*24]
    addps   m1, [bufq+4*24]
    mova    [outq+768], m1
    mulps   m0, m2, [winq+4*124]
    mova    [bufq+4*44], m0
    mulps   m2, [winq+4*104]
    mova    [bufq+4*24], m2
    addps   m0, m5, m3
    subps   m5, m3
    mulps   m1, m5, [winq+4*60]
    addps   m1, [bufq+4*60]
    mova    [outq+1920], m1
    mulps   m5, [winq+4*8]
    addps   m5, [bufq+4*8]
    mova    [outq+256], m5
    mulps   m1, m0, [winq+4*140]
    mova    [bufq+4*60], m1
    mulps   m0, [winq+4*88]
    mova    [bufq+4*8], m0
    mova    m1, [tmpq+4*20]
    addps   m1, SPILLED(12)
    mova    m2, [tmpq+4*20]
    subps   m2, SPILLED(12)
    UNSPILL 7, 8
    subps   m0, m7, SPILLED(11)
    addps   m7, SPILLED(11)
    mulps   m4, m7, [costabs + 16*12]
    mulps   m0, [costabs + 16*14]
    addps   m5, m1, m4
    subps   m1, m4
    mulps   m7, m1, [winq+4*48]
    addps   m7, [bufq+4*48]
    mova    [outq+1536], m7
    mulps   m1, [winq+4*20]
    addps   m1, [bufq+4*20]
    mova    [outq+640], m1
    mulps   m1, m5, [winq+4*128]
    mova    [bufq+4*48], m1
    mulps   m5, [winq+4*100]
    mova    [bufq+4*20], m5
    addps   m6, m2, m0
    subps   m2, m0
    mulps   m1, m2, [winq+4*56]
    addps   m1, [bufq+4*56]
    mova    [outq+1792], m1
    mulps   m2, [winq+4*12]
    addps   m2, [bufq+4*12]
    mova    [outq+384], m2
    mulps   m0, m6, [winq+4*136]
    mova    [bufq+4*56], m0
    mulps   m6, [winq+4*92]
    mova    [bufq+4*12], m6
    UNSPILL 0, 14
    mulps   m0, [costabs + 16*13]
    mova    m3, [tmpq+4*4]
    addps   m2, m0, m3
    subps   m3, m0
    mulps   m0, m3, [winq+4*52]
    addps   m0, [bufq+4*52]
    mova    [outq+1664], m0
    mulps   m3, [winq+4*16]
    addps   m3, [bufq+4*16]
    mova    [outq+512], m3
    mulps   m0, m2, [winq+4*132]
    mova    [bufq+4*52], m0
    mulps   m2, [winq+4*96]
    mova    [bufq+4*16], m2
    RET
%endmacro

INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif
100
externals/ffmpeg/libavcodec/x86/inline_asm.h
vendored
Executable file
@@ -0,0 +1,100 @@
/*
 * inline assembly helper macros
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_INLINE_ASM_H
#define AVCODEC_X86_INLINE_ASM_H

#include "constants.h"

#define MOVQ_WONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%"#regd", %%"#regd"   \n\t" \
    "paddb %%"#regd", %%"#regd"     \n\t" ::)

#ifndef PIC
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%"#regd", %%"#regd"   \n\t" \
    "psrlw $15, %%"#regd"           \n\t" \
    "psllw $1, %%"#regd"            \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
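// floor-average trick: (a & b) + ((a ^ b) >> 1); the 0xfe mask clears the low
// bit of each byte before the shift, since psrlq has no per-byte granularity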
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq  "#rega", "#regr"             \n\t" \
    "pand  "#regb", "#regr"             \n\t" \
    "pxor  "#rega", "#regb"             \n\t" \
    "pand  "#regfe", "#regb"            \n\t" \
    "psrlq $1, "#regb"                  \n\t" \
    "paddb "#regb", "#regr"             \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq  "#rega", "#regr"             \n\t" \
    "por   "#regb", "#regr"             \n\t" \
    "pxor  "#rega", "#regb"             \n\t" \
    "pand  "#regfe", "#regb"            \n\t" \
    "psrlq $1, "#regb"                  \n\t" \
    "psubb "#regb", "#regr"             \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq  "#rega", "#regr"             \n\t" \
    "movq  "#regc", "#regp"             \n\t" \
    "pand  "#regb", "#regr"             \n\t" \
    "pand  "#regd", "#regp"             \n\t" \
    "pxor  "#rega", "#regb"             \n\t" \
    "pxor  "#regc", "#regd"             \n\t" \
    "pand  %%mm6, "#regb"               \n\t" \
    "pand  %%mm6, "#regd"               \n\t" \
    "psrlq $1, "#regb"                  \n\t" \
    "psrlq $1, "#regd"                  \n\t" \
    "paddb "#regb", "#regr"             \n\t" \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq  "#rega", "#regr"             \n\t" \
    "movq  "#regc", "#regp"             \n\t" \
    "por   "#regb", "#regr"             \n\t" \
    "por   "#regd", "#regp"             \n\t" \
    "pxor  "#rega", "#regb"             \n\t" \
    "pxor  "#regc", "#regd"             \n\t" \
    "pand  %%mm6, "#regb"               \n\t" \
    "pand  %%mm6, "#regd"               \n\t" \
    "psrlq $1, "#regd"                  \n\t" \
    "psrlq $1, "#regb"                  \n\t" \
    "psubb "#regb", "#regr"             \n\t" \
    "psubb "#regd", "#regp"             \n\t"

#endif /* AVCODEC_X86_INLINE_ASM_H */
164
externals/ffmpeg/libavcodec/x86/jpeg2000dsp.asm
vendored
Executable file
@@ -0,0 +1,164 @@
;******************************************************************************
;* SIMD-optimized JPEG2000 DSP functions
;* Copyright (c) 2014 Nicolas Bertrand
;* Copyright (c) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pf_ict0: times 8 dd 1.402
pf_ict1: times 8 dd 0.34413
pf_ict2: times 8 dd 0.71414
pf_ict3: times 8 dd 1.772

SECTION .text

;***********************************************************************
; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
;***********************************************************************
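; inverse irreversible color transform (9/7 path): R = Y + 1.402*Cr,
; G = Y - 0.34413*Cb - 0.71414*Cr, B = Y + 1.772*Cb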
%macro ICT_FLOAT 1
cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
    shl  csized, 2
    add  src0q, csizeq
    add  src1q, csizeq
    add  src2q, csizeq
    neg  csizeq
    movaps m6, [pf_ict0]
    movaps m7, [pf_ict1]
    %define ICT0 m6
    %define ICT1 m7

%if ARCH_X86_64
    movaps m8, [pf_ict2]
    %define ICT2 m8
%if cpuflag(avx)
    movaps m3, [pf_ict3]
    %define ICT3 m3
%else
    movaps m9, [pf_ict3]
    %define ICT3 m9
%endif

%else ; ARCH_X86_32
    %define ICT2 [pf_ict2]
%if cpuflag(avx)
    movaps m3, [pf_ict3]
    %define ICT3 m3
%else
    %define ICT3 [pf_ict3]
%endif

%endif ; ARCH

align 16
.loop:
    movaps m0, [src0q+csizeq]
    movaps m1, [src1q+csizeq]
    movaps m2, [src2q+csizeq]

%if cpuflag(fma4) || cpuflag(fma3)
%if cpuflag(fma4)
    fnmaddps m5, m1, ICT1, m0
    fmaddps  m4, m2, ICT0, m0
%else ; fma3
    movaps   m5, m1
    movaps   m4, m2
    fnmaddps m5, m5, ICT1, m0
    fmaddps  m4, m4, ICT0, m0
%endif
    fmaddps  m0, m1, ICT3, m0
    fnmaddps m5, m2, ICT2, m5
%else ; non FMA
%if cpuflag(avx)
    mulps  m5, m1, ICT1
    mulps  m4, m2, ICT0
    mulps  m1, m1, ICT3
    mulps  m2, m2, ICT2
    subps  m5, m0, m5
%else ; sse
    movaps m3, m1
    movaps m4, m2
    movaps m5, m0
    mulps  m3, ICT1
    mulps  m4, ICT0
    mulps  m1, ICT3
    mulps  m2, ICT2
    subps  m5, m3
%endif
    addps  m4, m4, m0
    addps  m0, m0, m1
    subps  m5, m5, m2
%endif

    movaps [src0q+csizeq], m4
    movaps [src2q+csizeq], m0
    movaps [src1q+csizeq], m5
    add csizeq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
ICT_FLOAT 10
INIT_YMM avx
ICT_FLOAT 9
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
ICT_FLOAT 9
%endif
INIT_YMM fma3
ICT_FLOAT 9

;***************************************************************************
; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
;***************************************************************************
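; inverse reversible color transform (5/3 path), all-integer:
; G = Y - ((Cb + Cr) >> 2), then R = Cr + G and B = Cb + G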
%macro RCT_INT 0
cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
    shl  csized, 2
    add  src0q, csizeq
    add  src1q, csizeq
    add  src2q, csizeq
    neg  csizeq

align 16
.loop:
    mova m1, [src1q+csizeq]
    mova m2, [src2q+csizeq]
    mova m0, [src0q+csizeq]
    paddd m3, m1, m2
    psrad m3, 2
    psubd m0, m3
    paddd m1, m0
    paddd m2, m0
    mova [src1q+csizeq], m0
    mova [src2q+csizeq], m1
    mova [src0q+csizeq], m2
    add csizeq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
RCT_INT
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
RCT_INT
%endif
60
externals/ffmpeg/libavcodec/x86/jpeg2000dsp_init.c
vendored
Executable file
@@ -0,0 +1,60 @@
/*
 * SIMD optimized JPEG 2000 DSP functions
 * Copyright (c) 2015 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"

void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma3(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma4(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);

av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();
    if (EXTERNAL_SSE(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
    }

    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_avx;
    }

    if (EXTERNAL_FMA4(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_fma4;
    }

    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_fma3;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
    }
}
190
externals/ffmpeg/libavcodec/x86/lossless_audiodsp.asm
vendored
Executable file
@@ -0,0 +1,190 @@
;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
; int order, int mul)
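; returns sum(v1[i]*v2[i]) and, in the same pass, rewrites v1[i] += mul*v3[i]
; (the combined correlate-and-update step used by lossless audio LPC filters)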
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

INIT_XMM sse4
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    SPLATW  m7, m7
    pxor    m6, m6
    add     v1q, orderq
    lea     v2q, [v2q + 2*orderq]
    add     v3q, orderq
    neg     orderq
.loop:
    mova    m3, [v1q + orderq]
    movu    m0, [v2q + 2*orderq]
    pmovsxwd m4, m3
    movu    m1, [v2q + 2*orderq + mmsize]
    movhlps m5, m3
    movu    m2, [v3q + orderq]
    pmovsxwd m5, m5
    pmullw  m2, m7
    pmulld  m0, m4
    pmulld  m1, m5
    paddw   m2, m3
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    add     orderq, 16
    jl .loop
    HADDD   m6, m0
    movd    eax, m6
    RET

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
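; v2/v3 were aligned down to 16 bytes above; r4d holds the misalignment, and
; each .loop%1 variant uses palignr to stitch the shifted vectors back together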
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD m6, m0
    movd eax, m6
    RET
56
externals/ffmpeg/libavcodec/x86/lossless_audiodsp_init.c
vendored
Executable file
@@ -0,0 +1,56 @@
|
||||
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_audiodsp.h"

int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);

av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags))
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (EXTERNAL_SSE2(cpu_flags))
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

    if (EXTERNAL_SSSE3(cpu_flags) &&
        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;

    if (EXTERNAL_SSE4(cpu_flags))
        c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
#endif
}
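A minimal usage sketch of the dispatch above (hypothetical caller; decoders
normally reach these pointers through ff_llauddsp_init(), which invokes the
x86 initializer on x86 builds):

#include "libavcodec/lossless_audiodsp.h"

/* After initialization, callers go through the context's function pointer
 * and transparently get the fastest routine the host CPU supports. */
static int32_t run_predictor(LLAudDSPContext *dsp, int16_t *v1,
                             const int16_t *v2, const int16_t *v3,
                             int order, int mul)
{
    return dsp->scalarproduct_and_madd_int16(v1, v2, v3, order, mul);
}

Note that the SSSE3 pointer is only installed on CPUs without SSE4.2 or 3DNow;
per the "cachesplit" comment, the SSSE3 variant pays off only on parts where
cache-line-split loads are expensive.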
406
externals/ffmpeg/libavcodec/x86/lossless_videodsp.asm
vendored
Executable file
@@ -0,0 +1,406 @@
;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;* Copyright (c) 2017 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db  7
pb_ef:               times 8 db 14, 15
pb_67:               times 8 db  6,  7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1, 3, 3, 3, 3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1, 1, 1,-1,-1, 5, 5,-1,-1, 9, 9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION .text

;------------------------------------------------------------------------------
; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
;                                const uint8_t *diff, int w,
;                                int *left, int *left_top)
;------------------------------------------------------------------------------
%macro MEDIAN_PRED 0
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]
    mova    m2, m0
    movd    m4, [left_topq]
    LSHIFT  m2, 1
    mova    m1, m0
    por     m4, m2
    movd    m3, [leftq]
    psubb   m0, m4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp     .skip
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1
    por     m4, m1
    mova    m1, m0 ; t
    psubb   m0, m4 ; t-tl
.skip:
    movu    m2, [diffq+wq]
%assign i 0
%rep mmsize
    mova    m4, m0
    paddb   m4, m3 ; t-tl+l
    mova    m5, m3
    pmaxub  m3, m1
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5 ; median
    paddb   m3, m2 ; +residual
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1
%else
    mova    m6, m3
    RSHIFT  m7, 1
    LSHIFT  m6, mmsize-1
    por     m7, m6
%endif
%if i<mmsize-1
    RSHIFT  m0, 1
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu    [dstq+wq], m7
    add     wq, mmsize
    jl      .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
MEDIAN_PRED
%endif
INIT_XMM sse2
MEDIAN_PRED
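MEDIAN_PRED vectorizes HuffYUV's median predictor: each reconstructed pixel is
the median of left, top, and left + top - topleft, plus the coded residual. A
plain-C sketch of the semantics, modeled on FFmpeg's generic fallback (mid3()
stands in for FFmpeg's mid_pred()):

#include <stddef.h>
#include <stdint.h>

static int mid3(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; }   /* now a <= b */
    return b < c ? b : (a > c ? a : c);       /* median of the three */
}

static void add_median_pred_ref(uint8_t *dst, const uint8_t *top,
                                const uint8_t *diff, ptrdiff_t w,
                                int *left, int *left_top)
{
    int l = *left, tl = *left_top;

    for (ptrdiff_t i = 0; i < w; i++) {
        int t  = top[i];
        l      = (mid3(l, t, (l + t - tl) & 0xFF) + diff[i]) & 0xFF;
        tl     = t;
        dst[i] = l;
    }
    *left     = l;
    *left_top = tl;
}

The %rep mmsize block above evaluates this serial recurrence one byte lane at a
time inside the registers, which is why the macro shifts m0/m1/m2 right after
every lane.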

%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    pshufb  xm0, xm5
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    psllw   m2, m1, 8
    paddb   m1, m2
    pshufb  m2, m1, m3
    paddb   m1, m2
    pshufb  m2, m1, m4
    paddb   m1, m2
%if mmsize >= 16
    pshufb  m2, m1, m6
    paddb   m1, m2
%endif
    paddb   xm0, xm1
%if %1
    mova    [dstq+wq], xm0
%else
    movq    [dstq+wq], xm0
    movhps  [dstq+wq+8], xm0
%endif

%if mmsize == 32
    vextracti128 xm2, m1, 1 ; get the second lane of the ymm
    pshufb  xm0, xm5        ; broadcast the last value of the first lane to all bytes
    paddb   xm0, xm2
    ; store the result
%if %1
    mova    [dstq+wq+16], xm0
%else
    movq    [dstq+wq+16], xm0
    movhps  [dstq+wq+16+8], xm0
%endif
%endif
    add     wq, mmsize
    jl      %%.loop
%if mmsize == 32
    movzx   eax, byte [dstq - 1]
%else
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
%endif
    RET
%endmacro

;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
;------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_LEFT_LOOP 1, 1

%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
    mova    xm5, [pb_15]
    VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
    VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
    VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
    movd    xm0, leftm
    pslldq  xm0, 15
    test    srcq, mmsize - 1
    jnz     .src_unaligned
    test    dstq, mmsize - 1
    jnz     .dst_unaligned
    ADD_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_LEFT_LOOP 0, 0
%endmacro

INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif
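Left prediction is an inherently serial running sum. ADD_LEFT_LOOP breaks the
dependency with a log2-step shift-and-add ladder (the psllw/pshufb + paddb
pairs), which turns each register into its byte-wise prefix sum before adding
the carried-in accumulator. A plain-C sketch of the contract (illustrative
name):

#include <stddef.h>
#include <stdint.h>

/* dst[i] = (dst[i-1] + src[i]) & 0xFF, seeded with `left`; returns the
 * final accumulator so the caller can continue on the next slice. */
static int add_left_pred_ref(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t w, int left)
{
    unsigned acc = left;

    for (ptrdiff_t i = 0; i < w; i++) {
        acc    = (acc + src[i]) & 0xFF;
        dst[i] = acc;
    }
    return (int)acc;
}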

;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize
    jz      .2
    add     dstq, sizeq
    add     srcq, sizeq
    neg     sizeq
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova    [dstq + sizeq], m0
    mova    [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl      .1
.2:
    and     wq, 2*mmsize-1
    jz      .end
    add     dstq, wq
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]
    add     [dstq + wq], sizeb
    inc     wq
    jl      .3
.end:
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES
%endif
INIT_XMM sse2
ADD_BYTES

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_BYTES
%endif
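ff_add_bytes is a plain elementwise byte sum; the loop above processes two
registers per iteration and finishes the tail byte-by-byte in .3. A plain-C
sketch of the contract (illustrative name):

#include <stddef.h>
#include <stdint.h>

static void add_bytes_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t w)
{
    for (ptrdiff_t i = 0; i < w; i++)
        dst[i] += src[i];   /* byte-wise wrap-around addition */
}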

%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2
    pshufb  m0, m5
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl      %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;---------------------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz     .src_unaligned
    test    dstq, 15
    jnz     .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u
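The int16 variant applies the same prefix-sum idea on 16-bit lanes, with `mask`
(broadcast into m7 and applied by pand) clamping each sum to the stream's bit
depth. A plain-C sketch (illustrative name):

#include <stdint.h>

static unsigned add_left_pred_int16_ref(uint16_t *dst, const uint16_t *src,
                                        unsigned mask, int w, unsigned acc)
{
    for (int i = 0; i < w; i++) {
        acc    = (acc + src[i]) & mask;  /* mask = (1 << bit_depth) - 1 */
        dst[i] = acc;
    }
    return acc;
}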

;---------------------------------------------------------------------------------------------
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
;---------------------------------------------------------------------------------------------
%macro ADD_GRADIENT_PRED 0
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
    mova    xm0, [pb_15]

    ; load src[-1] into xm1 and broadcast it to all bytes
    movd    xm1, [srcq-1]
%if cpuflag(avx2)
    vpbroadcastb xm1, xm1
%else
    pxor    xm2, xm2
    pshufb  xm1, xm2
%endif

    add     srcq, widthq
    neg     widthq
    neg     strideq

.loop:
    lea     tmpq, [srcq + strideq]
    mova    m2, [tmpq + widthq]     ; A = src[x - stride]
    movu    m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
    mova    m4, [srcq + widthq]     ; current value (src[x])

    psubb   m2, m3                  ; A - B

    ; prefix sum of A - B
    pslldq  m3, m2, 1
    paddb   m2, m3
    pslldq  m3, m2, 2
    paddb   m2, m3
    pslldq  m3, m2, 4
    paddb   m2, m3
    pslldq  m3, m2, 8
    paddb   m2, m3

    ; prefix sum of the current values
    pslldq  m3, m4, 1
    paddb   m4, m3
    pslldq  m3, m4, 2
    paddb   m4, m3
    pslldq  m3, m4, 4
    paddb   m4, m3
    pslldq  m3, m4, 8
    paddb   m4, m3

    ; final sum
    paddb   m2, m4                  ; current + (A - B)

    paddb   xm1, xm2                ; += C
    mova    [srcq + widthq], xm1    ; store

    pshufb  xm1, xm0                ; broadcast the last value to all bytes of xm1

%if mmsize == 32
    vextracti128 xm2, m2, 1         ; get the second lane of the ymm
    paddb   xm1, xm2                ; += C

    mova    [srcq + widthq + 16], xm1 ; store
    pshufb  xm1, xm0                ; broadcast the last value to all bytes of xm1
%endif

    add     widthq, mmsize
    jl      .loop
    RET

%endmacro

INIT_XMM ssse3
ADD_GRADIENT_PRED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_GRADIENT_PRED
%endif
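add_gradient_pred reconstructs a row coded against the gradient predictor
left + top - topleft. The vector version rewrites the recurrence as a running
left-sum of (A - B) added to the preceding pixel, reusing the prefix-sum
ladder from the left predictor. A plain-C sketch of the semantics, modeled on
FFmpeg's generic fallback (the row above and src[-1] must be readable):

#include <stddef.h>
#include <stdint.h>

static void add_gradient_pred_ref(uint8_t *src, ptrdiff_t stride,
                                  ptrdiff_t width)
{
    for (ptrdiff_t i = 0; i < width; i++) {
        int A = src[i - stride];       /* top      */
        int B = src[i - stride - 1];   /* top-left */
        int C = src[i - 1];            /* left     */
        src[i] = (A - B + C + src[i]) & 0xFF;
    }
}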
128
externals/ffmpeg/libavcodec/x86/lossless_videodsp_init.c
vendored
Executable file
@@ -0,0 +1,128 @@
/*
 * Lossless video DSP utils
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/x86/asm.h"
#include "../lossless_videodsp.h"
#include "libavutil/x86/cpu.h"

void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);

void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
                               const uint8_t *diff, ptrdiff_t w,
                               int *left, int *left_top);
void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
                             const uint8_t *diff, ptrdiff_t w,
                             int *left, int *left_top);

int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
                           ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t w, int left);

int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);

void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);

#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                 const uint8_t *diff, ptrdiff_t w,
                                 int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3           \n"
        "1:                            \n"
        "movzbl (%3, %4), %2           \n"
        "mov          %2, %k3          \n"
        "sub         %b1, %b3          \n"
        "add         %b0, %b3          \n"
        "mov          %2, %1           \n"
        "cmp          %0, %2           \n"
        "cmovg        %0, %2           \n"
        "cmovg        %1, %0           \n"
        "cmp         %k3, %0           \n"
        "cmovg       %k3, %0           \n"
        "mov          %7, %3           \n"
        "cmp          %2, %0           \n"
        "cmovl        %2, %0           \n"
        "add    (%6, %4), %b0          \n"
        "mov         %b0, (%5, %4)     \n"
        "inc          %4               \n"
        "jl           1b               \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
    );
    *left     = l;
    *left_top = tl;
}
#endif

void ff_llviddsp_init_x86(LLVidDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
    if (cpu_flags & AV_CPU_FLAG_CMOV)
        c->add_median_pred = add_median_pred_cmov;
#endif

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->add_bytes = ff_add_bytes_mmx;
    }

    if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
        /* slower than the cmov version on AMD */
        if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
            c->add_median_pred = ff_add_median_pred_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->add_bytes       = ff_add_bytes_sse2;
        c->add_median_pred = ff_add_median_pred_sse2;
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->add_left_pred       = ff_add_left_pred_ssse3;
        c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
    }

    if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
        c->add_left_pred       = ff_add_left_pred_unaligned_ssse3;
        c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->add_bytes         = ff_add_bytes_avx2;
        c->add_left_pred     = ff_add_left_pred_unaligned_avx2;
        c->add_gradient_pred = ff_add_gradient_pred_avx2;
    }
}
194
externals/ffmpeg/libavcodec/x86/lossless_videoencdsp.asm
vendored
Executable file
@@ -0,0 +1,194 @@
;******************************************************************************
;* SIMD-optimized lossless video encoding functions
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pb_80

SECTION .text

; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                    intptr_t w);
%macro DIFF_BYTES_PROLOGUE 0
%if ARCH_X86_32
cglobal diff_bytes, 3,5,2, dst, src1, src2
%define wq r4q
DECLARE_REG_TMP 3
    mov     wq, r3mp
%else
cglobal diff_bytes, 4,5,2, dst, src1, src2, w
DECLARE_REG_TMP 4
%endif ; ARCH_X86_32
%define i t0q
%endmacro

; %1 = label to jump to if w < 2 * regsize, %2 = label to jump to if w < 0
%macro DIFF_BYTES_LOOP_PREP 2
    mov     i, wq
    and     i, -2 * regsize
    js      %2
    jz      %1
    add     dstq, i
    add     src1q, i
    add     src2q, i
    neg     i
%endmacro

; mov type used for src1q, dstq, first reg, second reg
%macro DIFF_BYTES_LOOP_CORE 4
%if mmsize != 16
    mov%1   %3, [src1q + i]
    mov%1   %4, [src1q + i + regsize]
    psubb   %3, [src2q + i]
    psubb   %4, [src2q + i + regsize]
    mov%2   [dstq + i], %3
    mov%2   [regsize + dstq + i], %4
%else
    ; SSE enforces alignment of the psubb operand
    mov%1   %3, [src1q + i]
    movu    %4, [src2q + i]
    psubb   %3, %4
    mov%2   [dstq + i], %3
    mov%1   %3, [src1q + i + regsize]
    movu    %4, [src2q + i + regsize]
    psubb   %3, %4
    mov%2   [regsize + dstq + i], %3
%endif
%endmacro

%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
%define regsize mmsize
.loop_%1%2:
    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
    add     i, 2 * regsize
    jl      .loop_%1%2
.skip_main_%1%2:
    and     wq, 2 * regsize - 1
    jz      .end_%1%2
%if mmsize > 16
    ; fall back to narrower xmm
%define regsize (mmsize / 2)
    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa, .end_aa
.loop2_%1%2:
    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
    add     i, 2 * regsize
    jl      .loop2_%1%2
.setup_loop_gpr_%1%2:
    and     wq, 2 * regsize - 1
    jz      .end_%1%2
%endif
    add     dstq, wq
    add     src1q, wq
    add     src2q, wq
    neg     wq
.loop_gpr_%1%2:
    mov     t0b, [src1q + wq]
    sub     t0b, [src2q + wq]
    mov     [dstq + wq], t0b
    inc     wq
    jl      .loop_gpr_%1%2
.end_%1%2:
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
DIFF_BYTES_PROLOGUE
%define regsize mmsize
    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
    DIFF_BYTES_BODY a, a
%undef i
%endif

INIT_XMM sse2
DIFF_BYTES_PROLOGUE
%define regsize mmsize
    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
    test    dstq, regsize - 1
    jnz     .loop_uu
    test    src1q, regsize - 1
    jnz     .loop_ua
    DIFF_BYTES_BODY a, a
    DIFF_BYTES_BODY u, a
    DIFF_BYTES_BODY u, u
%undef i

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DIFF_BYTES_PROLOGUE
%define regsize mmsize
    ; Directly using the unaligned SSE2 version is marginally faster than
    ; branching based on the arguments.
    DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
    test    dstq, regsize - 1
    jnz     .loop_uu
    test    src1q, regsize - 1
    jnz     .loop_ua
    DIFF_BYTES_BODY a, a
    DIFF_BYTES_BODY u, a
    DIFF_BYTES_BODY u, u
%undef i
%endif
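ff_diff_bytes is the encoder-side inverse of add_bytes. A plain-C sketch of
the contract (illustrative name):

#include <stdint.h>

static void diff_bytes_ref(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, intptr_t w)
{
    for (intptr_t i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];   /* byte-wise wrap-around subtraction */
}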

;--------------------------------------------------------------------------------------------------
; void sub_left_predict(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height)
;--------------------------------------------------------------------------------------------------

INIT_XMM avx
cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
    mova    m1, [pb_80]            ; initial predictor
    add     dstq, widthq
    add     srcq, widthq
    lea     xd, [widthq-1]
    neg     widthq
    and     xd, 15
    pinsrb  m4, m1, xd, 15
    mov     xq, widthq

.loop:
    movu    m0, [srcq + widthq]
    palignr m2, m0, m1, 15
    movu    m1, [srcq + widthq + 16]
    palignr m3, m1, m0, 15
    psubb   m2, m0, m2
    psubb   m3, m1, m3
    movu    [dstq + widthq], m2
    movu    [dstq + widthq + 16], m3
    add     widthq, 2 * 16
    jl      .loop

    add     srcq, strideq
    sub     dstq, xq               ; dst + width
    test    xd, 16
    jz      .mod32
    mova    m1, m0

.mod32:
    pshufb  m1, m4
    mov     widthq, xq
    dec     heightd
    jg      .loop
    RET
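sub_left_predict is the encoder-side inverse of left prediction: the predictor
is seeded with 0x80 (the pb_80 constant loaded above) and carried across rows,
dst is written densely at width bytes per row, and src advances by stride. A
plain-C sketch, modeled on FFmpeg's generic version (illustrative name):

#include <stddef.h>
#include <stdint.h>

static void sub_left_predict_ref(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride, ptrdiff_t width, int height)
{
    uint8_t prev = 0x80;   /* initial prediction */

    for (int j = 0; j < height; j++) {
        for (ptrdiff_t i = 0; i < width; i++) {
            *dst++ = src[i] - prev;
            prev   = src[i];
        }
        src += stride;
    }
}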