early-access version 1432
externals/ffmpeg/libavcodec/aarch64/Makefile (vendored, executable file, 63 lines added)
@@ -0,0 +1,63 @@
# subsystems
OBJS-$(CONFIG_FFT)                      += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT)               += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o

# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
                                           aarch64/sbrdsp_init_aarch64.o
OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
OBJS-$(CONFIG_OPUS_DECODER)             += aarch64/opusdsp_init.o
OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_10bpp_aarch64.o \
                                           aarch64/vp9dsp_init_12bpp_aarch64.o \
                                           aarch64/vp9mc_aarch64.o \
                                           aarch64/vp9dsp_init_aarch64.o

# ARMv8 optimizations

# subsystems
ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o

# NEON optimizations

# subsystems
NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT)          += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o \
                                           aarch64/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED)            += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o \
                                           aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o

# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o \
                                           aarch64/vp9itxfm_neon.o \
                                           aarch64/vp9lpf_16bpp_neon.o \
                                           aarch64/vp9lpf_neon.o \
                                           aarch64/vp9mc_16bpp_neon.o \
                                           aarch64/vp9mc_neon.o
externals/ffmpeg/libavcodec/aarch64/aacpsdsp_init_aarch64.c (vendored, executable file, 48 lines added)
@@ -0,0 +1,48 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/aarch64/cpu.h"
#include "libavcodec/aacpsdsp.h"

void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
                                float *src1, int n);
void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
                                const float (*filter)[8][2],
                                ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
                                   float h[2][4], float h_step[2][4],
                                   int len);
void ff_ps_stereo_interpolate_ipdopd_neon(float (*l)[2], float (*r)[2],
                                          float h[2][4], float h_step[2][4],
                                          int len);

av_cold void ff_psdsp_init_aarch64(PSDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->add_squares            = ff_ps_add_squares_neon;
        s->mul_pair_single        = ff_ps_mul_pair_single_neon;
        s->hybrid_analysis        = ff_ps_hybrid_analysis_neon;
        s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_neon;
        s->stereo_interpolate[1]  = ff_ps_stereo_interpolate_ipdopd_neon;
    }
}
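This init file is an instance of FFmpeg's standard runtime-dispatch pattern: probe the CPU once, then publish the fastest routine through function pointers so per-sample calls pay no feature check. A minimal self-contained C sketch of the idea (all names here are illustrative, not FFmpeg API; the "neon" function stands in for the assembly):

    typedef struct MyDSPContext {
        void (*vector_acc)(float *dst, const float *src, int n);
    } MyDSPContext;

    static void vector_acc_c(float *dst, const float *src, int n)
    {
        for (int i = 0; i < n; i++)      /* portable fallback */
            dst[i] += src[i];
    }

    static void vector_acc_neon(float *dst, const float *src, int n)
    {
        for (int i = 0; i < n; i++)      /* stand-in for the SIMD version */
            dst[i] += src[i];
    }

    void my_dsp_init(MyDSPContext *s, int cpu_has_neon)
    {
        s->vector_acc = vector_acc_c;    /* always-correct default */
        if (cpu_has_neon)
            s->vector_acc = vector_acc_neon;
    }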
externals/ffmpeg/libavcodec/aarch64/aacpsdsp_neon.S (vendored, executable file, 148 lines added)
@@ -0,0 +1,148 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_ps_add_squares_neon, export=1
1:      ld1         {v0.4S,v1.4S}, [x1], #32
        fmul        v0.4S, v0.4S, v0.4S
        fmul        v1.4S, v1.4S, v1.4S
        faddp       v2.4S, v0.4S, v1.4S
        ld1         {v3.4S}, [x0]
        fadd        v3.4S, v3.4S, v2.4S
        st1         {v3.4S}, [x0], #16
        subs        w2, w2, #4
        b.gt        1b
        ret
endfunc

function ff_ps_mul_pair_single_neon, export=1
1:      ld1         {v0.4S,v1.4S}, [x1], #32
        ld1         {v2.4S},       [x2], #16
        zip1        v3.4S, v2.4S, v2.4S
        zip2        v4.4S, v2.4S, v2.4S
        fmul        v0.4S, v0.4S, v3.4S
        fmul        v1.4S, v1.4S, v4.4S
        st1         {v0.4S,v1.4S}, [x0], #32
        subs        w3, w3, #4
        b.gt        1b
        ret
endfunc

function ff_ps_stereo_interpolate_neon, export=1
        ld1         {v0.4S}, [x2]
        ld1         {v1.4S}, [x3]
        zip1        v4.4S, v0.4S, v0.4S
        zip2        v5.4S, v0.4S, v0.4S
        zip1        v6.4S, v1.4S, v1.4S
        zip2        v7.4S, v1.4S, v1.4S
1:      ld1         {v2.2S}, [x0]
        ld1         {v3.2S}, [x1]
        fadd        v4.4S, v4.4S, v6.4S
        fadd        v5.4S, v5.4S, v7.4S
        mov         v2.D[1], v2.D[0]
        mov         v3.D[1], v3.D[0]
        fmul        v2.4S, v2.4S, v4.4S
        fmla        v2.4S, v3.4S, v5.4S
        st1         {v2.D}[0], [x0], #8
        st1         {v2.D}[1], [x1], #8
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc

function ff_ps_stereo_interpolate_ipdopd_neon, export=1
        ld1         {v0.4S,v1.4S}, [x2]
        ld1         {v6.4S,v7.4S}, [x3]
        fneg        v2.4S, v1.4S
        fneg        v3.4S, v7.4S
        zip1        v16.4S, v0.4S, v0.4S
        zip2        v17.4S, v0.4S, v0.4S
        zip1        v18.4S, v2.4S, v1.4S
        zip2        v19.4S, v2.4S, v1.4S
        zip1        v20.4S, v6.4S, v6.4S
        zip2        v21.4S, v6.4S, v6.4S
        zip1        v22.4S, v3.4S, v7.4S
        zip2        v23.4S, v3.4S, v7.4S
1:      ld1         {v2.2S}, [x0]
        ld1         {v3.2S}, [x1]
        fadd        v16.4S, v16.4S, v20.4S
        fadd        v17.4S, v17.4S, v21.4S
        mov         v2.D[1], v2.D[0]
        mov         v3.D[1], v3.D[0]
        fmul        v4.4S, v2.4S, v16.4S
        fmla        v4.4S, v3.4S, v17.4S
        fadd        v18.4S, v18.4S, v22.4S
        fadd        v19.4S, v19.4S, v23.4S
        ext         v2.16B, v2.16B, v2.16B, #4
        ext         v3.16B, v3.16B, v3.16B, #4
        fmla        v4.4S, v2.4S, v18.4S
        fmla        v4.4S, v3.4S, v19.4S
        st1         {v4.D}[0], [x0], #8
        st1         {v4.D}[1], [x1], #8
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc

function ff_ps_hybrid_analysis_neon, export=1
        lsl         x3, x3, #3
        ld2         {v0.4S,v1.4S}, [x1], #32
        ld2         {v2.2S,v3.2S}, [x1], #16
        ld1         {v24.2S},      [x1], #8
        ld2         {v4.2S,v5.2S}, [x1], #16
        ld2         {v6.4S,v7.4S}, [x1]
        rev64       v6.4S, v6.4S
        rev64       v7.4S, v7.4S
        ext         v6.16B, v6.16B, v6.16B, #8
        ext         v7.16B, v7.16B, v7.16B, #8
        rev64       v4.2S, v4.2S
        rev64       v5.2S, v5.2S
        mov         v2.D[1], v3.D[0]
        mov         v4.D[1], v5.D[0]
        mov         v5.D[1], v2.D[0]
        mov         v3.D[1], v4.D[0]
        fadd        v16.4S, v0.4S, v6.4S
        fadd        v17.4S, v1.4S, v7.4S
        fsub        v18.4S, v1.4S, v7.4S
        fsub        v19.4S, v0.4S, v6.4S
        fadd        v22.4S, v2.4S, v4.4S
        fsub        v23.4S, v5.4S, v3.4S
        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
1:      ld2         {v2.4S,v3.4S}, [x2], #32
        ld2         {v4.2S,v5.2S}, [x2], #16
        ld1         {v6.2S},       [x2], #8
        add         x2, x2, #8
        mov         v4.D[1], v5.D[0]
        mov         v6.S[1], v6.S[0]
        fmul        v6.2S, v6.2S, v24.2S
        fmul        v0.4S, v2.4S, v16.4S
        fmul        v1.4S, v2.4S, v17.4S
        fmls        v0.4S, v3.4S, v18.4S
        fmla        v1.4S, v3.4S, v19.4S
        fmla        v0.4S, v4.4S, v20.4S
        fmla        v1.4S, v4.4S, v21.4S
        faddp       v0.4S, v0.4S, v1.4S
        faddp       v0.4S, v0.4S, v0.4S
        fadd        v0.2S, v0.2S, v6.2S
        st1         {v0.2S}, [x0], x3
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc
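The NEON above is easier to read against a scalar statement of what each routine computes. A hedged plain-C reference for the first two functions, with semantics inferred from the vector code (add_squares accumulates squared magnitudes four complex samples at a time via fmul/faddp; mul_pair_single scales each complex sample by a real coefficient via the zip1/zip2 duplication):

    static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
    }

    static void ps_mul_pair_single_ref(float (*dst)[2], float (*src0)[2],
                                       float *src1, int n)
    {
        for (int i = 0; i < n; i++) {
            dst[i][0] = src0[i][0] * src1[i];   /* real part  */
            dst[i][1] = src0[i][1] * src1[i];   /* imag part  */
        }
    }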
externals/ffmpeg/libavcodec/aarch64/asm-offsets.h (vendored, executable file, 25 lines added)
@@ -0,0 +1,25 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_ASM_OFFSETS_H
#define AVCODEC_AARCH64_ASM_OFFSETS_H

/* FFTContext */
#define IMDCT_HALF                      0x48

#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
externals/ffmpeg/libavcodec/aarch64/cabac.h (vendored, executable file, 104 lines added)
@@ -0,0 +1,104 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_CABAC_H
#define AVCODEC_AARCH64_CABAC_H

#include "config.h"
#if HAVE_INLINE_ASM

#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"

#define get_cabac_inline get_cabac_inline_aarch64
static av_always_inline int get_cabac_inline_aarch64(CABACContext *c,
                                                     uint8_t *const state)
{
    int bit;
    void *reg_a, *reg_b, *reg_c, *tmp;

    __asm__ volatile(
        "ldrb  %w[bit]       , [%[state]]               \n\t"
        "add   %[r_b]        , %[tables]   , %[lps_off] \n\t"
        "mov   %w[tmp]       , %w[range]                \n\t"
        "and   %w[range]     , %w[range]   , #0xC0      \n\t"
        "lsl   %w[r_c]       , %w[range]   , #1         \n\t"
        "add   %[r_b]        , %[r_b]      , %w[bit], UXTW \n\t"
        "ldrb  %w[range]     , [%[r_b], %w[r_c], SXTW]  \n\t"
        "sub   %w[r_c]       , %w[tmp]     , %w[range]  \n\t"
        "lsl   %w[tmp]       , %w[r_c]     , #17        \n\t"
        "cmp   %w[tmp]       , %w[low]                  \n\t"
        "csel  %w[tmp]       , %w[tmp]     , wzr  , cc  \n\t"
        "csel  %w[range]     , %w[r_c]     , %w[range], gt \n\t"
        "cinv  %w[bit]       , %w[bit]     , cc         \n\t"
        "sub   %w[low]       , %w[low]     , %w[tmp]    \n\t"
        "add   %[r_b]        , %[tables]   , %[norm_off] \n\t"
        "add   %[r_a]        , %[tables]   , %[mlps_off] \n\t"
        "ldrb  %w[tmp]       , [%[r_b], %w[range], SXTW] \n\t"
        "ldrb  %w[r_a]       , [%[r_a], %w[bit], SXTW]  \n\t"
        "lsl   %w[low]       , %w[low]     , %w[tmp]    \n\t"
        "lsl   %w[range]     , %w[range]   , %w[tmp]    \n\t"
        "uxth  %w[r_c]       , %w[low]                  \n\t"
        "strb  %w[r_a]       , [%[state]]               \n\t"
        "cbnz  %w[r_c]       , 2f                       \n\t"
        "ldr   %[r_c]        , [%[c], %[byte]]          \n\t"
        "ldr   %[r_a]        , [%[c], %[end]]           \n\t"
        "ldrh  %w[tmp]       , [%[r_c]]                 \n\t"
        "cmp   %[r_c]        , %[r_a]                   \n\t"
        "b.ge  1f                                       \n\t"
        "add   %[r_a]        , %[r_c]      , #2         \n\t"
        "str   %[r_a]        , [%[c], %[byte]]          \n\t"
        "1:                                             \n\t"
        "sub   %w[r_c]       , %w[low]     , #1         \n\t"
        "eor   %w[r_c]       , %w[r_c]     , %w[low]    \n\t"
        "rev   %w[tmp]       , %w[tmp]                  \n\t"
        "lsr   %w[r_c]       , %w[r_c]     , #15        \n\t"
        "lsr   %w[tmp]       , %w[tmp]     , #15        \n\t"
        "ldrb  %w[r_c]       , [%[r_b], %w[r_c], SXTW]  \n\t"
        "mov   %w[r_b]       , #0xFFFF                  \n\t"
        "mov   %w[r_a]       , #7                       \n\t"
        "sub   %w[tmp]       , %w[tmp]     , %w[r_b]    \n\t"
        "sub   %w[r_c]       , %w[r_a]     , %w[r_c]    \n\t"
        "lsl   %w[tmp]       , %w[tmp]     , %w[r_c]    \n\t"
        "add   %w[low]       , %w[low]     , %w[tmp]    \n\t"
        "2:                                             \n\t"
        :    [bit]"=&r"(bit),
             [low]"+&r"(c->low),
             [range]"+&r"(c->range),
             [r_a]"=&r"(reg_a),
             [r_b]"=&r"(reg_b),
             [r_c]"=&r"(reg_c),
             [tmp]"=&r"(tmp)
        :    [c]"r"(c),
             [state]"r"(state),
             [tables]"r"(ff_h264_cabac_tables),
             [byte]"i"(offsetof(CABACContext, bytestream)),
             [end]"i"(offsetof(CABACContext, bytestream_end)),
             [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
             [lps_off]"I"(H264_LPS_RANGE_OFFSET),
             [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
        :    "memory", "cc"
    );

    return bit & 1;
}

#endif /* HAVE_INLINE_ASM */

#endif /* AVCODEC_AARCH64_CABAC_H */
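The line `#define get_cabac_inline get_cabac_inline_aarch64` is what wires this header in: the generic CABAC code defines a portable fallback only if no architecture header has already claimed the name. A self-contained sketch of that override mechanism, with stand-in decoder bodies (illustrative names, not the real CABAC arithmetic):

    /* arch header, included first, claims the name */
    static inline int get_cabac_inline_fast(unsigned char *state)
    {
        return *state & 1;              /* stand-in for the asm version */
    }
    #define get_cabac_inline get_cabac_inline_fast

    /* generic header: fallback is defined only when nothing claimed it */
    #ifndef get_cabac_inline
    static inline int get_cabac_inline_c(unsigned char *state)
    {
        return *state & 1;              /* stand-in for the C version */
    }
    #define get_cabac_inline get_cabac_inline_c
    #endif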
externals/ffmpeg/libavcodec/aarch64/fft_init_aarch64.c (vendored, executable file, 50 lines added)
@@ -0,0 +1,50 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"

#include "libavcodec/fft.h"

void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);

void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);

av_cold void ff_fft_init_aarch64(FFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->fft_permute      = ff_fft_permute_neon;
        s->fft_calc         = ff_fft_calc_neon;
#if CONFIG_MDCT
        s->imdct_calc       = ff_imdct_calc_neon;
        s->imdct_half       = ff_imdct_half_neon;
        s->mdct_calc        = ff_mdct_calc_neon;
        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
    }
}
externals/ffmpeg/libavcodec/aarch64/fft_neon.S (vendored, executable file, 442 lines added)
@@ -0,0 +1,442 @@
/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define M_SQRT1_2 0.70710678118654752440

.macro  transpose d0, d1, s0, s1
        trn1        \d0, \s0, \s1
        trn2        \d1, \s0, \s1
.endm


function fft4_neon
        ld1         {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        fadd        v4.2s, v0.2s, v1.2s         // r0+r1,i0+i1
        fsub        v6.2s, v0.2s, v1.2s         // r0-r1,i0-i1

        ext         v16.8b, v2.8b, v3.8b, #4
        ext         v17.8b, v3.8b, v2.8b, #4

        fadd        v5.2s, v2.2s, v3.2s         // i2+i3,r2+r3
        fsub        v7.2s, v16.2s, v17.2s       // r3-r2,i2-i3

        fadd        v0.2s, v4.2s, v5.2s
        fsub        v2.2s, v4.2s, v5.2s
        fadd        v1.2s, v6.2s, v7.2s
        fsub        v3.2s, v6.2s, v7.2s

        st1         {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

function fft8_neon
        mov         x1, x0
        ld1         {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1         {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        ext         v22.8b, v2.8b, v3.8b, #4
        ext         v23.8b, v3.8b, v2.8b, #4
        fadd        v4.2s, v16.2s, v17.2s       // r4+r5,i4+i5
        fadd        v5.2s, v18.2s, v19.2s       // r6+r7,i6+i7
        fsub        v17.2s, v16.2s, v17.2s      // r4-r5,i4-i5
        fsub        v19.2s, v18.2s, v19.2s      // r6-r7,i6-i7
        rev64       v27.2s, v28.2s              // ???
        fadd        v20.2s, v0.2s, v1.2s        // r0+r1,i0+i1
        fadd        v21.2s, v2.2s, v3.2s        // r2+r3,i2+i3
        fmul        v26.2s, v17.2s, v28.2s      // -a2r*w,a2i*w
        ext         v6.8b, v4.8b, v5.8b, #4
        ext         v7.8b, v5.8b, v4.8b, #4
        fmul        v27.2s, v19.2s, v27.2s      // a3r*w,-a3i*w
        fsub        v23.2s, v22.2s, v23.2s      // i2-i3,r3-r2
        fsub        v22.2s, v0.2s, v1.2s        // r0-r1,i0-i1
        fmul        v24.2s, v17.2s, v28.s[1]    // a2r*w,a2i*w
        fmul        v25.2s, v19.2s, v28.s[1]    // a3r*w,a3i*w
        fadd        v0.2s, v20.2s, v21.2s
        fsub        v2.2s, v20.2s, v21.2s
        fadd        v1.2s, v22.2s, v23.2s
        rev64       v26.2s, v26.2s
        rev64       v27.2s, v27.2s
        fsub        v3.2s, v22.2s, v23.2s
        fsub        v6.2s, v6.2s, v7.2s
        fadd        v24.2s, v24.2s, v26.2s      // a2r+a2i,a2i-a2r t1,t2
        fadd        v25.2s, v25.2s, v27.2s      // a3r-a3i,a3i+a3r t5,t6
        fadd        v7.2s, v4.2s, v5.2s
        fsub        v18.2s, v2.2s, v6.2s
        ext         v26.8b, v24.8b, v25.8b, #4
        ext         v27.8b, v25.8b, v24.8b, #4
        fadd        v2.2s, v2.2s, v6.2s
        fsub        v16.2s, v0.2s, v7.2s
        fadd        v5.2s, v25.2s, v24.2s
        fsub        v4.2s, v26.2s, v27.2s
        fadd        v0.2s, v0.2s, v7.2s
        fsub        v17.2s, v1.2s, v5.2s
        fsub        v19.2s, v3.2s, v4.2s
        fadd        v3.2s, v3.2s, v4.2s
        fadd        v1.2s, v1.2s, v5.2s

        st1         {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        st1         {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]

        ret
endfunc

function fft16_neon
        mov         x1, x0
        ld1         {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1         {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
        ext         v22.8b, v2.8b, v3.8b, #4
        ext         v23.8b, v3.8b, v2.8b, #4
        fadd        v4.2s, v16.2s, v17.2s       // r4+r5,i4+i5
        fadd        v5.2s, v18.2s, v19.2s       // r6+r7,i6+i7
        fsub        v17.2s, v16.2s, v17.2s      // r4-r5,i4-i5
        fsub        v19.2s, v18.2s, v19.2s      // r6-r7,i6-i7
        rev64       v27.2s, v28.2s              // ???
        fadd        v20.2s, v0.2s, v1.2s        // r0+r1,i0+i1
        fadd        v21.2s, v2.2s, v3.2s        // r2+r3,i2+i3
        fmul        v26.2s, v17.2s, v28.2s      // -a2r*w,a2i*w
        ext         v6.8b, v4.8b, v5.8b, #4
        ext         v7.8b, v5.8b, v4.8b, #4
        fmul        v27.2s, v19.2s, v27.2s      // a3r*w,-a3i*w
        fsub        v23.2s, v22.2s, v23.2s      // i2-i3,r3-r2
        fsub        v22.2s, v0.2s, v1.2s        // r0-r1,i0-i1
        fmul        v24.2s, v17.2s, v28.s[1]    // a2r*w,a2i*w
        fmul        v25.2s, v19.2s, v28.s[1]    // a3r*w,a3i*w
        fadd        v0.2s, v20.2s, v21.2s
        fsub        v2.2s, v20.2s, v21.2s
        fadd        v1.2s, v22.2s, v23.2s
        rev64       v26.2s, v26.2s
        rev64       v27.2s, v27.2s
        fsub        v3.2s, v22.2s, v23.2s
        fsub        v6.2s, v6.2s, v7.2s
        fadd        v24.2s, v24.2s, v26.2s      // a2r+a2i,a2i-a2r t1,t2
        fadd        v25.2s, v25.2s, v27.2s      // a3r-a3i,a3i+a3r t5,t6
        fadd        v7.2s, v4.2s, v5.2s
        fsub        v18.2s, v2.2s, v6.2s
        ld1         {v20.4s,v21.4s}, [x0], #32
        ld1         {v22.4s,v23.4s}, [x0], #32
        ext         v26.8b, v24.8b, v25.8b, #4
        ext         v27.8b, v25.8b, v24.8b, #4
        fadd        v2.2s, v2.2s, v6.2s
        fsub        v16.2s, v0.2s, v7.2s
        fadd        v5.2s, v25.2s, v24.2s
        fsub        v4.2s, v26.2s, v27.2s
        transpose   v24.2d, v25.2d, v20.2d, v22.2d
        transpose   v26.2d, v27.2d, v21.2d, v23.2d
        fadd        v0.2s, v0.2s, v7.2s
        fsub        v17.2s, v1.2s, v5.2s
        fsub        v19.2s, v3.2s, v4.2s
        fadd        v3.2s, v3.2s, v4.2s
        fadd        v1.2s, v1.2s, v5.2s
        ext         v20.16b, v21.16b, v21.16b, #4
        ext         v21.16b, v23.16b, v23.16b, #4

        zip1        v0.2d, v0.2d, v1.2d         // {z[0],   z[1]}
        zip1        v1.2d, v2.2d, v3.2d         // {z[2],   z[3]}
        zip1        v2.2d, v16.2d, v17.2d       // {z[o1],  z[o1+1]}
        zip1        v3.2d, v18.2d, v19.2d       // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose   v22.2d, v23.2d, v20.2d, v21.2d

        fadd        v4.4s, v24.4s, v25.4s
        fadd        v5.4s, v26.4s, v27.4s
        fsub        v6.4s, v24.4s, v25.4s
        fsub        v7.4s, v22.4s, v23.4s

        ld1         {v23.4s}, [x14]

        fadd        v24.4s, v4.4s, v5.4s        // {z[o2+0],z[o2+1]}
        fsub        v26.4s, v4.4s, v5.4s        // {z[o2+2],z[o2+3]}
        fadd        v25.4s, v6.4s, v7.4s        // {z[o3+0],z[o3+1]}
        fsub        v27.4s, v6.4s, v7.4s        // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64       v7.4s, v25.4s
        fmul        v25.4s, v25.4s, v23.s[1]
        fmul        v7.4s, v7.4s, v29.4s
        fmla        v25.4s, v7.4s, v23.s[3]     // {t1a,t2a,t5a,t6a}

        zip1        v20.4s, v24.4s, v25.4s
        zip2        v21.4s, v24.4s, v25.4s
        fneg        v22.4s, v20.4s
        fadd        v4.4s, v21.4s, v20.4s
        fsub        v6.4s, v20.4s, v21.4s       // just the second half
        fadd        v5.4s, v21.4s, v22.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fsub        v20.4s, v0.4s, v4.4s        // {z[o2],z[o2+1]}
        fadd        v16.4s, v0.4s, v4.4s        // {z[0], z[1]}
        fsub        v22.4s, v2.4s, v5.4s        // {z[o3],z[o3+1]}
        fadd        v18.4s, v2.4s, v5.4s        // {z[o1],z[o1+1]}

        //second half
        rev64       v6.4s, v26.4s
        fmul        v26.4s, v26.4s, v23.s[2]
        rev64       v7.4s, v27.4s
        fmul        v27.4s, v27.4s, v23.s[3]
        fmul        v6.4s, v6.4s, v29.4s
        fmul        v7.4s, v7.4s, v29.4s
        fmla        v26.4s, v6.4s, v23.s[2]     // {t1,t2,t5,t6}
        fmla        v27.4s, v7.4s, v23.s[1]     // {t1a,t2a,t5a,t6a}

        zip1        v24.4s, v26.4s, v27.4s
        zip2        v25.4s, v26.4s, v27.4s
        fneg        v26.4s, v24.4s
        fadd        v4.4s, v25.4s, v24.4s
        fsub        v6.4s, v24.4s, v25.4s       // just the second half
        fadd        v5.4s, v25.4s, v26.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fadd        v17.4s, v1.4s, v4.4s        // {z[2],   z[3]}
        fsub        v21.4s, v1.4s, v4.4s        // {z[o2+2],z[o2+3]}
        fadd        v19.4s, v3.4s, v5.4s        // {z[o1+2],z[o1+3]}
        fsub        v23.4s, v3.4s, v5.4s        // {z[o3+2],z[o3+3]}

        st1         {v16.4s,v17.4s}, [x1], #32
        st1         {v18.4s,v19.4s}, [x1], #32
        st1         {v20.4s,v21.4s}, [x1], #32
        st1         {v22.4s,v23.4s}, [x1], #32

        ret
endfunc


const   trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

const   trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

function fft_pass_neon
        sub         x6, x2, #1                  // n - 1, loop counter
        lsl         x5, x2, #3                  // 2 * n * sizeof FFTSample
        lsl         x1, x2, #4                  // 2 * n * sizeof FFTComplex
        add         x5, x4, x5                  // wim
        add         x3, x1, x2, lsl #5          // 4 * n * sizeof FFTComplex
        add         x2, x0, x2, lsl #5          // &z[o2]
        add         x3, x0, x3                  // &z[o3]
        add         x1, x0, x1                  // &z[o1]
        ld1         {v20.4s}, [x2]              // {z[o2],z[o2+1]}
        ld1         {v22.4s}, [x3]              // {z[o3],z[o3+1]}
        ld1         {v4.2s},  [x4], #8          // {wre[0],wre[1]}
        trn2        v25.2d, v20.2d, v22.2d
        sub         x5, x5, #4                  // wim--
        trn1        v24.2d, v20.2d, v22.2d
        ld1         {v5.s}[0], [x5], x7         // d5[0] = wim[-1]
        rev64       v7.4s, v25.4s
        fmul        v25.4s, v25.4s, v4.s[1]
        ld1         {v16.4s}, [x0]              // {z[0],z[1]}
        fmul        v7.4s, v7.4s, v29.4s
        ld1         {v17.4s}, [x1]              // {z[o1],z[o1+1]}
        prfm        pldl1keep, [x2, #16]
        prfm        pldl1keep, [x3, #16]
        fmla        v25.4s, v7.4s, v5.s[0]      // {t1a,t2a,t5a,t6a}
        prfm        pldl1keep, [x0, #16]
        prfm        pldl1keep, [x1, #16]

        zip1        v20.4s, v24.4s, v25.4s
        zip2        v21.4s, v24.4s, v25.4s
        fneg        v22.4s, v20.4s
        fadd        v4.4s, v21.4s, v20.4s
        fsub        v6.4s, v20.4s, v21.4s       // just the second half
        fadd        v5.4s, v21.4s, v22.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fadd        v20.4s, v16.4s, v4.4s
        fsub        v22.4s, v16.4s, v4.4s
        fadd        v21.4s, v17.4s, v5.4s
        st1         {v20.4s}, [x0], #16         // {z[0], z[1]}
        fsub        v23.4s, v17.4s, v5.4s

        st1         {v21.4s}, [x1], #16         // {z[o1],z[o1+1]}
        st1         {v22.4s}, [x2], #16         // {z[o2],z[o2+1]}
        st1         {v23.4s}, [x3], #16         // {z[o3],z[o3+1]}
1:
        ld1         {v20.4s}, [x2]              // {z[o2],z[o2+1]}
        ld1         {v22.4s}, [x3]              // {z[o3],z[o3+1]}
        ld1         {v4.2s},  [x4], #8          // {wre[0],wre[1]}
        transpose   v26.2d, v27.2d, v20.2d, v22.2d
        ld1         {v5.2s},  [x5], x7          // {wim[-1],wim[0]}
        rev64       v6.4s, v26.4s
        fmul        v26.4s, v26.4s, v4.s[0]
        rev64       v7.4s, v27.4s
        fmul        v27.4s, v27.4s, v4.s[1]
        fmul        v6.4s, v6.4s, v29.4s
        fmul        v7.4s, v7.4s, v29.4s
        ld1         {v16.4s}, [x0]              // {z[0],z[1]}
        fmla        v26.4s, v6.4s, v5.s[1]      // {t1,t2,t5,t6}
        fmla        v27.4s, v7.4s, v5.s[0]      // {t1a,t2a,t5a,t6a}
        ld1         {v17.4s}, [x1]              // {z[o1],z[o1+1]}

        subs        x6, x6, #1                  // n--

        zip1        v20.4s, v26.4s, v27.4s
        zip2        v21.4s, v26.4s, v27.4s
        fneg        v22.4s, v20.4s
        fadd        v4.4s, v21.4s, v20.4s
        fsub        v6.4s, v20.4s, v21.4s       // just the second half
        fadd        v5.4s, v21.4s, v22.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fadd        v20.4s, v16.4s, v4.4s
        fsub        v22.4s, v16.4s, v4.4s
        fadd        v21.4s, v17.4s, v5.4s
        st1         {v20.4s}, [x0], #16         // {z[0], z[1]}
        fsub        v23.4s, v17.4s, v5.4s

        st1         {v21.4s}, [x1], #16         // {z[o1],z[o1+1]}
        st1         {v22.4s}, [x2], #16         // {z[o2],z[o2+1]}
        st1         {v23.4s}, [x3], #16         // {z[o3],z[o3+1]}
        b.ne        1b

        ret
endfunc

.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        sub         sp, sp, #16
        stp         x28, x30, [sp]
        add         x28, x0, #\n4*2*8
        bl          fft\n2\()_neon
        mov         x0, x28
        bl          fft\n4\()_neon
        add         x0, x28, #\n4*1*8
        bl          fft\n4\()_neon
        sub         x0, x28, #\n4*2*8
        ldp         x28, x30, [sp], #16
        movrel      x4, X(ff_cos_\n)
        mov         x2, #\n4>>1
        b           fft_pass_neon
endfunc
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

function ff_fft_calc_neon, export=1
        prfm        pldl1keep, [x1]
        movrel      x10, trans4_float
        ldr         w2, [x0]
        movrel      x11, trans8_float
        sub         w2, w2, #2
        movrel      x3, fft_tab_neon
        ld1         {v30.16b}, [x10]
        mov         x7, #-8
        movrel      x12, pmmp
        ldr         x3, [x3, x2, lsl #3]
        movrel      x13, mppm
        movrel      x14, X(ff_cos_16)
        ld1         {v31.16b}, [x11]
        mov         x0, x1
        ld1         {v29.4s},  [x12]            // pmmp
        ld1         {v28.4s},  [x13]
        br          x3
endfunc

function ff_fft_permute_neon, export=1
        mov         x6, #1
        ldr         w2, [x0]                    // nbits
        ldr         x3, [x0, #16]               // tmp_buf
        ldr         x0, [x0, #8]                // revtab
        lsl         x6, x6, x2
        mov         x2, x6
1:
        ld1         {v0.2s,v1.2s}, [x1], #16
        ldr         w4, [x0], #4
        uxth        w5, w4
        lsr         w4, w4, #16
        add         x5, x3, x5, lsl #3
        add         x4, x3, x4, lsl #3
        st1         {v0.2s}, [x5]
        st1         {v1.2s}, [x4]
        subs        x6, x6, #2
        b.gt        1b

        sub         x1, x1, x2, lsl #3
1:
        ld1         {v0.4s,v1.4s}, [x3], #32
        st1         {v0.4s,v1.4s}, [x1], #32
        subs        x2, x2, #4
        b.gt        1b

        ret
endfunc

const   fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

const   pmmp, align=4
        .float  +1.0, -1.0, -1.0, +1.0
endconst

const   mppm, align=4
        .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst
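ff_fft_calc_neon never branches on size at runtime inside the transform: it reads nbits from the start of the FFTContext, subtracts 2, indexes fft_tab_neon, and tail-jumps (`br x3`) into a fully specialized kernel generated by `def_fft`. A hypothetical C rendering of that dispatch, with stand-in kernel bodies:

    typedef void (*fft_kernel)(float *z);

    static void fft4(float *z) { /* stand-in 1 << 2 point kernel */ }
    static void fft8(float *z) { /* stand-in 1 << 3 point kernel */ }
    /* ... the real table continues through fft65536 */

    static const fft_kernel fft_tab[] = { fft4, fft8 };

    static void fft_calc(int nbits, float *z)
    {
        fft_tab[nbits - 2](z);  /* smallest kernel handles 4 points */
    }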
externals/ffmpeg/libavcodec/aarch64/fmtconvert_init.c (vendored, executable file, 43 lines added)
@@ -0,0 +1,43 @@
/*
 * ARM optimized Format Conversion Utils
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fmtconvert.h"

void ff_int32_to_float_fmul_array8_neon(FmtConvertContext *c, float *dst,
                                        const int32_t *src, const float *mul,
                                        int len);
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
                                        float mul, int len);

av_cold void ff_fmt_convert_init_aarch64(FmtConvertContext *c,
                                         AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_neon;
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
    }
}
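The two NEON routines registered here have simple scalar definitions, which the assembly in the next file processes eight floats per iteration. A plain-C reference (semantics as in FFmpeg's fmtconvert: convert int32 samples to float and scale; the array8 variant applies one multiplier per 8-sample group, with the FmtConvertContext argument omitted here for brevity):

    #include <stdint.h>

    static void int32_to_float_fmul_scalar_ref(float *dst, const int32_t *src,
                                               float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }

    static void int32_to_float_fmul_array8_ref(float *dst, const int32_t *src,
                                               const float *mul, int len)
    {
        for (int i = 0; i < len; i += 8)          /* one scale per group */
            int32_to_float_fmul_scalar_ref(dst + i, src + i, mul[i / 8], 8);
    }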
externals/ffmpeg/libavcodec/aarch64/fmtconvert_neon.S (vendored, executable file, 76 lines added)
@@ -0,0 +1,76 @@
/*
 * ARM NEON optimised Format Conversion Utils
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/aarch64/asm.S"

function ff_int32_to_float_fmul_scalar_neon, export=1
        ld1         {v1.4s,v2.4s}, [x1], #32
        scvtf       v1.4s, v1.4s
        scvtf       v2.4s, v2.4s
1:
        subs        w2, w2, #8
        fmul        v3.4s, v1.4s, v0.s[0]
        fmul        v4.4s, v2.4s, v0.s[0]
        b.le        2f
        ld1         {v1.4s,v2.4s}, [x1], #32
        st1         {v3.4s,v4.4s}, [x0], #32
        scvtf       v1.4s, v1.4s
        scvtf       v2.4s, v2.4s
        b           1b
2:
        st1         {v3.4s,v4.4s}, [x0]
        ret
endfunc

function ff_int32_to_float_fmul_array8_neon, export=1
        lsr         w4, w4, #3
        subs        w5, w4, #1
        b.eq        1f
2:
        ld1         {v0.4s,v1.4s}, [x2], #32
        ld1         {v2.4s,v3.4s}, [x2], #32
        scvtf       v0.4s, v0.4s
        scvtf       v1.4s, v1.4s
        ld1         {v16.2s}, [x3], #8
        scvtf       v2.4s, v2.4s
        scvtf       v3.4s, v3.4s
        fmul        v4.4s, v0.4s, v16.s[0]
        fmul        v5.4s, v1.4s, v16.s[0]
        fmul        v6.4s, v2.4s, v16.s[1]
        fmul        v7.4s, v3.4s, v16.s[1]
        st1         {v4.4s,v5.4s}, [x1], #32
        st1         {v6.4s,v7.4s}, [x1], #32
        subs        w5, w5, #2
        b.gt        2b
        b.eq        1f
        ret
1:
        ld1         {v0.4s,v1.4s}, [x2]
        ld1         {v16.s}[0], [x3]
        scvtf       v0.4s, v0.4s
        scvtf       v1.4s, v1.4s
        fmul        v4.4s, v0.4s, v16.s[0]
        fmul        v5.4s, v1.4s, v16.s[0]
        st1         {v4.4s,v5.4s}, [x1]
        ret
endfunc
externals/ffmpeg/libavcodec/aarch64/h264chroma_init_aarch64.c (vendored, executable file, 59 lines added)
@@ -0,0 +1,59 @@
/*
 * ARM NEON optimised H.264 chroma functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264chroma.h"

#include "config.h"

void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
{
    const int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && !high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;

        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
    }
}
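All six mc functions implement H.264's eighth-pel bilinear chroma interpolation: the fractional offsets x and y (0..7) become four corner weights, which the assembly in the next file keeps in w4/w12/w6/w7. A scalar reference of the standard formula (the width parameter is added here for illustration; the real functions fix it at 8, 4, or 2 per table slot):

    #include <stdint.h>
    #include <stddef.h>

    static void put_chroma_mc_ref(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int w, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B = x * (8 - y);
        const int C = (8 - x) * y;
        const int D = x * y;                    /* A + B + C + D == 64 */

        for (int j = 0; j < h; j++) {
            for (int i = 0; i < w; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          32) >> 6;             /* round, divide by 64 */
            dst += stride;
            src += stride;
        }
    }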
externals/ffmpeg/libavcodec/aarch64/h264cmc_neon.S (vendored, executable file, 450 lines added)
@@ -0,0 +1,450 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov         x8, x0
  .endif
        prfm        pldl1strm, [x1]
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel      x6, rv40bias
        lsr         w9, w5, #1
        lsr         w10, w4, #1
        lsl         w9, w9, #3
        lsl         w10, w10, #1
        add         w9, w9, w10
        add         x6, x6, w9, UXTW
        ld1r        {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi        v22.8H, #28
  .endif
        mul         w7, w4, w5
        lsl         w14, w5, #3
        lsl         w13, w4, #3
        cmp         w7, #0
        sub         w6, w14, w7
        sub         w12, w13, w7
        sub         w4, w7, w13
        sub         w4, w4, w14
        add         w4, w4, #64
        b.eq        2f

        dup         v0.8B, w4
        dup         v1.8B, w12
        ld1         {v4.8B, v5.8B}, [x1], x2
        dup         v2.8B, w6
        dup         v3.8B, w7
        ext         v5.8B, v4.8B, v5.8B, #1
1:      ld1         {v6.8B, v7.8B}, [x1], x2
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v5.8B, v1.8B
        ext         v7.8B, v6.8B, v7.8B, #1
        ld1         {v4.8B, v5.8B}, [x1], x2
        umlal       v16.8H, v6.8B, v2.8B
        prfm        pldl1strm, [x1]
        ext         v5.8B, v4.8B, v5.8B, #1
        umlal       v16.8H, v7.8B, v3.8B
        umull       v17.8H, v6.8B, v0.8B
        subs        w3, w3, #2
        umlal       v17.8H, v7.8B, v1.8B
        umlal       v17.8H, v4.8B, v2.8B
        umlal       v17.8H, v5.8B, v3.8B
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        1b
        ret

2:      adds        w12, w12, w6
        dup         v0.8B, w4
        b.eq        5f
        tst         w6, w6
        dup         v1.8B, w12
        b.eq        4f

        ld1         {v4.8B}, [x1], x2
3:      ld1         {v6.8B}, [x1], x2
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v6.8B, v1.8B
        ld1         {v4.8B}, [x1], x2
        umull       v17.8H, v6.8B, v0.8B
        umlal       v17.8H, v4.8B, v1.8B
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
        prfm        pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        subs        w3, w3, #2
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        3b
        ret

4:      ld1         {v4.8B, v5.8B}, [x1], x2
        ld1         {v6.8B, v7.8B}, [x1], x2
        ext         v5.8B, v4.8B, v5.8B, #1
        ext         v7.8B, v6.8B, v7.8B, #1
        prfm        pldl1strm, [x1]
        subs        w3, w3, #2
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v5.8B, v1.8B
        umull       v17.8H, v6.8B, v0.8B
        umlal       v17.8H, v7.8B, v1.8B
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        4b
        ret

5:      ld1         {v4.8B}, [x1], x2
        ld1         {v5.8B}, [x1], x2
        prfm        pldl1strm, [x1]
        subs        w3, w3, #2
        umull       v16.8H, v4.8B, v0.8B
        umull       v17.8H, v5.8B, v0.8B
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov         x8, x0
  .endif
        prfm        pldl1strm, [x1]
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel      x6, rv40bias
        lsr         w9, w5, #1
        lsr         w10, w4, #1
        lsl         w9, w9, #3
        lsl         w10, w10, #1
        add         w9, w9, w10
        add         x6, x6, w9, UXTW
        ld1r        {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi        v22.8H, #28
  .endif
        mul         w7, w4, w5
        lsl         w14, w5, #3
        lsl         w13, w4, #3
        cmp         w7, #0
        sub         w6, w14, w7
        sub         w12, w13, w7
        sub         w4, w7, w13
        sub         w4, w4, w14
        add         w4, w4, #64
        b.eq        2f

        dup         v24.8B, w4
        dup         v25.8B, w12
        ld1         {v4.8B}, [x1], x2
        dup         v26.8B, w6
        dup         v27.8B, w7
        ext         v5.8B, v4.8B, v5.8B, #1
        trn1        v0.2S, v24.2S, v25.2S
        trn1        v2.2S, v26.2S, v27.2S
        trn1        v4.2S, v4.2S, v5.2S
1:      ld1         {v6.8B}, [x1], x2
        ext         v7.8B, v6.8B, v7.8B, #1
        trn1        v6.2S, v6.2S, v7.2S
        umull       v18.8H, v4.8B, v0.8B
        umlal       v18.8H, v6.8B, v2.8B
        ld1         {v4.8B}, [x1], x2
        ext         v5.8B, v4.8B, v5.8B, #1
        trn1        v4.2S, v4.2S, v5.2S
        prfm        pldl1strm, [x1]
        umull       v19.8H, v6.8B, v0.8B
        umlal       v19.8H, v4.8B, v2.8B
        trn1        v30.2D, v18.2D, v19.2D
        trn2        v31.2D, v18.2D, v19.2D
        add         v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
        subs        w3, w3, #2
        prfm        pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        1b
        ret

2:      adds        w12, w12, w6
        dup         v30.8B, w4
        b.eq        5f
        tst         w6, w6
        dup         v31.8B, w12
        trn1        v0.2S, v30.2S, v31.2S
        trn2        v1.2S, v30.2S, v31.2S
        b.eq        4f

        ext         v1.8B, v0.8B, v1.8B, #4
        ld1         {v4.S}[0], [x1], x2
3:      ld1         {v4.S}[1], [x1], x2
        umull       v18.8H, v4.8B, v0.8B
        ld1         {v4.S}[0], [x1], x2
        umull       v19.8H, v4.8B, v1.8B
        trn1        v30.2D, v18.2D, v19.2D
        trn2        v31.2D, v18.2D, v19.2D
        add         v18.8H, v30.8H, v31.8H
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        subs        w3, w3, #2
        prfm        pldl1strm, [x1, x2]
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        3b
        ret

4:      ld1         {v4.8B}, [x1], x2
        ld1         {v6.8B}, [x1], x2
        ext         v5.8B, v4.8B, v5.8B, #1
        ext         v7.8B, v6.8B, v7.8B, #1
        trn1        v4.2S, v4.2S, v5.2S
        trn1        v6.2S, v6.2S, v7.2S
        umull       v18.8H, v4.8B, v0.8B
        umull       v19.8H, v6.8B, v0.8B
        subs        w3, w3, #2
        trn1        v30.2D, v18.2D, v19.2D
        trn2        v31.2D, v18.2D, v19.2D
        add         v18.8H, v30.8H, v31.8H
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        prfm        pldl1strm, [x1]
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        4b
        ret

5:      ld1         {v4.S}[0], [x1], x2
        ld1         {v4.S}[1], [x1], x2
        umull       v18.8H, v4.8B, v30.8B
        subs        w3, w3, #2
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        prfm        pldl1strm, [x1]
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        5b
        ret
endfunc
.endm

.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm        pldl1strm, [x1]
        prfm        pldl1strm, [x1, x2]
        orr         w7, w4, w5
        cbz         w7, 2f

        mul         w7, w4, w5
        lsl         w14, w5, #3
        lsl         w13, w4, #3
        sub         w6, w14, w7
        sub         w12, w13, w7
        sub         w4, w7, w13
        sub         w4, w4, w14
        add         w4, w4, #64
        dup         v0.8B, w4
        dup         v2.8B, w12
        dup         v1.8B, w6
        dup         v3.8B, w7
        trn1        v0.4H, v0.4H, v2.4H
        trn1        v1.4H, v1.4H, v3.4H
1:
        ld1         {v4.S}[0], [x1], x2
        ld1         {v4.S}[1], [x1], x2
        rev64       v5.2S, v4.2S
        ld1         {v5.S}[1], [x1]
        ext         v6.8B, v4.8B, v5.8B, #1
        ext         v7.8B, v5.8B, v4.8B, #1
        trn1        v4.4H, v4.4H, v6.4H
        trn1        v5.4H, v5.4H, v7.4H
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v5.8B, v1.8B
  .ifc \type,avg
        ld1         {v18.H}[0], [x0], x2
        ld1         {v18.H}[2], [x0]
        sub         x0, x0, x2
  .endif
        rev64       v17.4S, v16.4S
        add         v16.8H, v16.8H, v17.8H
        rshrn       v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd      v16.8B, v16.8B, v18.8B
  .endif
        st1         {v16.H}[0], [x0], x2
        st1         {v16.H}[2], [x0], x2
        subs        w3, w3, #2
        b.gt        1b
        ret

2:
        ld1         {v16.H}[0], [x1], x2
        ld1         {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1         {v18.H}[0], [x0], x2
        ld1         {v18.H}[1], [x0]
        sub         x0, x0, x2
        urhadd      v16.8B, v16.8B, v18.8B
  .endif
        st1         {v16.H}[0], [x0], x2
        st1         {v16.H}[1], [x0], x2
        subs        w3, w3, #2
        b.gt        2b
        ret
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif
externals/ffmpeg/libavcodec/aarch64/h264dsp_init_aarch64.c (vendored, executable file, 129 lines added)
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264dsp.h"

void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                     int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                     int beta, int8_t *tc0);
void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                           int beta);
void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                           int beta);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                       int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                       int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                          int beta, int8_t *tc0);
void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta);
void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta);
void ff_h264_h_loop_filter_chroma422_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta);
void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta);

void ff_weight_h264_pixels_16_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                   int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                  int log2_den, int weight, int offset);
void ff_weight_h264_pixels_4_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                  int log2_den, int weight, int offset);

void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);

void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
                             int16_t *block, int stride,
                             const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
                                  int16_t *block, int stride,
                                  const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
                            int16_t *block, int stride,
                            const uint8_t nnzc[6*8]);

void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
                             int16_t *block, int stride,
                             const uint8_t nnzc[6*8]);

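/*
 * Wires the NEON routines above into the H264DSPContext dispatch table.
 * Only 8-bit depth has NEON implementations here; chroma_format_idc
 * selects between the 4:2:0 and 4:2:2 horizontal chroma filters.
 */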
av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
                                     const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && bit_depth == 8) {
        c->h264_v_loop_filter_luma       = ff_h264_v_loop_filter_luma_neon;
        c->h264_h_loop_filter_luma       = ff_h264_h_loop_filter_luma_neon;
        c->h264_v_loop_filter_luma_intra = ff_h264_v_loop_filter_luma_intra_neon;
        c->h264_h_loop_filter_luma_intra = ff_h264_h_loop_filter_luma_intra_neon;

        c->h264_v_loop_filter_chroma       = ff_h264_v_loop_filter_chroma_neon;
        c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;

        if (chroma_format_idc <= 1) {
            c->h264_h_loop_filter_chroma             = ff_h264_h_loop_filter_chroma_neon;
            c->h264_h_loop_filter_chroma_intra       = ff_h264_h_loop_filter_chroma_intra_neon;
            c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
        } else {
            c->h264_h_loop_filter_chroma             = ff_h264_h_loop_filter_chroma422_neon;
            c->h264_h_loop_filter_chroma_mbaff       = ff_h264_h_loop_filter_chroma_neon;
            c->h264_h_loop_filter_chroma_intra       = ff_h264_h_loop_filter_chroma422_intra_neon;
            c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon;
        }

        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;

        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;

        c->h264_idct_add        = ff_h264_idct_add_neon;
        c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
        c->h264_idct_add16      = ff_h264_idct_add16_neon;
        c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
        if (chroma_format_idc <= 1)
            c->h264_idct_add8   = ff_h264_idct_add8_neon;
        c->h264_idct8_add       = ff_h264_idct8_add_neon;
        c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
        c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
    }
}
829
externals/ffmpeg/libavcodec/aarch64/h264dsp_neon.S
vendored
Executable file
@@ -0,0 +1,829 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

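// Common prologue for the tc0-based deblocking functions: w2/w3 hold
// alpha/beta and x4 points at the four int8_t tc0 values. Returns early
// when alpha or beta is zero, or when all tc0 values are negative
// (nothing to filter); otherwise falls through with tc0 loaded into v24.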
.macro h264_loop_filter_start
        cmp     w2,  #0
        ldr     w6,  [x4]
        ccmp    w3,  #0,  #0,  ne
        mov     v24.S[0], w6
        and     w8,  w6,  w6,  lsl #16
        b.eq    1f
        ands    w8,  w8,  w8,  lsl #8
        b.ge    2f
1:
        ret
2:
.endm

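// Normal (tc0-clipped) luma deblock on 16 columns at once: builds the
// |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta masks, clips the filter
// delta to the per-edge tc bounds expanded from v24, and leaves the
// filtered p1/p0/q0/q1 rows in v17/v16/v0/v19.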
.macro h264_loop_filter_luma
        dup     v22.16B, w2                     // alpha
        uxtl    v24.8H,  v24.8B
        uabd    v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl    v24.4S,  v24.4H
        uabd    v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli     v24.8H,  v24.8H,  #8
        uabd    v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli     v24.4S,  v24.4S,  #16
        cmhi    v21.16B, v22.16B, v21.16B       // < alpha
        dup     v22.16B, w3                     // beta
        cmlt    v23.16B, v24.16B, #0
        cmhi    v28.16B, v22.16B, v28.16B       // < beta
        cmhi    v30.16B, v22.16B, v30.16B       // < beta
        bic     v21.16B, v21.16B, v23.16B
        uabd    v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and     v21.16B, v21.16B, v28.16B
        uabd    v19.16B, v4.16B,  v0.16B        // abs(q2 - q0)
        and     v21.16B, v21.16B, v30.16B       // < beta
        shrn    v30.8b,  v21.8h,  #4
        mov     x7,  v30.d[0]
        cmhi    v17.16B, v22.16B, v17.16B       // < beta
        cmhi    v19.16B, v22.16B, v19.16B       // < beta
        cbz     x7,  9f
        and     v17.16B, v17.16B, v21.16B
        and     v19.16B, v19.16B, v21.16B
        and     v24.16B, v24.16B, v21.16B
        urhadd  v28.16B, v16.16B, v0.16B
        sub     v21.16B, v24.16B, v17.16B
        uqadd   v23.16B, v18.16B, v24.16B
        uhadd   v20.16B, v20.16B, v28.16B
        sub     v21.16B, v21.16B, v19.16B
        uhadd   v28.16B, v4.16B,  v28.16B
        umin    v23.16B, v23.16B, v20.16B
        uqsub   v22.16B, v18.16B, v24.16B
        uqadd   v4.16B,  v2.16B,  v24.16B
        umax    v23.16B, v23.16B, v22.16B
        uqsub   v22.16B, v2.16B,  v24.16B
        umin    v28.16B, v4.16B,  v28.16B
        uxtl    v4.8H,   v0.8B
        umax    v28.16B, v28.16B, v22.16B
        uxtl2   v20.8H,  v0.16B
        usubw   v4.8H,   v4.8H,   v16.8B
        usubw2  v20.8H,  v20.8H,  v16.16B
        shl     v4.8H,   v4.8H,   #2
        shl     v20.8H,  v20.8H,  #2
        uaddw   v4.8H,   v4.8H,   v18.8B
        uaddw2  v20.8H,  v20.8H,  v18.16B
        usubw   v4.8H,   v4.8H,   v2.8B
        usubw2  v20.8H,  v20.8H,  v2.16B
        rshrn   v4.8B,   v4.8H,   #3
        rshrn2  v4.16B,  v20.8H,  #3
        bsl     v17.16B, v23.16B, v18.16B
        bsl     v19.16B, v28.16B, v2.16B
        neg     v23.16B, v21.16B
        uxtl    v28.8H,  v16.8B
        smin    v4.16B,  v4.16B,  v21.16B
        uxtl2   v21.8H,  v16.16B
        smax    v4.16B,  v4.16B,  v23.16B
        uxtl    v22.8H,  v0.8B
        uxtl2   v24.8H,  v0.16B
        saddw   v28.8H,  v28.8H,  v4.8B
        saddw2  v21.8H,  v21.8H,  v4.16B
        ssubw   v22.8H,  v22.8H,  v4.8B
        ssubw2  v24.8H,  v24.8H,  v4.16B
        sqxtun  v16.8B,  v28.8H
        sqxtun2 v16.16B, v21.8H
        sqxtun  v0.8B,   v22.8H
        sqxtun2 v0.16B,  v24.8H
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        ld1     {v0.16B}, [x0], x1
        ld1     {v2.16B}, [x0], x1
        ld1     {v4.16B}, [x0], x1
        sub     x0,  x0,  x1, lsl #2
        sub     x0,  x0,  x1, lsl #1
        ld1     {v20.16B}, [x0], x1
        ld1     {v18.16B}, [x0], x1
        ld1     {v16.16B}, [x0], x1

        h264_loop_filter_luma

        sub     x0,  x0,  x1, lsl #1
        st1     {v17.16B}, [x0], x1
        st1     {v16.16B}, [x0], x1
        st1     {v0.16B},  [x0], x1
        st1     {v19.16B}, [x0]
9:
        ret
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        sub     x0,  x0,  #4
        ld1     {v6.8B},  [x0], x1
        ld1     {v20.8B}, [x0], x1
        ld1     {v18.8B}, [x0], x1
        ld1     {v16.8B}, [x0], x1
        ld1     {v0.8B},  [x0], x1
        ld1     {v2.8B},  [x0], x1
        ld1     {v4.8B},  [x0], x1
        ld1     {v26.8B}, [x0], x1
        ld1     {v6.D}[1],  [x0], x1
        ld1     {v20.D}[1], [x0], x1
        ld1     {v18.D}[1], [x0], x1
        ld1     {v16.D}[1], [x0], x1
        ld1     {v0.D}[1],  [x0], x1
        ld1     {v2.D}[1],  [x0], x1
        ld1     {v4.D}[1],  [x0], x1
        ld1     {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub     x0,  x0,  x1, lsl #4
        add     x0,  x0,  #2
        st1     {v17.S}[0], [x0], x1
        st1     {v16.S}[0], [x0], x1
        st1     {v0.S}[0],  [x0], x1
        st1     {v19.S}[0], [x0], x1
        st1     {v17.S}[1], [x0], x1
        st1     {v16.S}[1], [x0], x1
        st1     {v0.S}[1],  [x0], x1
        st1     {v19.S}[1], [x0], x1
        st1     {v17.S}[2], [x0], x1
        st1     {v16.S}[2], [x0], x1
        st1     {v0.S}[2],  [x0], x1
        st1     {v19.S}[2], [x0], x1
        st1     {v17.S}[3], [x0], x1
        st1     {v16.S}[3], [x0], x1
        st1     {v0.S}[3],  [x0], x1
        st1     {v19.S}[3], [x0], x1
9:
        ret
endfunc

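// Strong (intra) filter variants: no tc0 clipping; the alpha/beta masks
// plus the |p0-q0| < (alpha >> 2) + 2 test select between the 3-tap and
// the stronger 4/5-tap smoothing of p2..q2.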
.macro h264_loop_filter_start_intra
        orr     w4,  w2,  w3
        cbnz    w4,  1f
        ret
1:
        sxtw    x1,  w1
        dup     v30.16b, w2                // alpha
        dup     v31.16b, w3                // beta
.endm

.macro h264_loop_filter_luma_intra
        uabd    v16.16b, v7.16b, v0.16b    // abs(p0 - q0)
        uabd    v17.16b, v6.16b, v7.16b    // abs(p1 - p0)
        uabd    v18.16b, v1.16b, v0.16b    // abs(q1 - q0)
        cmhi    v19.16b, v30.16b, v16.16b  // < alpha
        cmhi    v17.16b, v31.16b, v17.16b  // < beta
        cmhi    v18.16b, v31.16b, v18.16b  // < beta

        movi    v29.16b, #2
        ushr    v30.16b, v30.16b, #2       // alpha >> 2
        add     v30.16b, v30.16b, v29.16b  // (alpha >> 2) + 2
        cmhi    v16.16b, v30.16b, v16.16b  // < (alpha >> 2) + 2

        and     v19.16b, v19.16b, v17.16b
        and     v19.16b, v19.16b, v18.16b
        shrn    v20.8b,  v19.8h,  #4
        mov     x4,  v20.d[0]
        cbz     x4,  9f

        ushll   v20.8h,  v6.8b,  #1
        ushll   v22.8h,  v1.8b,  #1
        ushll2  v21.8h,  v6.16b, #1
        ushll2  v23.8h,  v1.16b, #1
        uaddw   v20.8h,  v20.8h, v7.8b
        uaddw   v22.8h,  v22.8h, v0.8b
        uaddw2  v21.8h,  v21.8h, v7.16b
        uaddw2  v23.8h,  v23.8h, v0.16b
        uaddw   v20.8h,  v20.8h, v1.8b
        uaddw   v22.8h,  v22.8h, v6.8b
        uaddw2  v21.8h,  v21.8h, v1.16b
        uaddw2  v23.8h,  v23.8h, v6.16b

        rshrn   v24.8b,  v20.8h, #2        // p0'_1
        rshrn   v25.8b,  v22.8h, #2        // q0'_1
        rshrn2  v24.16b, v21.8h, #2        // p0'_1
        rshrn2  v25.16b, v23.8h, #2        // q0'_1

        uabd    v17.16b, v5.16b, v7.16b    // abs(p2 - p0)
        uabd    v18.16b, v2.16b, v0.16b    // abs(q2 - q0)
        cmhi    v17.16b, v31.16b, v17.16b  // < beta
        cmhi    v18.16b, v31.16b, v18.16b  // < beta

        and     v17.16b, v16.16b, v17.16b  // if_2 && if_3
        and     v18.16b, v16.16b, v18.16b  // if_2 && if_4

        not     v30.16b, v17.16b
        not     v31.16b, v18.16b

        and     v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
        and     v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

        and     v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
        and     v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

        // calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
        uaddl   v26.8h,  v5.8b,  v7.8b
        uaddl2  v27.8h,  v5.16b, v7.16b
        uaddw   v26.8h,  v26.8h, v0.8b
        uaddw2  v27.8h,  v27.8h, v0.16b
        add     v20.8h,  v20.8h, v26.8h
        add     v21.8h,  v21.8h, v27.8h
        uaddw   v20.8h,  v20.8h, v0.8b
        uaddw2  v21.8h,  v21.8h, v0.16b
        rshrn   v20.8b,  v20.8h, #3        // p0'_2
        rshrn2  v20.16b, v21.8h, #3        // p0'_2
        uaddw   v26.8h,  v26.8h, v6.8b
        uaddw2  v27.8h,  v27.8h, v6.16b
        rshrn   v21.8b,  v26.8h, #2        // p1'_2
        rshrn2  v21.16b, v27.8h, #2        // p1'_2
        uaddl   v28.8h,  v4.8b,  v5.8b
        uaddl2  v29.8h,  v4.16b, v5.16b
        shl     v28.8h,  v28.8h, #1
        shl     v29.8h,  v29.8h, #1
        add     v28.8h,  v28.8h, v26.8h
        add     v29.8h,  v29.8h, v27.8h
        rshrn   v19.8b,  v28.8h, #3        // p2'_2
        rshrn2  v19.16b, v29.8h, #3        // p2'_2

        // calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
        uaddl   v26.8h,  v2.8b,  v0.8b
        uaddl2  v27.8h,  v2.16b, v0.16b
        uaddw   v26.8h,  v26.8h, v7.8b
        uaddw2  v27.8h,  v27.8h, v7.16b
        add     v22.8h,  v22.8h, v26.8h
        add     v23.8h,  v23.8h, v27.8h
        uaddw   v22.8h,  v22.8h, v7.8b
        uaddw2  v23.8h,  v23.8h, v7.16b
        rshrn   v22.8b,  v22.8h, #3        // q0'_2
        rshrn2  v22.16b, v23.8h, #3        // q0'_2
        uaddw   v26.8h,  v26.8h, v1.8b
        uaddw2  v27.8h,  v27.8h, v1.16b
        rshrn   v23.8b,  v26.8h, #2        // q1'_2
        rshrn2  v23.16b, v27.8h, #2        // q1'_2
        uaddl   v28.8h,  v2.8b,  v3.8b
        uaddl2  v29.8h,  v2.16b, v3.16b
        shl     v28.8h,  v28.8h, #1
        shl     v29.8h,  v29.8h, #1
        add     v28.8h,  v28.8h, v26.8h
        add     v29.8h,  v29.8h, v27.8h
        rshrn   v26.8b,  v28.8h, #3        // q2'_2
        rshrn2  v26.16b, v29.8h, #3        // q2'_2

        bit     v7.16b,  v24.16b, v30.16b  // p0'_1
        bit     v0.16b,  v25.16b, v31.16b  // q0'_1
        bit     v7.16b,  v20.16b, v17.16b  // p0'_2
        bit     v6.16b,  v21.16b, v17.16b  // p1'_2
        bit     v5.16b,  v19.16b, v17.16b  // p2'_2
        bit     v0.16b,  v22.16b, v18.16b  // q0'_2
        bit     v1.16b,  v23.16b, v18.16b  // q1'_2
        bit     v2.16b,  v26.16b, v18.16b  // q2'_2
.endm

function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1     {v0.16b}, [x0], x1         // q0
        ld1     {v1.16b}, [x0], x1         // q1
        ld1     {v2.16b}, [x0], x1         // q2
        ld1     {v3.16b}, [x0], x1         // q3
        sub     x0,  x0,  x1, lsl #3
        ld1     {v4.16b}, [x0], x1         // p3
        ld1     {v5.16b}, [x0], x1         // p2
        ld1     {v6.16b}, [x0], x1         // p1
        ld1     {v7.16b}, [x0]             // p0

        h264_loop_filter_luma_intra

        sub     x0,  x0,  x1, lsl #1
        st1     {v5.16b}, [x0], x1         // p2
        st1     {v6.16b}, [x0], x1         // p1
        st1     {v7.16b}, [x0], x1         // p0
        st1     {v0.16b}, [x0], x1         // q0
        st1     {v1.16b}, [x0], x1         // q1
        st1     {v2.16b}, [x0]             // q2
9:
        ret
endfunc

function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x0,  x0,  #4
        ld1     {v4.8b}, [x0], x1
        ld1     {v5.8b}, [x0], x1
        ld1     {v6.8b}, [x0], x1
        ld1     {v7.8b}, [x0], x1
        ld1     {v0.8b}, [x0], x1
        ld1     {v1.8b}, [x0], x1
        ld1     {v2.8b}, [x0], x1
        ld1     {v3.8b}, [x0], x1
        ld1     {v4.d}[1], [x0], x1
        ld1     {v5.d}[1], [x0], x1
        ld1     {v6.d}[1], [x0], x1
        ld1     {v7.d}[1], [x0], x1
        ld1     {v0.d}[1], [x0], x1
        ld1     {v1.d}[1], [x0], x1
        ld1     {v2.d}[1], [x0], x1
        ld1     {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub     x0,  x0,  x1, lsl #4
        st1     {v4.8b}, [x0], x1
        st1     {v5.8b}, [x0], x1
        st1     {v6.8b}, [x0], x1
        st1     {v7.8b}, [x0], x1
        st1     {v0.8b}, [x0], x1
        st1     {v1.8b}, [x0], x1
        st1     {v2.8b}, [x0], x1
        st1     {v3.8b}, [x0], x1
        st1     {v4.d}[1], [x0], x1
        st1     {v5.d}[1], [x0], x1
        st1     {v6.d}[1], [x0], x1
        st1     {v7.d}[1], [x0], x1
        st1     {v0.d}[1], [x0], x1
        st1     {v1.d}[1], [x0], x1
        st1     {v2.d}[1], [x0], x1
        st1     {v3.d}[1], [x0], x1
9:
        ret
endfunc

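// Chroma deblock: only p0/q0 are modified, so this is the 8-lane
// analogue of the luma filter with the p1/q1 updates dropped.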
.macro h264_loop_filter_chroma
        dup     v22.8B, w2              // alpha
        dup     v23.8B, w3              // beta
        uxtl    v24.8H, v24.8B
        uabd    v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
        uabd    v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
        uabd    v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
        cmhi    v26.8B, v22.8B, v26.8B  // < alpha
        cmhi    v28.8B, v23.8B, v28.8B  // < beta
        cmhi    v30.8B, v23.8B, v30.8B  // < beta
        uxtl    v4.8H,  v0.8B
        and     v26.8B, v26.8B, v28.8B
        usubw   v4.8H,  v4.8H,  v16.8B
        and     v26.8B, v26.8B, v30.8B
        shl     v4.8H,  v4.8H,  #2
        mov     x8,  v26.d[0]
        sli     v24.8H, v24.8H, #8
        uaddw   v4.8H,  v4.8H,  v18.8B
        cbz     x8,  9f
        usubw   v4.8H,  v4.8H,  v2.8B
        rshrn   v4.8B,  v4.8H,  #3
        smin    v4.8B,  v4.8B,  v24.8B
        neg     v25.8B, v24.8B
        smax    v4.8B,  v4.8B,  v25.8B
        uxtl    v22.8H, v0.8B
        and     v4.8B,  v4.8B,  v26.8B
        uxtl    v28.8H, v16.8B
        saddw   v28.8H, v28.8H, v4.8B
        ssubw   v22.8H, v22.8H, v4.8B
        sqxtun  v16.8B, v28.8H
        sqxtun  v0.8B,  v22.8H
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        sub     x0,  x0,  x1, lsl #1
        ld1     {v18.8B}, [x0], x1
        ld1     {v16.8B}, [x0], x1
        ld1     {v0.8B},  [x0], x1
        ld1     {v2.8B},  [x0]

        h264_loop_filter_chroma

        sub     x0,  x0,  x1, lsl #1
        st1     {v16.8B}, [x0], x1
        st1     {v0.8B},  [x0], x1
9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        sub     x0,  x0,  #2
h_loop_filter_chroma420:
        ld1     {v18.S}[0], [x0], x1
        ld1     {v16.S}[0], [x0], x1
        ld1     {v0.S}[0],  [x0], x1
        ld1     {v2.S}[0],  [x0], x1
        ld1     {v18.S}[1], [x0], x1
        ld1     {v16.S}[1], [x0], x1
        ld1     {v0.S}[1],  [x0], x1
        ld1     {v2.S}[1],  [x0], x1

        transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

        sub     x0,  x0,  x1, lsl #3
        st1     {v18.S}[0], [x0], x1
        st1     {v16.S}[0], [x0], x1
        st1     {v0.S}[0],  [x0], x1
        st1     {v2.S}[0],  [x0], x1
        st1     {v18.S}[1], [x0], x1
        st1     {v16.S}[1], [x0], x1
        st1     {v0.S}[1],  [x0], x1
        st1     {v2.S}[1],  [x0], x1
9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma422_neon, export=1
        sxtw    x1,  w1
        h264_loop_filter_start
        add     x5,  x0,  x1
        sub     x0,  x0,  #2
        add     x1,  x1,  x1
        mov     x7,  x30
        bl      h_loop_filter_chroma420
        mov     x30, x7
        sub     x0,  x5,  #2
        mov     v24.s[0], w6
        b       h_loop_filter_chroma420
endfunc

.macro h264_loop_filter_chroma_intra
        uabd    v26.8b, v16.8b, v17.8b  // abs(p0 - q0)
        uabd    v27.8b, v18.8b, v16.8b  // abs(p1 - p0)
        uabd    v28.8b, v19.8b, v17.8b  // abs(q1 - q0)
        cmhi    v26.8b, v30.8b, v26.8b  // < alpha
        cmhi    v27.8b, v31.8b, v27.8b  // < beta
        cmhi    v28.8b, v31.8b, v28.8b  // < beta
        and     v26.8b, v26.8b, v27.8b
        and     v26.8b, v26.8b, v28.8b
        mov     x2,  v26.d[0]

        ushll   v4.8h,  v18.8b, #1
        ushll   v6.8h,  v19.8b, #1
        cbz     x2,  9f
        uaddl   v20.8h, v16.8b, v19.8b
        uaddl   v22.8h, v17.8b, v18.8b
        add     v20.8h, v20.8h, v4.8h
        add     v22.8h, v22.8h, v6.8h
        uqrshrn v24.8b, v20.8h, #2
        uqrshrn v25.8b, v22.8h, #2
        bit     v16.8b, v24.8b, v26.8b
        bit     v17.8b, v25.8b, v26.8b
.endm

function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x0,  x0,  x1, lsl #1
        ld1     {v18.8b}, [x0], x1
        ld1     {v16.8b}, [x0], x1
        ld1     {v17.8b}, [x0], x1
        ld1     {v19.8b}, [x0]

        h264_loop_filter_chroma_intra

        sub     x0,  x0,  x1, lsl #1
        st1     {v16.8b}, [x0], x1
        st1     {v17.8b}, [x0], x1

9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x4,  x0,  #2
        sub     x0,  x0,  #1
        ld1     {v18.8b}, [x4], x1
        ld1     {v16.8b}, [x4], x1
        ld1     {v17.8b}, [x4], x1
        ld1     {v19.8b}, [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2     {v16.b,v17.b}[0], [x0], x1
        st2     {v16.b,v17.b}[1], [x0], x1
        st2     {v16.b,v17.b}[2], [x0], x1
        st2     {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x4,  x0,  #2
        sub     x0,  x0,  #1
h_loop_filter_chroma420_intra:
        ld1     {v18.8b}, [x4], x1
        ld1     {v16.8b}, [x4], x1
        ld1     {v17.8b}, [x4], x1
        ld1     {v19.8b}, [x4], x1
        ld1     {v18.s}[1], [x4], x1
        ld1     {v16.s}[1], [x4], x1
        ld1     {v17.s}[1], [x4], x1
        ld1     {v19.s}[1], [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2     {v16.b,v17.b}[0], [x0], x1
        st2     {v16.b,v17.b}[1], [x0], x1
        st2     {v16.b,v17.b}[2], [x0], x1
        st2     {v16.b,v17.b}[3], [x0], x1
        st2     {v16.b,v17.b}[4], [x0], x1
        st2     {v16.b,v17.b}[5], [x0], x1
        st2     {v16.b,v17.b}[6], [x0], x1
        st2     {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub     x4,  x0,  #2
        add     x5,  x0,  x1, lsl #3
        sub     x0,  x0,  #1
        mov     x7,  x30
        bl      h_loop_filter_chroma420_intra
        sub     x0,  x5,  #1
        mov     x30, x7
        b       h_loop_filter_chroma420_intra
endfunc

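// Bi-directional weighted prediction. The macs/macd arguments are
// umlal/umlsl so one macro body covers all four sign combinations of
// the two weights; biweight_func pre-computes the rounding term
// ((offset + 1) | 1) << log2_den in v16 and the right-shift amount
// -(log2_den + 1) in v18 (applied via sshl).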
.macro biweight_16 macs, macd
        dup     v0.16B, w5
        dup     v1.16B, w6
        mov     v4.16B, v16.16B
        mov     v6.16B, v16.16B
1:      subs    w3,  w3,  #2
        ld1     {v20.16B}, [x0], x2
        \macd   v4.8H,  v0.8B,  v20.8B
        \macd\()2 v6.8H, v0.16B, v20.16B
        ld1     {v22.16B}, [x1], x2
        \macs   v4.8H,  v1.8B,  v22.8B
        \macs\()2 v6.8H, v1.16B, v22.16B
        mov     v24.16B, v16.16B
        ld1     {v28.16B}, [x0], x2
        mov     v26.16B, v16.16B
        \macd   v24.8H, v0.8B,  v28.8B
        \macd\()2 v26.8H, v0.16B, v28.16B
        ld1     {v30.16B}, [x1], x2
        \macs   v24.8H, v1.8B,  v30.8B
        \macs\()2 v26.8H, v1.16B, v30.16B
        sshl    v4.8H,  v4.8H,  v18.8H
        sshl    v6.8H,  v6.8H,  v18.8H
        sqxtun  v4.8B,  v4.8H
        sqxtun2 v4.16B, v6.8H
        sshl    v24.8H, v24.8H, v18.8H
        sshl    v26.8H, v26.8H, v18.8H
        sqxtun  v24.8B, v24.8H
        sqxtun2 v24.16B, v26.8H
        mov     v6.16B, v16.16B
        st1     {v4.16B},  [x7], x2
        mov     v4.16B, v16.16B
        st1     {v24.16B}, [x7], x2
        b.ne    1b
        ret
.endm

.macro biweight_8 macs, macd
        dup     v0.8B,  w5
        dup     v1.8B,  w6
        mov     v2.16B, v16.16B
        mov     v20.16B, v16.16B
1:      subs    w3,  w3,  #2
        ld1     {v4.8B}, [x0], x2
        \macd   v2.8H,  v0.8B,  v4.8B
        ld1     {v5.8B}, [x1], x2
        \macs   v2.8H,  v1.8B,  v5.8B
        ld1     {v6.8B}, [x0], x2
        \macd   v20.8H, v0.8B,  v6.8B
        ld1     {v7.8B}, [x1], x2
        \macs   v20.8H, v1.8B,  v7.8B
        sshl    v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        sshl    v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        mov     v20.16B, v16.16B
        st1     {v2.8B}, [x7], x2
        mov     v2.16B, v16.16B
        st1     {v4.8B}, [x7], x2
        b.ne    1b
        ret
.endm

.macro biweight_4 macs, macd
        dup     v0.8B,  w5
        dup     v1.8B,  w6
        mov     v2.16B, v16.16B
        mov     v20.16B, v16.16B
1:      subs    w3,  w3,  #4
        ld1     {v4.S}[0], [x0], x2
        ld1     {v4.S}[1], [x0], x2
        \macd   v2.8H,  v0.8B,  v4.8B
        ld1     {v5.S}[0], [x1], x2
        ld1     {v5.S}[1], [x1], x2
        \macs   v2.8H,  v1.8B,  v5.8B
        b.lt    2f
        ld1     {v6.S}[0], [x0], x2
        ld1     {v6.S}[1], [x0], x2
        \macd   v20.8H, v0.8B,  v6.8B
        ld1     {v7.S}[0], [x1], x2
        ld1     {v7.S}[1], [x1], x2
        \macs   v20.8H, v1.8B,  v7.8B
        sshl    v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        sshl    v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        mov     v20.16B, v16.16B
        st1     {v2.S}[0], [x7], x2
        st1     {v2.S}[1], [x7], x2
        mov     v2.16B, v16.16B
        st1     {v4.S}[0], [x7], x2
        st1     {v4.S}[1], [x7], x2
        b.ne    1b
        ret
2:      sshl    v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        st1     {v2.S}[0], [x7], x2
        st1     {v2.S}[1], [x7], x2
        ret
.endm

.macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw    x2,  w2
        lsr     w8,  w5,  #31
        add     w7,  w7,  #1
        eor     w8,  w8,  w6,  lsr #30
        orr     w7,  w7,  #1
        dup     v18.8H, w4
        lsl     w7,  w7,  w4
        not     v18.16B, v18.16B
        dup     v16.8H, w7
        mov     x7,  x0
        cbz     w8,  10f
        subs    w8,  w8,  #1
        b.eq    20f
        subs    w8,  w8,  #1
        b.eq    30f
        b       40f
10:     biweight_\w umlal, umlal
20:     neg     w5,  w5
        biweight_\w umlal, umlsl
30:     neg     w5,  w5
        neg     w6,  w6
        biweight_\w umlsl, umlsl
40:     neg     w6,  w6
        biweight_\w umlsl, umlal
endfunc
.endm

biweight_func 16
biweight_func 8
biweight_func 4

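// Uni-directional weighted prediction. weight_func specializes on
// whether log2_den > 1 and on the sign of the weight; the \add argument
// picks the add/sub (or halving shadd/shsub) combination of the
// pre-shifted offset used before the final rounding shift.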
.macro weight_16 add
        dup     v0.16B, w4
1:      subs    w2,  w2,  #2
        ld1     {v20.16B}, [x0], x1
        umull   v4.8H,  v0.8B,  v20.8B
        umull2  v6.8H,  v0.16B, v20.16B
        ld1     {v28.16B}, [x0], x1
        umull   v24.8H, v0.8B,  v28.8B
        umull2  v26.8H, v0.16B, v28.16B
        \add    v4.8H,  v16.8H, v4.8H
        srshl   v4.8H,  v4.8H,  v18.8H
        \add    v6.8H,  v16.8H, v6.8H
        srshl   v6.8H,  v6.8H,  v18.8H
        sqxtun  v4.8B,  v4.8H
        sqxtun2 v4.16B, v6.8H
        \add    v24.8H, v16.8H, v24.8H
        srshl   v24.8H, v24.8H, v18.8H
        \add    v26.8H, v16.8H, v26.8H
        srshl   v26.8H, v26.8H, v18.8H
        sqxtun  v24.8B, v24.8H
        sqxtun2 v24.16B, v26.8H
        st1     {v4.16B},  [x5], x1
        st1     {v24.16B}, [x5], x1
        b.ne    1b
        ret
.endm

.macro weight_8 add
        dup     v0.8B,  w4
1:      subs    w2,  w2,  #2
        ld1     {v4.8B}, [x0], x1
        umull   v2.8H,  v0.8B,  v4.8B
        ld1     {v6.8B}, [x0], x1
        umull   v20.8H, v0.8B,  v6.8B
        \add    v2.8H,  v16.8H, v2.8H
        srshl   v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        \add    v20.8H, v16.8H, v20.8H
        srshl   v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        st1     {v2.8B}, [x5], x1
        st1     {v4.8B}, [x5], x1
        b.ne    1b
        ret
.endm

.macro weight_4 add
        dup     v0.8B,  w4
1:      subs    w2,  w2,  #4
        ld1     {v4.S}[0], [x0], x1
        ld1     {v4.S}[1], [x0], x1
        umull   v2.8H,  v0.8B,  v4.8B
        b.lt    2f
        ld1     {v6.S}[0], [x0], x1
        ld1     {v6.S}[1], [x0], x1
        umull   v20.8H, v0.8B,  v6.8B
        \add    v2.8H,  v16.8H, v2.8H
        srshl   v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        \add    v20.8H, v16.8H, v20.8H
        srshl   v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        st1     {v2.S}[0], [x5], x1
        st1     {v2.S}[1], [x5], x1
        st1     {v4.S}[0], [x5], x1
        st1     {v4.S}[1], [x5], x1
        b.ne    1b
        ret
2:      \add    v2.8H,  v16.8H, v2.8H
        srshl   v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        st1     {v2.S}[0], [x5], x1
        st1     {v2.S}[1], [x5], x1
        ret
.endm

.macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw    x1,  w1
        cmp     w3,  #1
        mov     w6,  #1
        lsl     w5,  w5,  w3
        dup     v16.8H, w5
        mov     x5,  x0
        b.le    20f
        sub     w6,  w6,  w3
        dup     v18.8H, w6
        cmp     w4,  #0
        b.lt    10f
        weight_\w shadd
10:     neg     w4,  w4
        weight_\w shsub
20:     neg     w6,  w3
        dup     v18.8H, w6
        cmp     w4,  #0
        b.lt    10f
        weight_\w add
10:     neg     w4,  w4
        weight_\w sub
endfunc
.endm

weight_func 16
weight_func 8
weight_func 4
413
externals/ffmpeg/libavcodec/aarch64/h264idct_neon.S
vendored
Executable file
@@ -0,0 +1,413 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

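// 4x4 residual IDCT and add: row pass, 4x4 transpose, column pass, then
// a rounding >>6 and a saturating add into the destination pixels. The
// coefficients are zeroed (v30 stores) as they are read, as the decoder
// requires.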
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        ld1     {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
        sxtw    x2,  w2
        movi    v30.8H, #0

        add     v4.4H,  v0.4H,  v2.4H
        sshr    v16.4H, v1.4H,  #1
        st1     {v30.8H}, [x1], #16
        sshr    v17.4H, v3.4H,  #1
        st1     {v30.8H}, [x1], #16
        sub     v5.4H,  v0.4H,  v2.4H
        sub     v6.4H,  v16.4H, v3.4H
        add     v7.4H,  v1.4H,  v17.4H
        add     v0.4H,  v4.4H,  v7.4H
        add     v1.4H,  v5.4H,  v6.4H
        sub     v2.4H,  v5.4H,  v6.4H
        sub     v3.4H,  v4.4H,  v7.4H

        transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7

        add     v4.4H,  v0.4H,  v2.4H
        ld1     {v18.S}[0], [x0], x2
        sshr    v16.4H, v3.4H,  #1
        sshr    v17.4H, v1.4H,  #1
        ld1     {v18.S}[1], [x0], x2
        sub     v5.4H,  v0.4H,  v2.4H
        ld1     {v19.S}[1], [x0], x2
        add     v6.4H,  v16.4H, v1.4H
        ins     v4.D[1], v5.D[0]
        sub     v7.4H,  v17.4H, v3.4H
        ld1     {v19.S}[0], [x0], x2
        ins     v6.D[1], v7.D[0]
        sub     x0,  x0,  x2, lsl #2
        add     v0.8H,  v4.8H,  v6.8H
        sub     v1.8H,  v4.8H,  v6.8H

        srshr   v0.8H,  v0.8H,  #6
        srshr   v1.8H,  v1.8H,  #6

        uaddw   v0.8H,  v0.8H,  v18.8B
        uaddw   v1.8H,  v1.8H,  v19.8B

        sqxtun  v0.8B,  v0.8H
        sqxtun  v1.8B,  v1.8H

        st1     {v0.S}[0], [x0], x2
        st1     {v0.S}[1], [x0], x2
        st1     {v1.S}[1], [x0], x2
        st1     {v1.S}[0], [x0], x2

        sub     x1,  x1,  #32
        ret
endfunc

function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        sxtw    x2,  w2
        mov     w3,  #0
        ld1r    {v2.8H}, [x1]
        strh    w3,  [x1]
        srshr   v2.8H,  v2.8H,  #6
        ld1     {v0.S}[0], [x0], x2
        ld1     {v0.S}[1], [x0], x2
        uaddw   v3.8H,  v2.8H,  v0.8B
        ld1     {v1.S}[0], [x0], x2
        ld1     {v1.S}[1], [x0], x2
        uaddw   v4.8H,  v2.8H,  v1.8B
        sqxtun  v0.8B,  v3.8H
        sqxtun  v1.8B,  v4.8H
        sub     x0,  x0,  x2, lsl #2
        st1     {v0.S}[0], [x0], x2
        st1     {v0.S}[1], [x0], x2
        st1     {v1.S}[0], [x0], x2
        st1     {v1.S}[1], [x0], x2
        ret
endfunc

function ff_h264_idct_add16_neon, export=1
        mov     x12, x30
        mov     x6,  x0         // dest
        mov     x5,  x1         // block_offset
        mov     x1,  x2         // block
        mov     w9,  w3         // stride
        movrel  x7,  scan8
        mov     x10, #16
        movrel  x13, .L_ff_h264_idct_dc_add_neon
        movrel  x14, .L_ff_h264_idct_add_neon
1:      mov     w2,  w9
        ldrb    w3,  [x7], #1
        ldrsw   x0,  [x5], #4
        ldrb    w3,  [x4, w3, uxtw]
        subs    w3,  w3,  #1
        b.lt    2f
        ldrsh   w3,  [x1]
        add     x0,  x0,  x6
        ccmp    w3,  #0,  #4,  eq
        csel    x15, x13, x14, ne
        blr     x15
2:      subs    x10, x10, #1
        add     x1,  x1,  #32
        b.ne    1b
        ret     x12
endfunc

function ff_h264_idct_add16intra_neon, export=1
        mov     x12, x30
        mov     x6,  x0         // dest
        mov     x5,  x1         // block_offset
        mov     x1,  x2         // block
        mov     w9,  w3         // stride
        movrel  x7,  scan8
        mov     x10, #16
        movrel  x13, .L_ff_h264_idct_dc_add_neon
        movrel  x14, .L_ff_h264_idct_add_neon
1:      mov     w2,  w9
        ldrb    w3,  [x7], #1
        ldrsw   x0,  [x5], #4
        ldrb    w3,  [x4, w3, uxtw]
        add     x0,  x0,  x6
        cmp     w3,  #0
        ldrsh   w3,  [x1]
        csel    x15, x13, x14, eq
        ccmp    w3,  #0,  #0,  eq
        b.eq    2f
        blr     x15
2:      subs    x10, x10, #1
        add     x1,  x1,  #32
        b.ne    1b
        ret     x12
endfunc

function ff_h264_idct_add8_neon, export=1
        sub     sp,  sp,  #0x40
        stp     x19, x20, [sp]
        mov     x12, x30
        ldp     x6,  x15, [x0]          // dest[0], dest[1]
        add     x5,  x1,  #16*4         // block_offset
        add     x9,  x2,  #16*32        // block
        mov     w19, w3                 // stride
        movrel  x13, .L_ff_h264_idct_dc_add_neon
        movrel  x14, .L_ff_h264_idct_add_neon
        movrel  x7,  scan8, 16
        mov     x10, #0
        mov     x11, #16
1:      mov     w2,  w19
        ldrb    w3,  [x7, x10]          // scan8[i]
        ldrsw   x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb    w3,  [x4, w3, uxtw]     // nnzc[ scan8[i] ]
        add     x0,  x0,  x6            // block_offset[i] + dst[j-1]
        add     x1,  x9,  x10, lsl #5   // block + i * 16
        cmp     w3,  #0
        ldrsh   w3,  [x1]               // block[i*16]
        csel    x20, x13, x14, eq
        ccmp    w3,  #0,  #0,  eq
        b.eq    2f
        blr     x20
2:      add     x10, x10, #1
        cmp     x10, #4
        csel    x10, x11, x10, eq       // mov x10, #16
        csel    x6,  x15, x6,  eq
        cmp     x10, #20
        b.lt    1b
        ldp     x19, x20, [sp]
        add     sp,  sp,  #0x40
        ret     x12
endfunc

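// One pass of the 8x8 transform. The pass argument only swaps which
// registers alias the va/vb temporaries, so the same butterfly code
// serves both before and after the 8x8 transpose.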
.macro idct8x8_cols pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr    v18.8H, v26.8H, #1
        add     v16.8H, v24.8H, v28.8H
        ld1     {v30.8H, v31.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16
        sub     v17.8H, v24.8H, v28.8H
        sshr    v19.8H, v30.8H, #1
        sub     v18.8H, v18.8H, v30.8H
        add     v19.8H, v19.8H, v26.8H
  .else
        va      .req    v30
        vb      .req    v18
        sshr    v30.8H, v26.8H, #1
        sshr    v19.8H, v18.8H, #1
        add     v16.8H, v24.8H, v28.8H
        sub     v17.8H, v24.8H, v28.8H
        sub     v30.8H, v30.8H, v18.8H
        add     v19.8H, v19.8H, v26.8H
  .endif
        add     v26.8H, v17.8H, va.8H
        sub     v28.8H, v17.8H, va.8H
        add     v24.8H, v16.8H, v19.8H
        sub     vb.8H,  v16.8H, v19.8H
        sub     v16.8H, v29.8H, v27.8H
        add     v17.8H, v31.8H, v25.8H
        sub     va.8H,  v31.8H, v25.8H
        add     v19.8H, v29.8H, v27.8H
        sub     v16.8H, v16.8H, v31.8H
        sub     v17.8H, v17.8H, v27.8H
        add     va.8H,  va.8H,  v29.8H
        add     v19.8H, v19.8H, v25.8H
        sshr    v25.8H, v25.8H, #1
        sshr    v27.8H, v27.8H, #1
        sshr    v29.8H, v29.8H, #1
        sshr    v31.8H, v31.8H, #1
        sub     v16.8H, v16.8H, v31.8H
        sub     v17.8H, v17.8H, v27.8H
        add     va.8H,  va.8H,  v29.8H
        add     v19.8H, v19.8H, v25.8H
        sshr    v25.8H, v16.8H, #2
        sshr    v27.8H, v17.8H, #2
        sshr    v29.8H, va.8H,  #2
        sshr    v31.8H, v19.8H, #2
        sub     v19.8H, v19.8H, v25.8H
        sub     va.8H,  v27.8H, va.8H
        add     v17.8H, v17.8H, v29.8H
        add     v16.8H, v16.8H, v31.8H
  .if \pass == 0
        sub     v31.8H, v24.8H, v19.8H
        add     v24.8H, v24.8H, v19.8H
        add     v25.8H, v26.8H, v18.8H
        sub     v18.8H, v26.8H, v18.8H
        add     v26.8H, v28.8H, v17.8H
        add     v27.8H, v30.8H, v16.8H
        sub     v29.8H, v28.8H, v17.8H
        sub     v28.8H, v30.8H, v16.8H
  .else
        sub     v31.8H, v24.8H, v19.8H
        add     v24.8H, v24.8H, v19.8H
        add     v25.8H, v26.8H, v30.8H
        sub     v30.8H, v26.8H, v30.8H
        add     v26.8H, v28.8H, v17.8H
        sub     v29.8H, v28.8H, v17.8H
        add     v27.8H, v18.8H, v16.8H
        sub     v28.8H, v18.8H, v16.8H
  .endif
        .unreq  va
        .unreq  vb
.endm

function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        movi    v19.8H, #0
        sxtw    x2,  w2
        ld1     {v24.8H, v25.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16
        ld1     {v26.8H, v27.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16
        ld1     {v28.8H, v29.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16

        idct8x8_cols 0
        transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols 1

        mov     x3,  x0
        srshr   v24.8H, v24.8H, #6
        ld1     {v0.8B}, [x0], x2
        srshr   v25.8H, v25.8H, #6
        ld1     {v1.8B}, [x0], x2
        srshr   v26.8H, v26.8H, #6
        ld1     {v2.8B}, [x0], x2
        srshr   v27.8H, v27.8H, #6
        ld1     {v3.8B}, [x0], x2
        srshr   v28.8H, v28.8H, #6
        ld1     {v4.8B}, [x0], x2
        srshr   v29.8H, v29.8H, #6
        ld1     {v5.8B}, [x0], x2
        srshr   v30.8H, v30.8H, #6
        ld1     {v6.8B}, [x0], x2
        srshr   v31.8H, v31.8H, #6
        ld1     {v7.8B}, [x0], x2
        uaddw   v24.8H, v24.8H, v0.8B
        uaddw   v25.8H, v25.8H, v1.8B
        uaddw   v26.8H, v26.8H, v2.8B
        sqxtun  v0.8B,  v24.8H
        uaddw   v27.8H, v27.8H, v3.8B
        sqxtun  v1.8B,  v25.8H
        uaddw   v28.8H, v28.8H, v4.8B
        sqxtun  v2.8B,  v26.8H
        st1     {v0.8B}, [x3], x2
        uaddw   v29.8H, v29.8H, v5.8B
        sqxtun  v3.8B,  v27.8H
        st1     {v1.8B}, [x3], x2
        uaddw   v30.8H, v30.8H, v6.8B
        sqxtun  v4.8B,  v28.8H
        st1     {v2.8B}, [x3], x2
        uaddw   v31.8H, v31.8H, v7.8B
        sqxtun  v5.8B,  v29.8H
        st1     {v3.8B}, [x3], x2
        sqxtun  v6.8B,  v30.8H
        sqxtun  v7.8B,  v31.8H
        st1     {v4.8B}, [x3], x2
        st1     {v5.8B}, [x3], x2
        st1     {v6.8B}, [x3], x2
        st1     {v7.8B}, [x3], x2

        sub     x1,  x1,  #128
        ret
endfunc

function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        mov     w3,  #0
        sxtw    x2,  w2
        ld1r    {v31.8H}, [x1]
        strh    w3,  [x1]
        ld1     {v0.8B}, [x0], x2
        srshr   v31.8H, v31.8H, #6
        ld1     {v1.8B}, [x0], x2
        ld1     {v2.8B}, [x0], x2
        uaddw   v24.8H, v31.8H, v0.8B
        ld1     {v3.8B}, [x0], x2
        uaddw   v25.8H, v31.8H, v1.8B
        ld1     {v4.8B}, [x0], x2
        uaddw   v26.8H, v31.8H, v2.8B
        ld1     {v5.8B}, [x0], x2
        uaddw   v27.8H, v31.8H, v3.8B
        ld1     {v6.8B}, [x0], x2
        uaddw   v28.8H, v31.8H, v4.8B
        ld1     {v7.8B}, [x0], x2
        uaddw   v29.8H, v31.8H, v5.8B
        uaddw   v30.8H, v31.8H, v6.8B
        uaddw   v31.8H, v31.8H, v7.8B
        sqxtun  v0.8B,  v24.8H
        sqxtun  v1.8B,  v25.8H
        sqxtun  v2.8B,  v26.8H
        sqxtun  v3.8B,  v27.8H
        sub     x0,  x0,  x2, lsl #3
        st1     {v0.8B}, [x0], x2
        sqxtun  v4.8B,  v28.8H
        st1     {v1.8B}, [x0], x2
        sqxtun  v5.8B,  v29.8H
        st1     {v2.8B}, [x0], x2
        sqxtun  v6.8B,  v30.8H
        st1     {v3.8B}, [x0], x2
        sqxtun  v7.8B,  v31.8H
        st1     {v4.8B}, [x0], x2
        st1     {v5.8B}, [x0], x2
        st1     {v6.8B}, [x0], x2
        st1     {v7.8B}, [x0], x2
        ret
endfunc

function ff_h264_idct8_add4_neon, export=1
        mov     x12, x30
        mov     x6,  x0
        mov     x5,  x1
        mov     x1,  x2
        mov     w2,  w3
        movrel  x7,  scan8
        mov     w10, #16
        movrel  x13, .L_ff_h264_idct8_dc_add_neon
        movrel  x14, .L_ff_h264_idct8_add_neon
1:      ldrb    w9,  [x7], #4
        ldrsw   x0,  [x5], #16
        ldrb    w9,  [x4, w9, UXTW]
        subs    w9,  w9,  #1
        b.lt    2f
        ldrsh   w11, [x1]
        add     x0,  x6,  x0
        ccmp    w11, #0,  #4,  eq
        csel    x15, x13, x14, ne
        blr     x15
2:      subs    w10, w10, #4
        add     x1,  x1,  #128
        b.ne    1b
        ret     x12
endfunc

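// Copy of the decoder's scan8[] table: maps block indices to positions
// in the non_zero_count cache so the add16/add16intra/add8/add4 loops
// above can skip empty blocks.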
const scan8
        .byte   4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte   6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte   4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte   6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte   4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte   6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte   4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte   6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte   4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte   6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte   4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte   6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst
93
externals/ffmpeg/libavcodec/aarch64/h264pred_init.c
vendored
Executable file
@@ -0,0 +1,93 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"

void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);

void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);

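/*
 * Only 8-bit content is accelerated. The codec_id checks leave the C
 * versions in place for SVQ3/RV40/VP7/VP8 modes whose semantics differ
 * from H.264 (notably the plane and DC predictors).
 */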
static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                        const int bit_depth,
                                        const int chroma_format_idc)
{
    const int high_depth = bit_depth > 8;

    if (high_depth)
        return;

    if (chroma_format_idc <= 1) {
        h->pred8x8[VERT_PRED8x8   ] = ff_pred8x8_vert_neon;
        h->pred8x8[HOR_PRED8x8    ] = ff_pred8x8_hor_neon;
        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
        h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
            codec_id != AV_CODEC_ID_VP8) {
            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
        }
    }

    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
}

av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
                                       int bit_depth, const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags))
        h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
}
361
externals/ffmpeg/libavcodec/aarch64/h264pred_neon.S
vendored
Executable file
@@ -0,0 +1,361 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

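// Loads a column of n bytes, stride \rt apart, into the lanes of \rd:
// the gather used to read the left-neighbour pixels that the DC and
// plane predictors need.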
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n >= 8 || \hi == 0
        ld1     {\rd\().b}[0], [\rs], \rt
        ld1     {\rd\().b}[1], [\rs], \rt
        ld1     {\rd\().b}[2], [\rs], \rt
        ld1     {\rd\().b}[3], [\rs], \rt
.endif
.if \n >= 8 || \hi == 1
        ld1     {\rd\().b}[4], [\rs], \rt
        ld1     {\rd\().b}[5], [\rs], \rt
        ld1     {\rd\().b}[6], [\rs], \rt
        ld1     {\rd\().b}[7], [\rs], \rt
.endif
.if \n == 16
        ld1     {\rd\().b}[8],  [\rs], \rt
        ld1     {\rd\().b}[9],  [\rs], \rt
        ld1     {\rd\().b}[10], [\rs], \rt
        ld1     {\rd\().b}[11], [\rs], \rt
        ld1     {\rd\().b}[12], [\rs], \rt
        ld1     {\rd\().b}[13], [\rs], \rt
        ld1     {\rd\().b}[14], [\rs], \rt
        ld1     {\rd\().b}[15], [\rs], \rt
.endif
.endm

function ff_pred16x16_128_dc_neon, export=1
        movi    v0.16b, #128
        b       .L_pred16x16_dc_end
endfunc

function ff_pred16x16_top_dc_neon, export=1
        sub     x2,  x0,  x1
        ld1     {v0.16b}, [x2]
        uaddlv  h0,  v0.16b
        rshrn   v0.8b,  v0.8h,  #4
        dup     v0.16b, v0.b[0]
        b       .L_pred16x16_dc_end
endfunc

function ff_pred16x16_left_dc_neon, export=1
        sub     x2,  x0,  #1
        ldcol.8 v0,  x2,  x1,  16
        uaddlv  h0,  v0.16b
        rshrn   v0.8b,  v0.8h,  #4
        dup     v0.16b, v0.b[0]
        b       .L_pred16x16_dc_end
endfunc

function ff_pred16x16_dc_neon, export=1
        sub     x2,  x0,  x1
        sub     x3,  x0,  #1
        ld1     {v0.16b}, [x2]
        ldcol.8 v1,  x3,  x1,  16
        uaddlv  h0,  v0.16b
        uaddlv  h1,  v1.16b
        add     v0.4h,  v0.4h,  v1.4h
        rshrn   v0.8b,  v0.8h,  #5
        dup     v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov     w3,  #8
6:      st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    6b
        ret
endfunc

function ff_pred16x16_hor_neon, export=1
        sub     x2,  x0,  #1
        mov     w3,  #16
1:      ld1r    {v0.16b}, [x2], x1
        st1     {v0.16b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred16x16_vert_neon, export=1
        sub     x2,  x0,  x1
        add     x1,  x1,  x1
        ld1     {v0.16b}, [x2], x1
        mov     w3,  #8
1:      st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x2], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

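// Plane prediction: the p16weight multiplies form the weighted gradient
// sums H and V over the top and left borders, from which the a/b/c
// plane parameters are derived; each 16-pixel row is then produced with
// one add and a saturating narrow per half.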
function ff_pred16x16_plane_neon, export=1
        sub     x3,  x0,  x1
        movrel  x4,  p16weight
        add     x2,  x3,  #8
        sub     x3,  x3,  #1
        ld1     {v0.8b}, [x3]
        ld1     {v2.8b}, [x2], x1
        ldcol.8 v1,  x3,  x1
        add     x3,  x3,  x1
        ldcol.8 v3,  x3,  x1
        rev64   v0.8b,  v0.8b
        rev64   v1.8b,  v1.8b
        uaddl   v7.8h,  v2.8b,  v3.8b
        usubl   v2.8h,  v2.8b,  v0.8b
        usubl   v3.8h,  v3.8b,  v1.8b
        ld1     {v0.8h}, [x4]
        mul     v2.8h,  v2.8h,  v0.8h
        mul     v3.8h,  v3.8h,  v0.8h
        addp    v2.8h,  v2.8h,  v3.8h
        addp    v2.8h,  v2.8h,  v2.8h
        addp    v2.4h,  v2.4h,  v2.4h
        sshll   v3.4s,  v2.4h,  #2
        saddw   v2.4s,  v3.4s,  v2.4h
        rshrn   v4.4h,  v2.4s,  #6
        trn2    v5.4h,  v4.4h,  v4.4h
        add     v2.4h,  v4.4h,  v5.4h
        shl     v3.4h,  v2.4h,  #3
        ext     v7.16b, v7.16b, v7.16b, #14
        sub     v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        add     v7.4h,  v7.4h,  v0.4h
        shl     v2.4h,  v7.4h,  #4
        sub     v2.4h,  v2.4h,  v3.4h
        shl     v3.4h,  v4.4h,  #4
        ext     v0.16b, v0.16b, v0.16b, #14
        sub     v6.4h,  v5.4h,  v3.4h
        mov     v0.h[0], wzr
        mul     v0.8h,  v0.8h,  v4.h[0]
        dup     v1.8h,  v2.h[0]
        dup     v2.8h,  v4.h[0]
        dup     v3.8h,  v6.h[0]
        shl     v2.8h,  v2.8h,  #3
        add     v1.8h,  v1.8h,  v0.8h
        add     v3.8h,  v3.8h,  v2.8h
        mov     w3,  #16
1:
        sqshrun v0.8b,  v1.8h,  #5
        add     v1.8h,  v1.8h,  v2.8h
        sqshrun2 v0.16b, v1.8h, #5
        add     v1.8h,  v1.8h,  v3.8h
        st1     {v0.16b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

const p16weight, align=4
        .short  1,2,3,4,5,6,7,8
endconst
const p8weight, align=4
        .short  1,2,3,4,1,2,3,4
endconst

function ff_pred8x8_hor_neon, export=1
        sub     x2,  x0,  #1
        mov     w3,  #8
1:      ld1r    {v0.8b}, [x2], x1
        st1     {v0.8b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred8x8_vert_neon, export=1
        sub     x2,  x0,  x1
        lsl     x1,  x1,  #1
        ld1     {v0.8b}, [x2], x1
        mov     w3,  #4
1:      st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x2], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred8x8_plane_neon, export=1
        sub     x3,  x0,  x1
        movrel  x4,  p8weight
        movrel  x5,  p16weight
        add     x2,  x3,  #4
        sub     x3,  x3,  #1
        ld1     {v0.s}[0], [x3]
        ld1     {v2.s}[0], [x2], x1
        ldcol.8 v0,  x3,  x1,  4,  hi=1
        add     x3,  x3,  x1
        ldcol.8 v3,  x3,  x1,  4
        uaddl   v7.8h,  v2.8b,  v3.8b
        rev32   v0.8b,  v0.8b
        trn1    v2.2s,  v2.2s,  v3.2s
        usubl   v2.8h,  v2.8b,  v0.8b
        ld1     {v6.8h}, [x4]
        mul     v2.8h,  v2.8h,  v6.8h
        ld1     {v0.8h}, [x5]
        saddlp  v2.4s,  v2.8h
        addp    v2.4s,  v2.4s,  v2.4s
        shl     v3.4s,  v2.4s,  #4
        add     v2.4s,  v3.4s,  v2.4s
        rshrn   v5.4h,  v2.4s,  #5
        addp    v2.4h,  v5.4h,  v5.4h
        shl     v3.4h,  v2.4h,  #1
        add     v3.4h,  v3.4h,  v2.4h
        rev64   v7.4h,  v7.4h
        add     v7.4h,  v7.4h,  v0.4h
        shl     v2.4h,  v7.4h,  #4
        sub     v2.4h,  v2.4h,  v3.4h
        ext     v0.16b, v0.16b, v0.16b, #14
        mov     v0.h[0], wzr
        mul     v0.8h,  v0.8h,  v5.h[0]
        dup     v1.8h,  v2.h[0]
        dup     v2.8h,  v5.h[1]
        add     v1.8h,  v1.8h,  v0.8h
        mov     w3,  #8
1:
        sqshrun v0.8b,  v1.8h,  #5
        add     v1.8h,  v1.8h,  v2.8h
        st1     {v0.8b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred8x8_128_dc_neon, export=1
        movi    v0.8b,  #128
        movi    v1.8b,  #128
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_top_dc_neon, export=1
        sub     x2,  x0,  x1
        ld1     {v0.8b}, [x2]
        uaddlp  v0.4h,  v0.8b
        addp    v0.4h,  v0.4h,  v0.4h
        zip1    v0.8h,  v0.8h,  v0.8h
        rshrn   v2.8b,  v0.8h,  #2
        zip1    v0.8b,  v2.8b,  v2.8b
        zip1    v1.8b,  v2.8b,  v2.8b
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_left_dc_neon, export=1
        sub     x2,  x0,  #1
        ldcol.8 v0,  x2,  x1
        uaddlp  v0.4h,  v0.8b
        addp    v0.4h,  v0.4h,  v0.4h
        rshrn   v2.8b,  v0.8h,  #2
        dup     v1.8b,  v2.b[1]
        dup     v0.8b,  v2.b[0]
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_dc_neon, export=1
        sub     x2,  x0,  x1
        sub     x3,  x0,  #1
        ld1     {v0.8b}, [x2]
        ldcol.8 v1,  x3,  x1
        uaddlp  v0.4h,  v0.8b
        uaddlp  v1.4h,  v1.8b
        trn1    v2.2s,  v0.2s,  v1.2s
        trn2    v3.2s,  v0.2s,  v1.2s
        addp    v4.4h,  v2.4h,  v3.4h
        addp    v5.4h,  v4.4h,  v4.4h
        rshrn   v6.8b,  v5.8h,  #3
        rshrn   v7.8b,  v4.8h,  #2
        dup     v0.8b,  v6.b[0]
        dup     v2.8b,  v7.b[2]
        dup     v1.8b,  v7.b[3]
        dup     v3.8b,  v6.b[1]
        zip1    v0.2s,  v0.2s,  v2.2s
        zip1    v1.2s,  v1.2s,  v3.2s
.L_pred8x8_dc_end:
        mov     w3,  #4
        add     x2,  x0,  x1,  lsl #2
6:      st1     {v0.8b}, [x0], x1
        st1     {v1.8b}, [x2], x1
        subs    w3,  w3,  #1
        b.ne    6b
        ret
endfunc

function ff_pred8x8_l0t_dc_neon, export=1
        sub     x2,  x0,  x1
        sub     x3,  x0,  #1
        ld1     {v0.8b}, [x2]
        ldcol.8 v1,  x3,  x1,  4
        zip1    v0.4s,  v0.4s,  v1.4s
        uaddlp  v0.8h,  v0.16b
        addp    v0.8h,  v0.8h,  v0.8h
        addp    v1.4h,  v0.4h,  v0.4h
        rshrn   v2.8b,  v0.8h,  #2
        rshrn   v3.8b,  v1.8h,  #3
        dup     v4.8b,  v3.b[0]
        dup     v6.8b,  v2.b[2]
        dup     v5.8b,  v2.b[0]
        zip1    v0.2s,  v4.2s,  v6.2s
        zip1    v1.2s,  v5.2s,  v6.2s
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_l00_dc_neon, export=1
        sub     x2,  x0,  #1
        ldcol.8 v0,  x2,  x1,  4
        uaddlp  v0.4h,  v0.8b
        addp    v0.4h,  v0.4h,  v0.4h
        rshrn   v0.8b,  v0.8h,  #2
        movi    v1.8b,  #128
        dup     v0.8b,  v0.b[0]
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_0lt_dc_neon, export=1
        add     x3,  x0,  x1,  lsl #2
        sub     x2,  x0,  x1
        sub     x3,  x3,  #1
        ld1     {v0.8b}, [x2]
        ldcol.8 v1,  x3,  x1,  4,  hi=1
        zip1    v0.4s,  v0.4s,  v1.4s
        uaddlp  v0.8h,  v0.16b
        addp    v0.8h,  v0.8h,  v0.8h
        addp    v1.4h,  v0.4h,  v0.4h
        rshrn   v2.8b,  v0.8h,  #2
        rshrn   v3.8b,  v1.8h,  #3
        dup     v4.8b,  v2.b[0]
        dup     v5.8b,  v2.b[3]
        dup     v6.8b,  v2.b[2]
        dup     v7.8b,  v3.b[1]
        zip1    v0.2s,  v4.2s,  v6.2s
        zip1    v1.2s,  v5.2s,  v7.2s
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_0l0_dc_neon, export=1
        add     x2,  x0,  x1,  lsl #2
        sub     x2,  x2,  #1
        ldcol.8 v1,  x2,  x1,  4
        uaddlp  v2.4h,  v1.8b
        addp    v2.4h,  v2.4h,  v2.4h
        rshrn   v1.8b,  v2.8h,  #2
        movi    v0.8b,  #128
        dup     v1.8b,  v1.b[0]
        b       .L_pred8x8_dc_end
endfunc
172
externals/ffmpeg/libavcodec/aarch64/h264qpel_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,172 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264qpel.h"

void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
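
/*
 * The tables below are indexed as [size][x + 4 * y]: row 0 holds the 16x16
 * functions and row 1 the 8x8 ones, and the mcXY suffix encodes the
 * quarter-pel phase (x, y), so e.g. mc01 lands at index 4.
 */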
av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
{
    const int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && !high_bit_depth) {
        c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
        c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
        c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
        c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
        c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
        c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
        c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
        c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
        c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
        c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;

        c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
        c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
        c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
        c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
        c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
        c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
        c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
        c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
        c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
        c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;

        c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
        c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
        c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
        c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
        c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
        c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
        c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
        c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
        c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
        c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;

        c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
        c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
        c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
        c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
        c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
        c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
        c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
        c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
        c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
        c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
    }
}

934
externals/ffmpeg/libavcodec/aarch64/h264qpel_neon.S
vendored
Executable file
934
externals/ffmpeg/libavcodec/aarch64/h264qpel_neon.S
vendored
Executable file
@@ -0,0 +1,934 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

/* H.264 qpel MC */

.macro  lowpass_const r
        movz    \r, #20, lsl #16
        movk    \r, #5
        mov     v6.S[0], \r
.endm
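
// lowpass_const packs the six-tap filter weights into v6: movz/movk build
// (20 << 16) | 5 in a GPR, so v6.H[0] = 5 and v6.H[1] = 20, the two non-unit
// magnitudes of the H.264 (1, -5, 20, 20, -5, 1) kernel applied by the
// mla/mls pairs in the lowpass macros below.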
//trashes v0-v5
.macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
        ext     v2.8B, \r0\().8B, \r1\().8B, #2
        ext     v3.8B, \r0\().8B, \r1\().8B, #3
        uaddl   v2.8H, v2.8B, v3.8B
        ext     v4.8B, \r0\().8B, \r1\().8B, #1
        ext     v5.8B, \r0\().8B, \r1\().8B, #4
        uaddl   v4.8H, v4.8B, v5.8B
        ext     v1.8B, \r0\().8B, \r1\().8B, #5
        uaddl   \d0\().8H, \r0\().8B, v1.8B
        ext     v0.8B, \r2\().8B, \r3\().8B, #2
        mla     \d0\().8H, v2.8H, v6.H[1]
        ext     v1.8B, \r2\().8B, \r3\().8B, #3
        uaddl   v0.8H, v0.8B, v1.8B
        ext     v1.8B, \r2\().8B, \r3\().8B, #1
        mls     \d0\().8H, v4.8H, v6.H[0]
        ext     v3.8B, \r2\().8B, \r3\().8B, #4
        uaddl   v1.8H, v1.8B, v3.8B
        ext     v2.8B, \r2\().8B, \r3\().8B, #5
        uaddl   \d1\().8H, \r2\().8B, v2.8B
        mla     \d1\().8H, v0.8H, v6.H[1]
        mls     \d1\().8H, v1.8H, v6.H[0]
.if \narrow
        sqrshrun \d0\().8B, \d0\().8H, #5
        sqrshrun \d1\().8B, \d1\().8H, #5
.endif
.endm

//trashes v0-v5, v7, v30-v31
.macro  lowpass_8H r0, r1
        ext     v0.16B, \r0\().16B, \r0\().16B, #2
        ext     v1.16B, \r0\().16B, \r0\().16B, #3
        uaddl   v0.8H, v0.8B, v1.8B
        ext     v2.16B, \r0\().16B, \r0\().16B, #1
        ext     v3.16B, \r0\().16B, \r0\().16B, #4
        uaddl   v2.8H, v2.8B, v3.8B
        ext     v30.16B, \r0\().16B, \r0\().16B, #5
        uaddl   \r0\().8H, \r0\().8B, v30.8B
        ext     v4.16B, \r1\().16B, \r1\().16B, #2
        mla     \r0\().8H, v0.8H, v6.H[1]
        ext     v5.16B, \r1\().16B, \r1\().16B, #3
        uaddl   v4.8H, v4.8B, v5.8B
        ext     v7.16B, \r1\().16B, \r1\().16B, #1
        mls     \r0\().8H, v2.8H, v6.H[0]
        ext     v0.16B, \r1\().16B, \r1\().16B, #4
        uaddl   v7.8H, v7.8B, v0.8B
        ext     v31.16B, \r1\().16B, \r1\().16B, #5
        uaddl   \r1\().8H, \r1\().8B, v31.8B
        mla     \r1\().8H, v4.8H, v6.H[1]
        mls     \r1\().8H, v7.8H, v6.H[0]
.endm

// trashes v2-v5, v30
.macro  lowpass_8_1 r0, r1, d0, narrow=1
        ext     v2.8B, \r0\().8B, \r1\().8B, #2
        ext     v3.8B, \r0\().8B, \r1\().8B, #3
        uaddl   v2.8H, v2.8B, v3.8B
        ext     v4.8B, \r0\().8B, \r1\().8B, #1
        ext     v5.8B, \r0\().8B, \r1\().8B, #4
        uaddl   v4.8H, v4.8B, v5.8B
        ext     v30.8B, \r0\().8B, \r1\().8B, #5
        uaddl   \d0\().8H, \r0\().8B, v30.8B
        mla     \d0\().8H, v2.8H, v6.H[1]
        mls     \d0\().8H, v4.8H, v6.H[0]
.if \narrow
        sqrshrun \d0\().8B, \d0\().8H, #5
.endif
.endm

// trashes v0-v7
.macro  lowpass_8.16 r0, r1, r2
        ext     v1.16B, \r0\().16B, \r1\().16B, #4
        ext     v0.16B, \r0\().16B, \r1\().16B, #6
        saddl   v5.4S, v1.4H, v0.4H
        ext     v2.16B, \r0\().16B, \r1\().16B, #2
        saddl2  v1.4S, v1.8H, v0.8H
        ext     v3.16B, \r0\().16B, \r1\().16B, #8
        saddl   v6.4S, v2.4H, v3.4H
        ext     \r1\().16B, \r0\().16B, \r1\().16B, #10
        saddl2  v2.4S, v2.8H, v3.8H
        saddl   v0.4S, \r0\().4H, \r1\().4H
        saddl2  v4.4S, \r0\().8H, \r1\().8H

        shl     v3.4S, v5.4S, #4
        shl     v5.4S, v5.4S, #2
        shl     v7.4S, v6.4S, #2
        add     v5.4S, v5.4S, v3.4S
        add     v6.4S, v6.4S, v7.4S

        shl     v3.4S, v1.4S, #4
        shl     v1.4S, v1.4S, #2
        shl     v7.4S, v2.4S, #2
        add     v1.4S, v1.4S, v3.4S
        add     v2.4S, v2.4S, v7.4S

        add     v5.4S, v5.4S, v0.4S
        sub     v5.4S, v5.4S, v6.4S

        add     v1.4S, v1.4S, v4.4S
        sub     v1.4S, v1.4S, v2.4S

        rshrn   v5.4H, v5.4S, #10
        rshrn2  v5.8H, v1.4S, #10

        sqxtun  \r2\().8B, v5.8H
.endm
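
// lowpass_8.16 runs the second (vertical) filter pass at 32-bit precision
// on already-filtered 16-bit rows: the shl #4 / shl #2 sums rebuild the *20
// and *5 weights without multiplies, and the rounding shift by #10 divides
// out both passes (32 * 32) before narrowing back to bytes.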
function put_h264_qpel16_h_lowpass_neon_packed
        mov     x4,  x30
        mov     x12, #16
        mov     x3,  #8
        bl      put_h264_qpel8_h_lowpass_neon
        sub     x1,  x1,  x2, lsl #4
        add     x1,  x1,  #8
        mov     x12, #16
        mov     x30, x4
        b       put_h264_qpel8_h_lowpass_neon
endfunc

.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        mov     x13, x30
        mov     x12, #16
        bl      \type\()_h264_qpel8_h_lowpass_neon
        sub     x0,  x0,  x3, lsl #4
        sub     x1,  x1,  x2, lsl #4
        add     x0,  x0,  #8
        add     x1,  x1,  #8
        mov     x12, #16
        mov     x30, x13
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1     {v28.8B, v29.8B}, [x1], x2
        ld1     {v16.8B, v17.8B}, [x1], x2
        subs    x12, x12, #2
        lowpass_8 v28, v29, v16, v17, v28, v16
.ifc \type,avg
        ld1     {v2.8B}, [x0], x3
        urhadd  v28.8B, v28.8B, v2.8B
        ld1     {v3.8B}, [x0]
        urhadd  v16.8B, v16.8B, v3.8B
        sub     x0,  x0,  x3
.endif
        st1     {v28.8B}, [x0], x3
        st1     {v16.8B}, [x0], x3
        b.ne    1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov     x13, x30
        mov     x12, #16
        bl      \type\()_h264_qpel8_h_lowpass_l2_neon
        sub     x0,  x0,  x2, lsl #4
        sub     x1,  x1,  x2, lsl #4
        sub     x3,  x3,  x2, lsl #4
        add     x0,  x0,  #8
        add     x1,  x1,  #8
        add     x3,  x3,  #8
        mov     x12, #16
        mov     x30, x13
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1     {v26.8B, v27.8B}, [x1], x2
        ld1     {v16.8B, v17.8B}, [x1], x2
        ld1     {v28.8B}, [x3], x2
        ld1     {v29.8B}, [x3], x2
        subs    x12, x12, #2
        lowpass_8 v26, v27, v16, v17, v26, v27
        urhadd  v26.8B, v26.8B, v28.8B
        urhadd  v27.8B, v27.8B, v29.8B
.ifc \type,avg
        ld1     {v2.8B}, [x0], x2
        urhadd  v26.8B, v26.8B, v2.8B
        ld1     {v3.8B}, [x0]
        urhadd  v27.8B, v27.8B, v3.8B
        sub     x0,  x0,  x2
.endif
        st1     {v26.8B}, [x0], x2
        st1     {v27.8B}, [x0], x2
        b.ne    1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
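
// The _l2 variants blend the six-tap result with a second prediction read
// through a separate pointer (urhadd, i.e. rounded halving); this is how the
// quarter-pel positions between a half-pel and a neighbouring sample are
// formed.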
function put_h264_qpel16_v_lowpass_neon_packed
        mov     x4,  x30
        mov     x2,  #8
        bl      put_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        bl      put_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        bl      put_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x4
        b       put_h264_qpel8_v_lowpass_neon
endfunc

.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov     x4,  x30
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     x0,  x0,  x2, lsl #4
        add     x0,  x0,  #8
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        ld1     {v16.8B}, [x1], x3
        ld1     {v18.8B}, [x1], x3
        ld1     {v20.8B}, [x1], x3
        ld1     {v22.8B}, [x1], x3
        ld1     {v24.8B}, [x1], x3
        ld1     {v26.8B}, [x1], x3
        ld1     {v28.8B}, [x1], x3
        ld1     {v30.8B}, [x1], x3
        ld1     {v17.8B}, [x1], x3
        ld1     {v19.8B}, [x1], x3
        ld1     {v21.8B}, [x1], x3
        ld1     {v23.8B}, [x1], x3
        ld1     {v25.8B}, [x1]

        transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
        transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
        lowpass_8 v16, v17, v18, v19, v16, v17
        lowpass_8 v20, v21, v22, v23, v18, v19
        lowpass_8 v24, v25, v26, v27, v20, v21
        lowpass_8 v28, v29, v30, v31, v22, v23
        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1

.ifc \type,avg
        ld1     {v24.8B}, [x0], x2
        urhadd  v16.8B, v16.8B, v24.8B
        ld1     {v25.8B}, [x0], x2
        urhadd  v17.8B, v17.8B, v25.8B
        ld1     {v26.8B}, [x0], x2
        urhadd  v18.8B, v18.8B, v26.8B
        ld1     {v27.8B}, [x0], x2
        urhadd  v19.8B, v19.8B, v27.8B
        ld1     {v28.8B}, [x0], x2
        urhadd  v20.8B, v20.8B, v28.8B
        ld1     {v29.8B}, [x0], x2
        urhadd  v21.8B, v21.8B, v29.8B
        ld1     {v30.8B}, [x0], x2
        urhadd  v22.8B, v22.8B, v30.8B
        ld1     {v31.8B}, [x0], x2
        urhadd  v23.8B, v23.8B, v31.8B
        sub     x0,  x0,  x2, lsl #3
.endif

        st1     {v16.8B}, [x0], x2
        st1     {v17.8B}, [x0], x2
        st1     {v18.8B}, [x0], x2
        st1     {v19.8B}, [x0], x2
        st1     {v20.8B}, [x0], x2
        st1     {v21.8B}, [x0], x2
        st1     {v22.8B}, [x0], x2
        st1     {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov     x4,  x30
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     x0,  x0,  x3, lsl #4
        sub     x12, x12, x2, lsl #4
        add     x0,  x0,  #8
        add     x12, x12, #8
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1     {v16.8B}, [x1], x3
        ld1     {v18.8B}, [x1], x3
        ld1     {v20.8B}, [x1], x3
        ld1     {v22.8B}, [x1], x3
        ld1     {v24.8B}, [x1], x3
        ld1     {v26.8B}, [x1], x3
        ld1     {v28.8B}, [x1], x3
        ld1     {v30.8B}, [x1], x3
        ld1     {v17.8B}, [x1], x3
        ld1     {v19.8B}, [x1], x3
        ld1     {v21.8B}, [x1], x3
        ld1     {v23.8B}, [x1], x3
        ld1     {v25.8B}, [x1]

        transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
        transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
        lowpass_8 v16, v17, v18, v19, v16, v17
        lowpass_8 v20, v21, v22, v23, v18, v19
        lowpass_8 v24, v25, v26, v27, v20, v21
        lowpass_8 v28, v29, v30, v31, v22, v23
        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1

        ld1     {v24.8B}, [x12], x2
        ld1     {v25.8B}, [x12], x2
        ld1     {v26.8B}, [x12], x2
        ld1     {v27.8B}, [x12], x2
        ld1     {v28.8B}, [x12], x2
        urhadd  v16.8B, v24.8B, v16.8B
        urhadd  v17.8B, v25.8B, v17.8B
        ld1     {v29.8B}, [x12], x2
        urhadd  v18.8B, v26.8B, v18.8B
        urhadd  v19.8B, v27.8B, v19.8B
        ld1     {v30.8B}, [x12], x2
        urhadd  v20.8B, v28.8B, v20.8B
        urhadd  v21.8B, v29.8B, v21.8B
        ld1     {v31.8B}, [x12], x2
        urhadd  v22.8B, v30.8B, v22.8B
        urhadd  v23.8B, v31.8B, v23.8B

.ifc \type,avg
        ld1     {v24.8B}, [x0], x3
        urhadd  v16.8B, v16.8B, v24.8B
        ld1     {v25.8B}, [x0], x3
        urhadd  v17.8B, v17.8B, v25.8B
        ld1     {v26.8B}, [x0], x3
        urhadd  v18.8B, v18.8B, v26.8B
        ld1     {v27.8B}, [x0], x3
        urhadd  v19.8B, v19.8B, v27.8B
        ld1     {v28.8B}, [x0], x3
        urhadd  v20.8B, v20.8B, v28.8B
        ld1     {v29.8B}, [x0], x3
        urhadd  v21.8B, v21.8B, v29.8B
        ld1     {v30.8B}, [x0], x3
        urhadd  v22.8B, v22.8B, v30.8B
        ld1     {v31.8B}, [x0], x3
        urhadd  v23.8B, v23.8B, v31.8B
        sub     x0,  x0,  x3, lsl #3
.endif

        st1     {v16.8B}, [x0], x3
        st1     {v17.8B}, [x0], x3
        st1     {v18.8B}, [x0], x3
        st1     {v19.8B}, [x0], x3
        st1     {v20.8B}, [x0], x3
        st1     {v21.8B}, [x0], x3
        st1     {v22.8B}, [x0], x3
        st1     {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
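
// The vertical filters above reuse the horizontal lowpass_8 macro: each 8x8
// tile is transposed with transpose_8x8B, filtered along what are then rows,
// and transposed back before the stores.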
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const w12
        ld1     {v16.8H}, [x1], x3
        ld1     {v17.8H}, [x1], x3
        ld1     {v18.8H}, [x1], x3
        ld1     {v19.8H}, [x1], x3
        ld1     {v20.8H}, [x1], x3
        ld1     {v21.8H}, [x1], x3
        ld1     {v22.8H}, [x1], x3
        ld1     {v23.8H}, [x1], x3
        ld1     {v24.8H}, [x1], x3
        ld1     {v25.8H}, [x1], x3
        ld1     {v26.8H}, [x1], x3
        ld1     {v27.8H}, [x1], x3
        ld1     {v28.8H}, [x1]
        lowpass_8H v16, v17
        lowpass_8H v18, v19
        lowpass_8H v20, v21
        lowpass_8H v22, v23
        lowpass_8H v24, v25
        lowpass_8H v26, v27
        lowpass_8H v28, v29

        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        lowpass_8.16 v16, v24, v16
        lowpass_8.16 v17, v25, v17

        lowpass_8.16 v18, v26, v18
        lowpass_8.16 v19, v27, v19

        lowpass_8.16 v20, v28, v20
        lowpass_8.16 v21, v29, v21

        lowpass_8.16 v22, v30, v22
        lowpass_8.16 v23, v31, v23

        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1

        ret
endfunc

.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov     x10, x30
        bl      put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        ld1     {v0.8B}, [x0], x2
        urhadd  v16.8B, v16.8B, v0.8B
        ld1     {v1.8B}, [x0], x2
        urhadd  v17.8B, v17.8B, v1.8B
        ld1     {v2.8B}, [x0], x2
        urhadd  v18.8B, v18.8B, v2.8B
        ld1     {v3.8B}, [x0], x2
        urhadd  v19.8B, v19.8B, v3.8B
        ld1     {v4.8B}, [x0], x2
        urhadd  v20.8B, v20.8B, v4.8B
        ld1     {v5.8B}, [x0], x2
        urhadd  v21.8B, v21.8B, v5.8B
        ld1     {v6.8B}, [x0], x2
        urhadd  v22.8B, v22.8B, v6.8B
        ld1     {v7.8B}, [x0], x2
        urhadd  v23.8B, v23.8B, v7.8B
        sub     x0,  x0,  x2, lsl #3
.endif

        st1     {v16.8B}, [x0], x2
        st1     {v17.8B}, [x0], x2
        st1     {v18.8B}, [x0], x2
        st1     {v19.8B}, [x0], x2
        st1     {v20.8B}, [x0], x2
        st1     {v21.8B}, [x0], x2
        st1     {v22.8B}, [x0], x2
        st1     {v23.8B}, [x0], x2

        ret     x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov     x10, x30
        bl      put_h264_qpel8_hv_lowpass_neon_top

        ld1     {v0.8B, v1.8B}, [x2], #16
        ld1     {v2.8B, v3.8B}, [x2], #16
        urhadd  v0.8B, v0.8B, v16.8B
        urhadd  v1.8B, v1.8B, v17.8B
        ld1     {v4.8B, v5.8B}, [x2], #16
        urhadd  v2.8B, v2.8B, v18.8B
        urhadd  v3.8B, v3.8B, v19.8B
        ld1     {v6.8B, v7.8B}, [x2], #16
        urhadd  v4.8B, v4.8B, v20.8B
        urhadd  v5.8B, v5.8B, v21.8B
        urhadd  v6.8B, v6.8B, v22.8B
        urhadd  v7.8B, v7.8B, v23.8B
.ifc \type,avg
        ld1     {v16.8B}, [x0], x3
        urhadd  v0.8B, v0.8B, v16.8B
        ld1     {v17.8B}, [x0], x3
        urhadd  v1.8B, v1.8B, v17.8B
        ld1     {v18.8B}, [x0], x3
        urhadd  v2.8B, v2.8B, v18.8B
        ld1     {v19.8B}, [x0], x3
        urhadd  v3.8B, v3.8B, v19.8B
        ld1     {v20.8B}, [x0], x3
        urhadd  v4.8B, v4.8B, v20.8B
        ld1     {v21.8B}, [x0], x3
        urhadd  v5.8B, v5.8B, v21.8B
        ld1     {v22.8B}, [x0], x3
        urhadd  v6.8B, v6.8B, v22.8B
        ld1     {v23.8B}, [x0], x3
        urhadd  v7.8B, v7.8B, v23.8B
        sub     x0,  x0,  x3, lsl #3
.endif
        st1     {v0.8B}, [x0], x3
        st1     {v1.8B}, [x0], x3
        st1     {v2.8B}, [x0], x3
        st1     {v3.8B}, [x0], x3
        st1     {v4.8B}, [x0], x3
        st1     {v5.8B}, [x0], x3
        st1     {v6.8B}, [x0], x3
        st1     {v7.8B}, [x0], x3

        ret     x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

.macro  h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov     x13, x30
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        sub     x0,  x0,  x2, lsl #4
        add     x0,  x0,  #8
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x13
        b       \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov     x13, x30
        sub     x2,  x4,  #256
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        sub     x0,  x0,  x3, lsl #4
        add     x0,  x0,  #8
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x13
        b       \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
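
// Each ff_{put,avg}_h264_qpel{8,16}_mcXY entry point below maps one
// quarter-pel phase to a combination of the h/v/hv lowpass helpers, carving
// scratch buffers from the stack for the two-pass cases.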
.macro  h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const w3
        mov     x3,  x1
        sub     x1,  x1,  #2
        mov     x12, #8
        b       \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const w3
        sub     x1,  x1,  #2
        mov     x3,  x2
        mov     x12, #8
        b       \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const w3
        add     x3,  x1,  #1
        sub     x1,  x1,  #2
        mov     x12, #8
        b       \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov     x14, x30
        mov     x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const w3
        mov     x3,  x2
        sub     x1,  x1,  x2, lsl #1
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #64
        mov     x0,  sp
        sub     x1,  x1,  #2
        mov     x3,  #8
        mov     x12, #8
        bl      put_h264_qpel8_h_lowpass_neon
        mov     x0,  x8
        mov     x3,  x2
        mov     x12, sp
        sub     x1,  x9,  x2, lsl #1
        mov     x2,  #8
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(8*8+16*12)
        sub     x1,  x1,  #2
        mov     x3,  #8
        mov     x0,  sp
        mov     x12, #8
        bl      put_h264_qpel8_h_lowpass_neon
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        sub     x2,  x4,  #64
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov     x14, x30
        lowpass_const w3
        sub     x1,  x1,  x2, lsl #1
        mov     x3,  x2
        bl      \type\()_h264_qpel8_v_lowpass_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(8*8+16*12)
        sub     x1,  x1,  x2, lsl #1
        mov     x3,  x2
        mov     x2,  #8
        mov     x0,  sp
        bl      put_h264_qpel8_v_lowpass_neon
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x3, lsl #1
        sub     x1,  x1,  #2
        sub     x2,  x4,  #64
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov     x14, x30
        mov     x11, sp
        sub     x1,  x1,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  #1
        b       \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov     x14, x30
        add     x12, x1,  x2
        b       \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

.macro  h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const w3
        mov     x3,  x1
        sub     x1,  x1,  #2
        b       \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const w3
        sub     x1,  x1,  #2
        mov     x3,  x2
        b       \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const w3
        add     x3,  x1,  #1
        sub     x1,  x1,  #2
        b       \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov     x14, x30
        mov     x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const w3
        mov     x3,  x2
        sub     x1,  x1,  x2, lsl #1
        bl      \type\()_h264_qpel16_v_lowpass_l2_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #256
        mov     x0,  sp
        sub     x1,  x1,  #2
        mov     x3,  #16
        bl      put_h264_qpel16_h_lowpass_neon
        mov     x0,  x8
        mov     x3,  x2
        mov     x12, sp
        sub     x1,  x9,  x2, lsl #1
        mov     x2,  #16
        bl      \type\()_h264_qpel16_v_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(16*16+16*12)
        sub     x1,  x1,  #2
        mov     x0,  sp
        bl      put_h264_qpel16_h_lowpass_neon_packed
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        bl      \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov     x14, x30
        lowpass_const w3
        sub     x1,  x1,  x2, lsl #1
        mov     x3,  x2
        bl      \type\()_h264_qpel16_v_lowpass_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(16*16+16*12)
        sub     x1,  x1,  x2, lsl #1
        mov     x0,  sp
        mov     x3,  x2
        bl      put_h264_qpel16_v_lowpass_neon_packed
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x3, lsl #1
        sub     x1,  x1,  #2
        mov     x2,  x3
        bl      \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov     x14, x30
        lowpass_const w3
        mov     x11, sp
        sub     x1,  x1,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        bl      \type\()_h264_qpel16_hv_lowpass_neon
        mov     sp,  x11                // restore stack
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  #1
        b       \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov     x14, x30
        add     x12, x1,  x2
        b       \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg

123
externals/ffmpeg/libavcodec/aarch64/hpeldsp_init_aarch64.c
vendored
Executable file
123
externals/ffmpeg/libavcodec/aarch64/hpeldsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,123 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/hpeldsp.h"

void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

void ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);

void ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
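
/*
 * put/avg_pixels_tab[i][j]: i selects the block width (0 = 16, 1 = 8) and
 * j the half-pel case (0 = aligned copy, 1 = horizontal, 2 = vertical,
 * 3 = both), mirroring the _x2/_y2/_xy2 suffixes above.
 */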
av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;

        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;

        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
    }
}

397
externals/ffmpeg/libavcodec/aarch64/hpeldsp_neon.S
vendored
Executable file
397
externals/ffmpeg/libavcodec/aarch64/hpeldsp_neon.S
vendored
Executable file
@@ -0,0 +1,397 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
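
// All helpers below follow the hpeldsp C prototype: x0 = block (dst),
// x1 = pixels (src), x2 = line_size, and w3 = h, used as the loop counter.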
.macro  pixels16 rnd=1, avg=0
.if \avg
        mov     x12, x0
.endif
1:      ld1     {v0.16B},  [x1], x2
        ld1     {v1.16B},  [x1], x2
        ld1     {v2.16B},  [x1], x2
        ld1     {v3.16B},  [x1], x2
.if \avg
        ld1     {v4.16B},  [x12], x2
        urhadd  v0.16B,  v0.16B,  v4.16B
        ld1     {v5.16B},  [x12], x2
        urhadd  v1.16B,  v1.16B,  v5.16B
        ld1     {v6.16B},  [x12], x2
        urhadd  v2.16B,  v2.16B,  v6.16B
        ld1     {v7.16B},  [x12], x2
        urhadd  v3.16B,  v3.16B,  v7.16B
.endif
        subs    w3,  w3,  #4
        st1     {v0.16B},  [x0], x2
        st1     {v1.16B},  [x0], x2
        st1     {v2.16B},  [x0], x2
        st1     {v3.16B},  [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels16_x2 rnd=1, avg=0
1:      ld1     {v0.16B, v1.16B}, [x1], x2
        ld1     {v2.16B, v3.16B}, [x1], x2
        subs    w3,  w3,  #2
        ext     v1.16B,  v0.16B,  v1.16B,  #1
        avg     v0.16B,  v0.16B,  v1.16B
        ext     v3.16B,  v2.16B,  v3.16B,  #1
        avg     v2.16B,  v2.16B,  v3.16B
.if \avg
        ld1     {v1.16B}, [x0], x2
        ld1     {v3.16B}, [x0]
        urhadd  v0.16B,  v0.16B,  v1.16B
        urhadd  v2.16B,  v2.16B,  v3.16B
        sub     x0,  x0,  x2
.endif
        st1     {v0.16B}, [x0], x2
        st1     {v2.16B}, [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels16_y2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.16B}, [x1], x2
        ld1     {v1.16B}, [x1], x2
1:      subs    w3,  w3,  #2
        avg     v2.16B,  v0.16B,  v1.16B
        ld1     {v0.16B}, [x1], x2
        avg     v3.16B,  v0.16B,  v1.16B
        ld1     {v1.16B}, [x1], x2
.if \avg
        ld1     {v4.16B}, [x0], x2
        ld1     {v5.16B}, [x0]
        urhadd  v2.16B,  v2.16B,  v4.16B
        urhadd  v3.16B,  v3.16B,  v5.16B
        sub     x0,  x0,  x2
.endif
        st1     {v2.16B}, [x0], x2
        st1     {v3.16B}, [x0], x2
        b.ne    1b

        avg     v2.16B,  v0.16B,  v1.16B
        ld1     {v0.16B}, [x1], x2
        avg     v3.16B,  v0.16B,  v1.16B
.if \avg
        ld1     {v4.16B}, [x0], x2
        ld1     {v5.16B}, [x0]
        urhadd  v2.16B,  v2.16B,  v4.16B
        urhadd  v3.16B,  v3.16B,  v5.16B
        sub     x0,  x0,  x2
.endif
        st1     {v2.16B}, [x0], x2
        st1     {v3.16B}, [x0], x2

        ret
.endm

.macro  pixels16_xy2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.16B, v1.16B}, [x1], x2
        ld1     {v4.16B, v5.16B}, [x1], x2
NRND    movi    v26.8H,  #1
        ext     v1.16B,  v0.16B,  v1.16B,  #1
        ext     v5.16B,  v4.16B,  v5.16B,  #1
        uaddl   v16.8H,  v0.8B,   v1.8B
        uaddl2  v20.8H,  v0.16B,  v1.16B
        uaddl   v18.8H,  v4.8B,   v5.8B
        uaddl2  v22.8H,  v4.16B,  v5.16B
1:      subs    w3,  w3,  #2
        ld1     {v0.16B, v1.16B}, [x1], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        ext     v30.16B, v0.16B,  v1.16B,  #1
        add     v1.8H,   v20.8H,  v22.8H
        mshrn   v28.8B,  v24.8H,  #2
NRND    add     v1.8H,   v1.8H,   v26.8H
        mshrn2  v28.16B, v1.8H,   #2
.if \avg
        ld1     {v16.16B}, [x0]
        urhadd  v28.16B, v28.16B, v16.16B
.endif
        uaddl   v16.8H,  v0.8B,   v30.8B
        ld1     {v2.16B, v3.16B}, [x1], x2
        uaddl2  v20.8H,  v0.16B,  v30.16B
        st1     {v28.16B}, [x0], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        ext     v3.16B,  v2.16B,  v3.16B,  #1
        add     v0.8H,   v20.8H,  v22.8H
        mshrn   v30.8B,  v24.8H,  #2
NRND    add     v0.8H,   v0.8H,   v26.8H
        mshrn2  v30.16B, v0.8H,   #2
.if \avg
        ld1     {v18.16B}, [x0]
        urhadd  v30.16B, v30.16B, v18.16B
.endif
        uaddl   v18.8H,  v2.8B,   v3.8B
        uaddl2  v22.8H,  v2.16B,  v3.16B
        st1     {v30.16B}, [x0], x2
        b.gt    1b

        ld1     {v0.16B, v1.16B}, [x1], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        ext     v30.16B, v0.16B,  v1.16B,  #1
        add     v1.8H,   v20.8H,  v22.8H
        mshrn   v28.8B,  v24.8H,  #2
NRND    add     v1.8H,   v1.8H,   v26.8H
        mshrn2  v28.16B, v1.8H,   #2
.if \avg
        ld1     {v16.16B}, [x0]
        urhadd  v28.16B, v28.16B, v16.16B
.endif
        uaddl   v16.8H,  v0.8B,   v30.8B
        uaddl2  v20.8H,  v0.16B,  v30.16B
        st1     {v28.16B}, [x0], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        add     v0.8H,   v20.8H,  v22.8H
        mshrn   v30.8B,  v24.8H,  #2
NRND    add     v0.8H,   v0.8H,   v26.8H
        mshrn2  v30.16B, v0.8H,   #2
.if \avg
        ld1     {v18.16B}, [x0]
        urhadd  v30.16B, v30.16B, v18.16B
.endif
        st1     {v30.16B}, [x0], x2

        ret
.endm
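
// In the _xy2 kernels the four-neighbour sum is kept in 16-bit lanes; the
// rnd=1 variant rounds in the final rshrn (via mshrn), while the rnd=0
// variant instead adds the NRND-guarded bias of 1 and narrows with a plain
// shrn.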
.macro  pixels8 rnd=1, avg=0
1:      ld1     {v0.8B}, [x1], x2
        ld1     {v1.8B}, [x1], x2
        ld1     {v2.8B}, [x1], x2
        ld1     {v3.8B}, [x1], x2
.if \avg
        ld1     {v4.8B}, [x0], x2
        urhadd  v0.8B,  v0.8B,  v4.8B
        ld1     {v5.8B}, [x0], x2
        urhadd  v1.8B,  v1.8B,  v5.8B
        ld1     {v6.8B}, [x0], x2
        urhadd  v2.8B,  v2.8B,  v6.8B
        ld1     {v7.8B}, [x0], x2
        urhadd  v3.8B,  v3.8B,  v7.8B
        sub     x0,  x0,  x2,  lsl #2
.endif
        subs    w3,  w3,  #4
        st1     {v0.8B}, [x0], x2
        st1     {v1.8B}, [x0], x2
        st1     {v2.8B}, [x0], x2
        st1     {v3.8B}, [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels8_x2 rnd=1, avg=0
1:      ld1     {v0.8B, v1.8B}, [x1], x2
        ext     v1.8B,  v0.8B,  v1.8B,  #1
        ld1     {v2.8B, v3.8B}, [x1], x2
        ext     v3.8B,  v2.8B,  v3.8B,  #1
        subs    w3,  w3,  #2
        avg     v0.8B,  v0.8B,  v1.8B
        avg     v2.8B,  v2.8B,  v3.8B
.if \avg
        ld1     {v4.8B}, [x0], x2
        ld1     {v5.8B}, [x0]
        urhadd  v0.8B,  v0.8B,  v4.8B
        urhadd  v2.8B,  v2.8B,  v5.8B
        sub     x0,  x0,  x2
.endif
        st1     {v0.8B}, [x0], x2
        st1     {v2.8B}, [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels8_y2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.8B}, [x1], x2
        ld1     {v1.8B}, [x1], x2
1:      subs    w3,  w3,  #2
        avg     v4.8B,  v0.8B,  v1.8B
        ld1     {v0.8B}, [x1], x2
        avg     v5.8B,  v0.8B,  v1.8B
        ld1     {v1.8B}, [x1], x2
.if \avg
        ld1     {v2.8B}, [x0], x2
        ld1     {v3.8B}, [x0]
        urhadd  v4.8B,  v4.8B,  v2.8B
        urhadd  v5.8B,  v5.8B,  v3.8B
        sub     x0,  x0,  x2
.endif
        st1     {v4.8B}, [x0], x2
        st1     {v5.8B}, [x0], x2
        b.ne    1b

        avg     v4.8B,  v0.8B,  v1.8B
        ld1     {v0.8B}, [x1], x2
        avg     v5.8B,  v0.8B,  v1.8B
.if \avg
        ld1     {v2.8B}, [x0], x2
        ld1     {v3.8B}, [x0]
        urhadd  v4.8B,  v4.8B,  v2.8B
        urhadd  v5.8B,  v5.8B,  v3.8B
        sub     x0,  x0,  x2
.endif
        st1     {v4.8B}, [x0], x2
        st1     {v5.8B}, [x0], x2

        ret
.endm

.macro  pixels8_xy2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.16B}, [x1], x2
        ld1     {v1.16B}, [x1], x2
NRND    movi    v19.8H, #1
        ext     v4.16B, v0.16B, v4.16B, #1
        ext     v6.16B, v1.16B, v6.16B, #1
        uaddl   v16.8H, v0.8B,  v4.8B
        uaddl   v17.8H, v1.8B,  v6.8B
1:      subs    w3,  w3,  #2
        ld1     {v0.16B}, [x1], x2
        add     v18.8H, v16.8H, v17.8H
        ext     v4.16B, v0.16B, v4.16B, #1
NRND    add     v18.8H, v18.8H, v19.8H
        uaddl   v16.8H, v0.8B,  v4.8B
        mshrn   v5.8B,  v18.8H, #2
        ld1     {v1.16B}, [x1], x2
        add     v18.8H, v16.8H, v17.8H
.if \avg
        ld1     {v7.8B}, [x0]
        urhadd  v5.8B,  v5.8B,  v7.8B
.endif
NRND    add     v18.8H, v18.8H, v19.8H
        st1     {v5.8B}, [x0], x2
        mshrn   v7.8B,  v18.8H, #2
.if \avg
        ld1     {v5.8B}, [x0]
        urhadd  v7.8B,  v7.8B,  v5.8B
.endif
        ext     v6.16B, v1.16B, v6.16B, #1
        uaddl   v17.8H, v1.8B,  v6.8B
        st1     {v7.8B}, [x0], x2
        b.gt    1b

        ld1     {v0.16B}, [x1], x2
        add     v18.8H, v16.8H, v17.8H
        ext     v4.16B, v0.16B, v4.16B, #1
NRND    add     v18.8H, v18.8H, v19.8H
        uaddl   v16.8H, v0.8B,  v4.8B
        mshrn   v5.8B,  v18.8H, #2
        add     v18.8H, v16.8H, v17.8H
.if \avg
        ld1     {v7.8B}, [x0]
        urhadd  v5.8B,  v5.8B,  v7.8B
.endif
NRND    add     v18.8H, v18.8H, v19.8H
        st1     {v5.8B}, [x0], x2
        mshrn   v7.8B,  v18.8H, #2
.if \avg
        ld1     {v5.8B}, [x0]
        urhadd  v7.8B,  v7.8B,  v5.8B
.endif
        st1     {v7.8B}, [x0], x2

        ret
.endm

.macro  pixfunc pfx, name, suf, rnd=1, avg=0
.if \rnd
.macro avg rd, rn, rm
        urhadd  \rd, \rn, \rm
.endm
.macro mshrn rd, rn, rm
        rshrn   \rd, \rn, \rm
.endm
.macro mshrn2 rd, rn, rm
        rshrn2  \rd, \rn, \rm
.endm
.macro NRND insn:vararg
.endm
.else
.macro avg rd, rn, rm
        uhadd   \rd, \rn, \rm
.endm
.macro mshrn rd, rn, rm
        shrn    \rd, \rn, \rm
.endm
.macro mshrn2 rd, rn, rm
        shrn2   \rd, \rn, \rm
.endm
.macro NRND insn:vararg
        \insn
.endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
        \name   \rnd, \avg
endfunc
.purgem avg
.purgem mshrn
.purgem mshrn2
.purgem NRND
.endm

.macro  pixfunc2 pfx, name, avg=0
        pixfunc \pfx, \name, rnd=1, avg=\avg
        pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
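
// pixfunc emits one exported function per variant by temporarily defining
// avg/mshrn/mshrn2/NRND for the requested rounding mode and purging them
// afterwards; the qpel16/8 mc00 stubs below only set w3 (the height) and
// fall through into the pixels function instantiated right after them.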
function ff_put_h264_qpel16_mc00_neon, export=1
        mov     w3,  #16
endfunc

        pixfunc  put_, pixels16,     avg=0
        pixfunc2 put_, pixels16_x2,  avg=0
        pixfunc2 put_, pixels16_y2,  avg=0
        pixfunc2 put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov     w3,  #16
endfunc

        pixfunc  avg_, pixels16,     avg=1
        pixfunc2 avg_, pixels16_x2,  avg=1
        pixfunc2 avg_, pixels16_y2,  avg=1
        pixfunc2 avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov     w3,  #8
endfunc

        pixfunc  put_, pixels8,     avg=0
        pixfunc2 put_, pixels8_x2,  avg=0
        pixfunc2 put_, pixels8_y2,  avg=0
        pixfunc2 put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov     w3,  #8
endfunc

        pixfunc avg_, pixels8,     avg=1
        pixfunc avg_, pixels8_x2,  avg=1
        pixfunc avg_, pixels8_y2,  avg=1
        pixfunc avg_, pixels8_xy2, avg=1

28
externals/ffmpeg/libavcodec/aarch64/idct.h
vendored
Executable file
28
externals/ffmpeg/libavcodec/aarch64/idct.h
vendored
Executable file
@@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_IDCT_H
#define AVCODEC_AARCH64_IDCT_H

#include <stdint.h>

void ff_simple_idct_neon(int16_t *data);
void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);

#endif /* AVCODEC_AARCH64_IDCT_H */

45
externals/ffmpeg/libavcodec/aarch64/idctdsp_init_aarch64.c
vendored
Executable file
45
externals/ffmpeg/libavcodec/aarch64/idctdsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,45 @@
/*
 * ARM-NEON-optimized IDCT functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"
av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
|
||||
if (avctx->idct_algo == FF_IDCT_AUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLENEON) {
|
||||
c->idct_put = ff_simple_idct_put_neon;
|
||||
c->idct_add = ff_simple_idct_add_neon;
|
||||
c->idct = ff_simple_idct_neon;
|
||||
c->perm_type = FF_IDCT_PERM_PARTTRANS;
|
||||
}
|
||||
}
|
||||
}
|
||||
323
externals/ffmpeg/libavcodec/aarch64/mdct_neon.S
vendored
Executable file
@@ -0,0 +1,323 @@
/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

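// ff_imdct_half_neon works in three stages: pre-rotate the input by the
// twiddles in tcos while scattering the results through revtab, run the
// complex FFT via ff_fft_calc_neon, then post-rotate the FFT output
// in place, walking the buffer from both ends at once.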
function ff_imdct_half_neon, export=1
        sub             sp, sp, #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4, [x0, #32]           // tcos
        ldr             x3, [x0, #8]            // revtab
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #2            // n4 = n >> 2
        add             x7, x2, x12, lsl #1
        mov             x12, #-16
        sub             x7, x7, #16

        ld2             {v16.2s,v17.2s}, [x7], x12  // d16=x,n1 d17=x,n0
        ld2             {v0.2s,v1.2s}, [x2], #16    // d0 =m0,x d1 =m1,x
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s}, [x4], #16    // d2=c0,c1 d3=s0,s2
        fmul            v6.2s, v17.2s, v2.2s
        fmul            v7.2s, v0.2s, v2.2s
1:
        subs            x14, x14, #2
        ldr             w6, [x3], #4
        fmul            v4.2s, v0.2s, v3.2s
        fmul            v5.2s, v17.2s, v3.2s
        fsub            v4.2s, v6.2s, v4.2s
        fadd            v5.2s, v5.2s, v7.2s
        ubfm            x8, x6, #16, #31
        ubfm            x6, x6, #0, #15
        add             x8, x1, x8, lsl #3
        add             x6, x1, x6, lsl #3
        b.eq            2f
        ld2             {v16.2s,v17.2s}, [x7], x12
        ld2             {v0.2s,v1.2s}, [x2], #16
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s}, [x4], #16    // d2=c0,c1 d3=s0,s2
        fmul            v6.2s, v17.2s, v2.2s
        fmul            v7.2s, v0.2s, v2.2s
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]
        b               1b
2:
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]

        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4, [x19, #32]          // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3

        add             x4, x4, x14, lsl #3
        add             x6, x20, x14, lsl #3
        sub             x1, x4, #16
        sub             x3, x6, #16

        mov             x7, #-16
        mov             x8, x6
        mov             x0, x3

        ld2             {v0.2s,v1.2s}, [x3], x7     // d0 =i1,r1 d1 =i0,r0
        ld2             {v20.2s,v21.2s}, [x6], #16  // d20=i2,r2 d21=i3,r3
        ld2             {v16.2s,v17.2s}, [x1], x7   // d16=c1,c0 d18=s1,s0
3:
        subs            x14, x14, #2
        fmul            v7.2s, v0.2s, v17.2s
        ld2             {v18.2s,v19.2s}, [x4], #16  // d17=c2,c3 d19=s2,s3
        fmul            v4.2s, v1.2s, v17.2s
        fmul            v6.2s, v21.2s, v19.2s
        fmul            v5.2s, v20.2s, v19.2s
        fmul            v22.2s, v1.2s, v16.2s
        fmul            v23.2s, v21.2s, v18.2s
        fmul            v24.2s, v0.2s, v16.2s
        fmul            v25.2s, v20.2s, v18.2s
        fadd            v7.2s, v7.2s, v22.2s
        fadd            v5.2s, v5.2s, v23.2s
        fsub            v4.2s, v4.2s, v24.2s
        fsub            v6.2s, v6.2s, v25.2s
        b.eq            4f
        ld2             {v0.2s,v1.2s}, [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7   // d16=c1,c0 d18=s1,s0
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0], x7
        st2             {v6.2s,v7.2s}, [x8], #16
        b               3b
4:
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0]
        st2             {v6.2s,v7.2s}, [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        add             sp, sp, #32

        ret
endfunc

function ff_imdct_calc_neon, export=1
        sub             sp, sp, #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        ldr             w3, [x0, #28]           // mdct_bits
        mov             x19, #1
        mov             x20, x1
        lsl             x19, x19, x3
        add             x1, x1, x19

        bl              X(ff_imdct_half_neon)

        add             x0, x20, x19, lsl #2
        add             x1, x20, x19, lsl #1
        sub             x0, x0, #8
        sub             x2, x1, #16
        mov             x3, #-16
        mov             x6, #-8
1:
        ld1             {v0.4s}, [x2], x3
        prfum           pldl1keep, [x0, #-16]
        rev64           v0.4s, v0.4s
        ld1             {v2.2s,v3.2s}, [x1], #16
        fneg            v4.4s, v0.4s
        prfum           pldl1keep, [x2, #-16]
        rev64           v2.2s, v2.2s
        rev64           v3.2s, v3.2s
        ext             v4.16b, v4.16b, v4.16b, #8
        st1             {v2.2s}, [x0], x6
        st1             {v3.2s}, [x0], x6
        st1             {v4.4s}, [x20], #16
        subs            x19, x19, #16
        b.gt            1b

        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16

        ret
endfunc


function ff_mdct_calc_neon, export=1
        sub             sp, sp, #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]

        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4, [x0, #32]           // tcos
        ldr             x3, [x0, #8]            // revtab
        lsl             x14, x12, x14           // n = 1 << nbits
        add             x7, x2, x14             // in4u
        sub             x9, x7, #16             // in4d
        add             x2, x7, x14, lsl #1     // in3u
        add             x8, x9, x14, lsl #1     // in3d
        add             x5, x4, x14, lsl #1
        sub             x5, x5, #16
        sub             x3, x3, #4
        mov             x12, #-16
        lsr             x13, x14, #1

        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s, v17.2s, v0.2s        // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s, v1.2s                // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s, v3.2s                // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s, v2.2s, v19.2s        // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
1:
        fmul            v7.2s, v0.2s, v21.2s        //  I*s
        ldr             w10, [x3, x13]
        fmul            v6.2s, v2.2s, v20.2s        // -R*c
        ldr             w6, [x3, #4]!
        fmul            v4.2s, v2.2s, v21.2s        // -R*s
        fmul            v5.2s, v0.2s, v20.2s        //  I*c
        fmul            v24.2s, v16.2s, v30.2s      //  R*c
        fmul            v25.2s, v18.2s, v31.2s      // -I*s
        fmul            v22.2s, v16.2s, v31.2s      //  R*s
        fmul            v23.2s, v18.2s, v30.2s      //  I*c
        subs            x14, x14, #16
        subs            x13, x13, #8
        fsub            v6.2s, v6.2s, v7.2s         // -R*c-I*s
        fadd            v7.2s, v4.2s, v5.2s         // -R*s+I*c
        fsub            v24.2s, v25.2s, v24.2s      //  I*s-R*c
        fadd            v25.2s, v22.2s, v23.2s      //  R*s-I*c
        b.eq            1f
        mov             x12, #-16
        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        fneg            v7.2s, v7.2s                //  R*s-I*c
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s, v17.2s, v0.2s        // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s, v1.2s                // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s, v3.2s                // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s, v2.2s, v19.2s        // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
        ubfm            x12, x6, #16, #31
        ubfm            x6, x6, #0, #15
        add             x12, x1, x12, lsl #3
        add             x6, x1, x6, lsl #3
        st2             {v6.s,v7.s}[0], [x6]
        st2             {v6.s,v7.s}[1], [x12]
        ubfm            x6, x10, #16, #31
        ubfm            x10, x10, #0, #15
        add             x6, x1, x6, lsl #3
        add             x10, x1, x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]
        b               1b
1:
        fneg            v7.2s, v7.2s                //  R*s-I*c
        ubfm            x12, x6, #16, #31
        ubfm            x6, x6, #0, #15
        add             x12, x1, x12, lsl #3
        add             x6, x1, x6, lsl #3
        st2             {v6.s,v7.s}[0], [x6]
        st2             {v6.s,v7.s}[1], [x12]
        ubfm            x6, x10, #16, #31
        ubfm            x10, x10, #0, #15
        add             x6, x1, x6, lsl #3
        add             x10, x1, x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]

        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4, [x19, #32]          // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3

        add             x4, x4, x14, lsl #3
        add             x6, x20, x14, lsl #3
        sub             x1, x4, #16
        sub             x3, x6, #16

        mov             x7, #-16
        mov             x8, x6
        mov             x0, x3

        ld2             {v0.2s,v1.2s}, [x3], x7     // d0 =r1,i1 d1 =r0,i0
        ld2             {v20.2s,v21.2s}, [x6], #16  // d20=r2,i2 d21=r3,i3
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
1:
        subs            x14, x14, #2
        fmul            v7.2s, v0.2s, v17.2s        // r1*s1,r0*s0
        ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
        fmul            v4.2s, v1.2s, v17.2s        // i1*s1,i0*s0
        fmul            v6.2s, v21.2s, v19.2s       // i2*s2,i3*s3
        fmul            v5.2s, v20.2s, v19.2s       // r2*s2,r3*s3
        fmul            v24.2s, v0.2s, v16.2s       // r1*c1,r0*c0
        fmul            v25.2s, v20.2s, v18.2s      // r2*c2,r3*c3
        fmul            v22.2s, v21.2s, v18.2s      // i2*c2,i3*c3
        fmul            v23.2s, v1.2s, v16.2s       // i1*c1,i0*c0
        fadd            v4.2s, v4.2s, v24.2s        // i1*s1+r1*c1,i0*s0+r0*c0
        fadd            v6.2s, v6.2s, v25.2s        // i2*s2+r2*c2,i3*s3+r3*c3
        fsub            v5.2s, v22.2s, v5.2s        // i2*c2-r2*s2,i3*c3-r3*s3
        fsub            v7.2s, v23.2s, v7.2s        // i1*c1-r1*s1,i0*c0-r0*s0
        fneg            v4.2s, v4.2s
        fneg            v6.2s, v6.2s
        b.eq            1f
        ld2             {v0.2s, v1.2s}, [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0], x7
        st2             {v6.2s,v7.2s}, [x8], #16
        b               1b
1:
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0]
        st2             {v6.2s,v7.2s}, [x8]

        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16
        ret
endfunc
40
externals/ffmpeg/libavcodec/aarch64/mpegaudiodsp_init.c
vendored
Executable file
@@ -0,0 +1,40 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
#include "config.h"

void ff_mpadsp_apply_window_fixed_neon(int32_t *synth_buf, int32_t *window,
                                       int *dither, int16_t *samples, ptrdiff_t incr);
void ff_mpadsp_apply_window_float_neon(float *synth_buf, float *window,
                                       int *dither, float *samples, ptrdiff_t incr);

av_cold void ff_mpadsp_init_aarch64(MPADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->apply_window_fixed = ff_mpadsp_apply_window_fixed_neon;
        s->apply_window_float = ff_mpadsp_apply_window_float_neon;
    }
}
225
externals/ffmpeg/libavcodec/aarch64/mpegaudiodsp_neon.S
vendored
Executable file
@@ -0,0 +1,225 @@
/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define FRAC_BITS   23   // fractional bits for sb_samples and dct
#define WFRAC_BITS  16   // fractional bits for window
#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)

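// Byte-shuffle table for TBL: indices 12..15, 8..11, 4..7, 0..3 reverse the
// order of the four 32-bit lanes of a 128-bit vector in a single tbl
// instruction (used below to mirror the window coefficients).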
const   tbl_rev128_s, align=4
        .byte           12, 13, 14, 15
        .byte            8,  9, 10, 11
        .byte            4,  5,  6,  7
        .byte            0,  1,  2,  3
endconst

.macro  apply_window type, st
function ff_mpadsp_apply_window_\type\()_neon, export=1
        mov             x7, x0
        add             x8, x0, #512<<2
        ld1             {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64
        st1             {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64
        movrel          x15, tbl_rev128_s
        ld1             {v27.4s}, [x15]
.ifc \type, fixed
        lsl             x4, x4, #1
.else
        lsl             x4, x4, #2
.endif
        add             x10, x0, #45<<2
        add             x0, x0, #16<<2
        add             x1, x1, #16<<2
        add             x5, x3, x4, lsl #5
        sub             x5, x5, x4              // samples2
        neg             x13, x4                 // -incr
        mov             x9, #64<<2
.ifc \type, fixed
        ld1r            {v16.2s}, [x2]          // dither_state
        sxtl            v16.2d, v16.2s
        movi            v29.2d, #0
        movi            v30.2d, #(1<<OUT_SHIFT)-1
        trn1            v31.2d, v29.2d, v30.2d
        trn2            v30.2d, v30.2d, v29.2d
        trn1            v16.2d, v16.2d, v29.2d
.else
        movi            v16.4s, #0
        movi            v28.4s, #0
.endif
        mov             x14, #4
1:
        mov             x8, x0
        sub             x7, x1, #3<<2
        sub             x6, x1, x14, lsl #4
        add             x7, x7, x14, lsl #4
        add             x11, x6, #(32)<<2       // w  + 32
        add             x12, x7, #(32)<<2       // w2 + 32
        mov             x15, #8
        movi            v17.2d, #0
        movi            v18.2d, #0
        movi            v19.2d, #0
2:
        subs            x15, x15, #1
        ld1             {v0.4s}, [x8], x9
        ld1             {v1.4s}, [x10], x9
        ld1             {v2.4s}, [x6], x9
        ld1             {v3.4s}, [x7], x9
        tbl             v6.16b, {v0.16b}, v27.16b
        tbl             v7.16b, {v1.16b}, v27.16b
        ld1             {v4.4s}, [x11], x9
        ld1             {v5.4s}, [x12], x9
        MLA             v16, v2, v0
        MLA2            v17, v2, v0
        MLS             v18, v3, v6
        MLS2            v19, v3, v6
        MLS             v16, v4, v7
        MLS2            v17, v4, v7
        MLS             v18, v5, v1
        MLS2            v19, v5, v1
        b.gt            2b

        cmp             x14, #4
        sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)

.ifc \type, fixed
        and             v28.16b, v16.16b, v30.16b
        ext             v28.16b, v29.16b, v28.16b, #8

        b.eq            4f
        round_sample    v19, 1, 1
4:
        round_sample    v16, 1, 0
        shrn            v16.2s, v16.2d, #OUT_SHIFT
        round_sample    v19, 0, 0
        shrn            v19.2s, v19.2d, #OUT_SHIFT
        round_sample    v17, 0, 1
        round_sample    v18, 1, 1
        round_sample    v17, 1, 0
        shrn2           v16.4s, v17.2d, #OUT_SHIFT
        round_sample    v18, 0, 0
        shrn2           v19.4s, v18.2d, #OUT_SHIFT
        sqxtn           v16.4h, v16.4s
        sqxtn           v18.4h, v19.4s
.else
        ext             v18.16b, v18.16b, v18.16b, #8
.endif

        st1             {v16.\st\()}[0], [x3], x4
        b.eq            4f
        st1             {v18.\st\()}[1], [x5], x13
4:
        st1             {v16.\st\()}[1], [x3], x4
        st1             {v18.\st\()}[0], [x5], x13
        st1             {v16.\st\()}[2], [x3], x4
        st1             {v18.\st\()}[3], [x5], x13
        st1             {v16.\st\()}[3], [x3], x4
        st1             {v18.\st\()}[2], [x5], x13

        mov             v16.16b, v28.16b

        subs            x14, x14, #1
        add             x0, x0, #4<<2
        sub             x10, x10, #4<<2
        b.gt            1b

// computing samples[16]
        add             x6, x1, #32<<2
        ld1             {v0.2s}, [x6], x9
        ld1             {v1.2s}, [x0], x9
.rept 3
        ld1             {v2.2s}, [x6], x9
        ld1             {v3.2s}, [x0], x9
        MLS             v16, v0, v1
        ld1             {v0.2s}, [x6], x9
        ld1             {v1.2s}, [x0], x9
        MLS             v16, v2, v3
.endr
        ld1             {v2.2s}, [x6], x9
        ld1             {v3.2s}, [x0], x9
        MLS             v16, v0, v1
        MLS             v16, v2, v3

.ifc \type, fixed
        and             v28.16b, v16.16b, v30.16b
        shrn            v20.2s, v16.2d, #OUT_SHIFT
        xtn             v28.2s, v28.2d
        sqxtn           v20.4h, v20.4s
        st1             {v28.s}[0], [x2]        // save dither_state
        st1             {v20.h}[0], [x3]
.else
        st1             {v16.s}[0], [x3]
.endif

        ret
endfunc
.purgem round_sample
.purgem MLA
.purgem MLA2
.purgem MLS
.purgem MLS2
.endm

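// round_sample carries the fixed-point rounding/dither remainder across
// samples in v28: it adds the previous remainder to the 64-bit accumulator
// \r, masks out the low OUT_SHIFT bits of the lane just consumed (v30/v31
// select lane 0 or lane 1), and ext-rotates the remainder into the lane the
// next sample will use when \idx != \next.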
.macro  round_sample r, idx, next
        add             \r\().2d, \r\().2d, v28.2d
.if \idx == 0
        and             v28.16b, \r\().16b, v30.16b
.else // \idx == 1
        and             v28.16b, \r\().16b, v31.16b
.endif
.if \idx != \next
.if \next == 0
        ext             v28.16b, v28.16b, v29.16b, #8
.else
        ext             v28.16b, v29.16b, v28.16b, #8
.endif
.endif
.endm
.macro  MLA d, s1, s2
        smlal           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLA2 d, s1, s2
        smlal2          \d\().2d, \s1\().4s, \s2\().4s
.endm
.macro  MLS d, s1, s2
        smlsl           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLS2 d, s1, s2
        smlsl2          \d\().2d, \s1\().4s, \s2\().4s
.endm
apply_window fixed, h


// nothing to do for round_sample and ML{A,S}2
.macro  round_sample r, idx, next
.endm
.macro  MLA2 d, s1, s2
.endm
.macro  MLS2 d, s1, s2
.endm
.macro  MLA d, s1, s2
        fmla            \d\().4s, \s1\().4s, \s2\().4s
.endm
.macro  MLS d, s1, s2
        fmls            \d\().4s, \s1\().4s, \s2\().4s
.endm
apply_window float, s
149
externals/ffmpeg/libavcodec/aarch64/neon.S
vendored
Executable file
@@ -0,0 +1,149 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

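// The transpose_* macros below implement in-register matrix transposes
// entirely with trn1/trn2: element pairs are interleaved at byte, then
// halfword, then word (and doubleword) granularity, doubling the swapped
// block size at each stage.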
.macro  transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B

        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H

        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S

        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S

        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S

        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
.endm

.macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
        trn1            \t0\().16B, \r0\().16B, \r1\().16B
        trn2            \t1\().16B, \r0\().16B, \r1\().16B
        trn1            \r1\().16B, \r2\().16B, \r3\().16B
        trn2            \r3\().16B, \r2\().16B, \r3\().16B
        trn1            \r0\().16B, \r4\().16B, \r5\().16B
        trn2            \r5\().16B, \r4\().16B, \r5\().16B
        trn1            \r2\().16B, \r6\().16B, \r7\().16B
        trn2            \r7\().16B, \r6\().16B, \r7\().16B

        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H

        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S

        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S

        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S

        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
.endm

.macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().16B, \r0\().16B, \r1\().16B
        trn2            \t5\().16B, \r0\().16B, \r1\().16B
        trn1            \t6\().16B, \r2\().16B, \r3\().16B
        trn2            \t7\().16B, \r2\().16B, \r3\().16B

        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
.endm

.macro  transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B

        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
.endm

.macro  transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
.endm

.macro  transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H

        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S

        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D

        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D

        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D

        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D
.endm
99
externals/ffmpeg/libavcodec/aarch64/neontest.c
vendored
Executable file
@@ -0,0 +1,99 @@
/*
 * check NEON registers for clobbers
 * Copyright (c) 2013 Martin Storsjo
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/avcodec.h"
#include "libavutil/aarch64/neontest.h"

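/*
 * wrap() and testneonclobbers() come from libavutil/aarch64/neontest.h:
 * each wrapper below shadows the real libavcodec entry point, fills the
 * callee-saved NEON registers with a known pattern before calling the real
 * function, and aborts if that pattern was clobbered on return.
 */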
wrap(avcodec_open2(AVCodecContext *avctx,
                   const AVCodec *codec,
                   AVDictionary **options))
{
    testneonclobbers(avcodec_open2, avctx, codec, options);
}

wrap(avcodec_decode_audio4(AVCodecContext *avctx,
                           AVFrame *frame,
                           int *got_frame_ptr,
                           AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_audio4, avctx, frame,
                     got_frame_ptr, avpkt);
}

wrap(avcodec_decode_video2(AVCodecContext *avctx,
                           AVFrame *picture,
                           int *got_picture_ptr,
                           AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_video2, avctx, picture,
                     got_picture_ptr, avpkt);
}

wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
                              AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_subtitle2, avctx, sub,
                     got_sub_ptr, avpkt);
}

wrap(avcodec_encode_audio2(AVCodecContext *avctx,
                           AVPacket *avpkt,
                           const AVFrame *frame,
                           int *got_packet_ptr))
{
    testneonclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
                     got_packet_ptr);
}

wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
                             uint8_t *buf, int buf_size,
                             const AVSubtitle *sub))
{
    testneonclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
}

wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
                           const AVFrame *frame, int *got_packet_ptr))
{
    testneonclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
}

wrap(avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt))
{
    testneonclobbers(avcodec_send_packet, avctx, avpkt);
}

wrap(avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt))
{
    testneonclobbers(avcodec_receive_packet, avctx, avpkt);
}

wrap(avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame))
{
    testneonclobbers(avcodec_send_frame, avctx, frame);
}

wrap(avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame))
{
    testneonclobbers(avcodec_receive_frame, avctx, frame);
}
35
externals/ffmpeg/libavcodec/aarch64/opusdsp_init.c
vendored
Executable file
@@ -0,0 +1,35 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/opusdsp.h"

void ff_opus_postfilter_neon(float *data, int period, float *gains, int len);
float ff_opus_deemphasis_neon(float *out, float *in, float coeff, int len);

av_cold void ff_opus_dsp_init_aarch64(OpusDSP *ctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        ctx->postfilter = ff_opus_postfilter_neon;
        ctx->deemphasis = ff_opus_deemphasis_neon;
    }
}
113
externals/ffmpeg/libavcodec/aarch64/opusdsp_neon.S
vendored
Executable file
@@ -0,0 +1,113 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

//        0.85..^1    0.85..^2    0.85..^3    0.85..^4
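// The .word values are the IEEE-754 bit patterns of those powers of the
// Opus deemphasis coefficient (0x3f599a00 ~= 0.85); tab_x0..tab_x2 are the
// same coefficients shifted by one lane each, so the recursive deemphasis
// filter can be evaluated four samples at a time.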
const   tab_st, align=4
        .word 0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f
endconst
const   tab_x0, align=4
        .word 0x0, 0x3f599a00, 0x3f38f671, 0x3f1d382a
endconst
const   tab_x1, align=4
        .word 0x0, 0x0, 0x3f599a00, 0x3f38f671
endconst
const   tab_x2, align=4
        .word 0x0, 0x0, 0x0, 0x3f599a00
endconst

function ff_opus_deemphasis_neon, export=1
        movrel          x4, tab_st
        ld1             {v4.4s}, [x4]
        movrel          x4, tab_x0
        ld1             {v5.4s}, [x4]
        movrel          x4, tab_x1
        ld1             {v6.4s}, [x4]
        movrel          x4, tab_x2
        ld1             {v7.4s}, [x4]

        fmul            v0.4s, v4.4s, v0.s[0]

1:      ld1             {v1.4s, v2.4s}, [x1], #32

        fmla            v0.4s, v5.4s, v1.s[0]
        fmul            v3.4s, v7.4s, v2.s[2]

        fmla            v0.4s, v6.4s, v1.s[1]
        fmla            v3.4s, v6.4s, v2.s[1]

        fmla            v0.4s, v7.4s, v1.s[2]
        fmla            v3.4s, v5.4s, v2.s[0]

        fadd            v1.4s, v1.4s, v0.4s
        fadd            v2.4s, v2.4s, v3.4s

        fmla            v2.4s, v4.4s, v1.s[3]

        st1             {v1.4s, v2.4s}, [x0], #32
        fmul            v0.4s, v4.4s, v2.s[3]

        subs            w2, w2, #8
        b.gt            1b

        mov             s0, v2.s[3]

        ret
endfunc

function ff_opus_postfilter_neon, export=1
        ld1             {v0.4s}, [x2]
        dup             v1.4s, v0.s[1]
        dup             v2.4s, v0.s[2]
        dup             v0.4s, v0.s[0]

        add             w1, w1, #2
        sub             x1, x0, x1, lsl #2

        ld1             {v3.4s}, [x1]
        fmul            v3.4s, v3.4s, v2.4s

1:      add             x1, x1, #4
        ld1             {v4.4s}, [x1]
        add             x1, x1, #4
        ld1             {v5.4s}, [x1]
        add             x1, x1, #4
        ld1             {v6.4s}, [x1]
        add             x1, x1, #4
        ld1             {v7.4s}, [x1]

        fmla            v3.4s, v7.4s, v2.4s
        fadd            v6.4s, v6.4s, v4.4s

        ld1             {v4.4s}, [x0]
        fmla            v4.4s, v5.4s, v0.4s

        fmul            v6.4s, v6.4s, v1.4s
        fadd            v6.4s, v6.4s, v3.4s

        fadd            v4.4s, v4.4s, v6.4s
        fmul            v3.4s, v7.4s, v2.4s

        st1             {v4.4s}, [x0], #16

        subs            w3, w3, #4
        b.gt            1b

        ret
endfunc
46
externals/ffmpeg/libavcodec/aarch64/pixblockdsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,46 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"

void ff_get_pixels_neon(int16_t *block, const uint8_t *pixels,
                        ptrdiff_t stride);
void ff_diff_pixels_neon(int16_t *block, const uint8_t *s1,
                         const uint8_t *s2, ptrdiff_t stride);

av_cold void ff_pixblockdsp_init_aarch64(PixblockDSPContext *c,
                                         AVCodecContext *avctx,
                                         unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        if (!high_bit_depth) {
            c->get_pixels_unaligned =
            c->get_pixels = ff_get_pixels_neon;
        }
        c->diff_pixels_unaligned =
        c->diff_pixels = ff_diff_pixels_neon;
    }
}
51
externals/ffmpeg/libavcodec/aarch64/pixblockdsp_neon.S
vendored
Executable file
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2020 Martin Storsjo
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

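// ff_get_pixels_neon widens one 8x8 block of 8-bit pixels to int16, two
// rows per iteration; ff_diff_pixels_neon instead stores the widened
// per-pixel difference s1 - s2, likewise two rows at a time.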
function ff_get_pixels_neon, export=1
        mov             w3, #8
1:
        ld1             {v0.8b}, [x1], x2
        subs            w3, w3, #2
        ld1             {v1.8b}, [x1], x2
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b

        ret
endfunc

function ff_diff_pixels_neon, export=1
        mov             w4, #8
1:
        ld1             {v0.8b}, [x1], x3
        ld1             {v1.8b}, [x2], x3
        subs            w4, w4, #2
        ld1             {v2.8b}, [x1], x3
        usubl           v0.8h, v0.8b, v1.8b
        ld1             {v3.8b}, [x2], x3
        usubl           v1.8h, v2.8b, v3.8b
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b

        ret
endfunc
48
externals/ffmpeg/libavcodec/aarch64/rv40dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,48 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/rv34dsp.h"

#include "config.h"

void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

av_cold void ff_rv40dsp_init_aarch64(RV34DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
        c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
    }
}
70
externals/ffmpeg/libavcodec/aarch64/sbrdsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,70 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/sbrdsp.h"

void ff_sbr_sum64x5_neon(float *z);
float ff_sbr_sum_square_neon(float (*x)[2], int n);
void ff_sbr_neg_odd_64_neon(float *x);
void ff_sbr_qmf_pre_shuffle_neon(float *z);
void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
                           const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
                        const float alpha0[2], const float alpha1[2],
                        float bw, int start, int end);
void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);

av_cold void ff_sbrdsp_init_aarch64(SBRDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->sum64x5 = ff_sbr_sum64x5_neon;
        s->sum_square = ff_sbr_sum_square_neon;
        s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
        s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
        s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
        s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
        s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
        s->hf_g_filt = ff_sbr_hf_g_filt_neon;
        s->hf_gen = ff_sbr_hf_gen_neon;
        s->autocorrelate = ff_sbr_autocorrelate_neon;
        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
    }
}
327
externals/ffmpeg/libavcodec/aarch64/sbrdsp_neon.S
vendored
Executable file
@@ -0,0 +1,327 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

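// factors is a {+1,-1,+1,-1} sign mask for alternating complex components;
// the phi_noise_* tables hold the four SBR noise phase rotations
// (multiplication by 1, +/-i, -1, -/+i), with the second row of the
// two-row tables selected by the parity of the kx argument.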
const   factors, align=4
        .float 1.0, -1.0, 1.0, -1.0
endconst

const   phi_noise_0, align=4
        .float 1.0, 0.0, 1.0, 0.0
endconst

const   phi_noise_1, align=4
        .float 0.0,  1.0, 0.0, -1.0
        .float 0.0, -1.0, 0.0,  1.0
endconst

const   phi_noise_2, align=4
        .float -1.0, 0.0, -1.0, 0.0
endconst

const   phi_noise_3, align=4
        .float 0.0, -1.0, 0.0,  1.0
        .float 0.0,  1.0, 0.0, -1.0
endconst

function ff_sbr_sum64x5_neon, export=1
        add             x1, x0, #64*4
        add             x2, x0, #128*4
        add             x3, x0, #192*4
        add             x4, x0, #256*4
        mov             x5, #64
1:      ld1             {v0.4S}, [x0]
        ld1             {v1.4S}, [x1], #16
        fadd            v0.4S, v0.4S, v1.4S
        ld1             {v2.4S}, [x2], #16
        fadd            v0.4S, v0.4S, v2.4S
        ld1             {v3.4S}, [x3], #16
        fadd            v0.4S, v0.4S, v3.4S
        ld1             {v4.4S}, [x4], #16
        fadd            v0.4S, v0.4S, v4.4S
        st1             {v0.4S}, [x0], #16
        subs            x5, x5, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_sum_square_neon, export=1
        movi            v0.4S, #0
1:      ld1             {v1.4S}, [x0], #16
        fmla            v0.4S, v1.4S, v1.4S
        subs            w1, w1, #2
        b.gt            1b
        faddp           v0.4S, v0.4S, v0.4S
        faddp           v0.4S, v0.4S, v0.4S
        ret
endfunc

function ff_sbr_neg_odd_64_neon, export=1
        mov             x1, x0
        movi            v5.4S, #1<<7, lsl #24
        ld2             {v0.4S, v1.4S}, [x0], #32
        eor             v1.16B, v1.16B, v5.16B
        ld2             {v2.4S, v3.4S}, [x0], #32
.rept 3
        st2             {v0.4S, v1.4S}, [x1], #32
        eor             v3.16B, v3.16B, v5.16B
        ld2             {v0.4S, v1.4S}, [x0], #32
        st2             {v2.4S, v3.4S}, [x1], #32
        eor             v1.16B, v1.16B, v5.16B
        ld2             {v2.4S, v3.4S}, [x0], #32
.endr
        eor             v3.16B, v3.16B, v5.16B
        st2             {v0.4S, v1.4S}, [x1], #32
        st2             {v2.4S, v3.4S}, [x1], #32
        ret
endfunc

function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             x1, x0, #60*4
        add             x2, x0, #64*4
        mov             x3, #-16
        mov             x4, #-4
        movi            v6.4S, #1<<7, lsl #24
        ld1             {v0.2S}, [x0], #8
        st1             {v0.2S}, [x2], #8
.rept 7
        ld1             {v1.4S}, [x1], x3
        ld1             {v2.4S}, [x0], #16
        eor             v1.16B, v1.16B, v6.16B
        rev64           v1.4S, v1.4S
        ext             v1.16B, v1.16B, v1.16B, #8
        st2             {v1.4S, v2.4S}, [x2], #32
.endr
        add             x1, x1, #8
        ld1             {v1.2S}, [x1], x4
        ld1             {v2.2S}, [x0], #8
        ld1             {v1.S}[3], [x1]
        ld1             {v2.S}[2], [x0]
        eor             v1.16B, v1.16B, v6.16B
        rev64           v1.4S, v1.4S
        st2             {v1.2S, v2.2S}, [x2], #16
        st2             {v1.S, v2.S}[2], [x2]
        ret
endfunc

function ff_sbr_qmf_post_shuffle_neon, export=1
        add             x2, x1, #60*4
        mov             x3, #-16
        mov             x4, #32
        movi            v6.4S, #1<<7, lsl #24
1:      ld1             {v0.4S}, [x2], x3
        ld1             {v1.4S}, [x1], #16
        eor             v0.16B, v0.16B, v6.16B
        rev64           v0.4S, v0.4S
        ext             v0.16B, v0.16B, v0.16B, #8
        st2             {v0.4S, v1.4S}, [x0], #32
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_qmf_deint_neg_neon, export=1
        add             x1, x1, #56*4
        add             x2, x0, #60*4
        mov             x3, #-32
        mov             x4, #32
        movi            v2.4S, #1<<7, lsl #24
1:      ld2             {v0.4S, v1.4S}, [x1], x3
        eor             v0.16B, v0.16B, v2.16B
        rev64           v1.4S, v1.4S
        ext             v1.16B, v1.16B, v1.16B, #8
        st1             {v0.4S}, [x2]
        st1             {v1.4S}, [x0], #16
        sub             x2, x2, #16
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_qmf_deint_bfly_neon, export=1
        add             x2, x2, #60*4
        add             x3, x0, #124*4
        mov             x4, #64
        mov             x5, #-16
1:      ld1             {v0.4S}, [x1], #16
        ld1             {v1.4S}, [x2], x5
        rev64           v2.4S, v0.4S
        ext             v2.16B, v2.16B, v2.16B, #8
        rev64           v3.4S, v1.4S
        ext             v3.16B, v3.16B, v3.16B, #8
        fadd            v1.4S, v1.4S, v2.4S
        fsub            v0.4S, v0.4S, v3.4S
        st1             {v0.4S}, [x0], #16
        st1             {v1.4S}, [x3], x5
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_hf_gen_neon, export=1
        sxtw            x4, w4
        sxtw            x5, w5
        movrel          x6, factors
        ld1             {v7.4S}, [x6]
        dup             v1.4S, v0.S[0]
        mov             v2.8B, v1.8B
        mov             v2.S[2], v7.S[0]
        mov             v2.S[3], v7.S[0]
        fmul            v1.4S, v1.4S, v2.4S
        ld1             {v0.D}[0], [x3]
        ld1             {v0.D}[1], [x2]
        fmul            v0.4S, v0.4S, v1.4S
        fmul            v1.4S, v0.4S, v7.4S
        rev64           v0.4S, v0.4S
        sub             x7, x5, x4
        add             x0, x0, x4, lsl #3
        add             x1, x1, x4, lsl #3
        sub             x1, x1, #16
1:      ld1             {v2.4S}, [x1], #16
        ld1             {v3.2S}, [x1]
        fmul            v4.4S, v2.4S, v1.4S
        fmul            v5.4S, v2.4S, v0.4S
        faddp           v4.4S, v4.4S, v4.4S
        faddp           v5.4S, v5.4S, v5.4S
        faddp           v4.4S, v4.4S, v4.4S
        faddp           v5.4S, v5.4S, v5.4S
        mov             v4.S[1], v5.S[0]
        fadd            v4.2S, v4.2S, v3.2S
        st1             {v4.2S}, [x0], #8
        sub             x1, x1, #8
        subs            x7, x7, #1
        b.gt            1b
        ret
endfunc

function ff_sbr_hf_g_filt_neon, export=1
        sxtw            x3, w3
        sxtw            x4, w4
        mov             x5, #40*2*4
        add             x1, x1, x4, lsl #3
1:      ld1             {v0.2S}, [x1], x5
        ld1             {v1.S}[0], [x2], #4
        fmul            v2.4S, v0.4S, v1.S[0]
        st1             {v2.2S}, [x0], #8
        subs            x3, x3, #1
        b.gt            1b
        ret
endfunc

function ff_sbr_autocorrelate_neon, export=1
        mov             x2, #38
        movrel          x3, factors
        ld1             {v0.4S}, [x3]
        movi            v1.4S, #0
        movi            v2.4S, #0
        movi            v3.4S, #0
        ld1             {v4.2S}, [x0], #8
        ld1             {v5.2S}, [x0], #8
        fmul            v16.2S, v4.2S, v4.2S
        fmul            v17.2S, v5.2S, v4.S[0]
        fmul            v18.2S, v5.2S, v4.S[1]
1:      ld1             {v5.D}[1], [x0], #8
        fmla            v1.2S, v4.2S, v4.2S
        fmla            v2.4S, v5.4S, v4.S[0]
        fmla            v3.4S, v5.4S, v4.S[1]
        mov             v4.D[0], v5.D[0]
        mov             v5.D[0], v5.D[1]
        subs            x2, x2, #1
        b.gt            1b
        fmul            v19.2S, v4.2S, v4.2S
        fmul            v20.2S, v5.2S, v4.S[0]
        fmul            v21.2S, v5.2S, v4.S[1]
        fadd            v22.4S, v2.4S, v20.4S
        fsub            v22.4S, v22.4S, v17.4S
        fadd            v23.4S, v3.4S, v21.4S
        fsub            v23.4S, v23.4S, v18.4S
        rev64           v23.4S, v23.4S
        fmul            v23.4S, v23.4S, v0.4S
        fadd            v22.4S, v22.4S, v23.4S
        st1             {v22.4S}, [x1], #16
        fadd            v23.2S, v1.2S, v19.2S
        fsub            v23.2S, v23.2S, v16.2S
        faddp           v23.2S, v23.2S, v23.2S
        st1             {v23.S}[0], [x1]
        add             x1, x1, #8
        rev64           v3.2S, v3.2S
        fmul            v3.2S, v3.2S, v0.2S
        fadd            v2.2S, v2.2S, v3.2S
        st1             {v2.2S}, [x1]
        add             x1, x1, #16
        faddp           v1.2S, v1.2S, v1.2S
        st1             {v1.S}[0], [x1]
        ret
endfunc

.macro  apply_noise_common
        sxtw            x3, w3
        sxtw            x5, w5
        movrel          x7, X(ff_sbr_noise_table)
        add             x3, x3, #1
1:      and             x3, x3, #0x1ff
        add             x8, x7, x3, lsl #3
        add             x3, x3, #2
        ld1             {v2.4S}, [x0]
        ld1             {v3.2S}, [x1], #8
        ld1             {v4.2S}, [x2], #8
        ld1             {v5.4S}, [x8]
        mov             v6.16B, v2.16B
        zip1            v3.4S, v3.4S, v3.4S
        zip1            v4.4S, v4.4S, v4.4S
        fmla            v6.4S, v1.4S, v3.4S
        fmla            v2.4S, v5.4S, v4.4S
        fcmeq           v7.4S, v3.4S, #0
        bif             v2.16B, v6.16B, v7.16B
        st1             {v2.4S}, [x0], #16
        subs            x5, x5, #2
        b.gt            1b
.endm

function ff_sbr_hf_apply_noise_0_neon, export=1
        movrel          x9, phi_noise_0
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

function ff_sbr_hf_apply_noise_1_neon, export=1
        movrel          x9, phi_noise_1
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

function ff_sbr_hf_apply_noise_2_neon, export=1
        movrel          x9, phi_noise_2
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

function ff_sbr_hf_apply_noise_3_neon, export=1
        movrel          x9, phi_noise_3
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc
362
externals/ffmpeg/libavcodec/aarch64/simple_idct_neon.S
vendored
Executable file
@@ -0,0 +1,362 @@
/*
 * ARM NEON IDCT
 *
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * Based on Simple IDCT
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define Z1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z6   8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z7   4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define z1 v0.H[0]
#define z2 v0.H[1]
#define z3 v0.H[2]
#define z4 v0.H[3]
#define z5 v0.H[4]
#define z6 v0.H[5]
#define z7 v0.H[6]
#define z4c v0.H[7]

const   idct_coeff_neon, align=4
        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
endconst

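// idct_start/idct_end: the column passes below are reached with bl, which
// clobbers x30, so the outer return address is parked in x10
// (mov x10, x30) and idct_end returns with br x10 instead of ret.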
.macro  idct_start data
        prfm            pldl1keep, [\data]
        mov             x10, x30
        movrel          x3, idct_coeff_neon
        ld1             {v0.2D}, [x3]
.endm

.macro  idct_end
        br              x10
.endm

.macro  smull1 a, b, c
        smull           \a, \b, \c
.endm

.macro  smlal1 a, b, c
        smlal           \a, \b, \c
.endm

.macro  smlsl1 a, b, c
        smlsl           \a, \b, \c
.endm

.macro  idct_col4_top y1, y2, y3, y4, i, l
        smull\i         v7.4S,  \y3\l, z2
        smull\i         v16.4S, \y3\l, z6
        smull\i         v17.4S, \y2\l, z1
        add             v19.4S, v23.4S, v7.4S
        smull\i         v18.4S, \y2\l, z3
        add             v20.4S, v23.4S, v16.4S
        smull\i         v5.4S,  \y2\l, z5
        sub             v21.4S, v23.4S, v16.4S
        smull\i         v6.4S,  \y2\l, z7
        sub             v22.4S, v23.4S, v7.4S

        smlal\i         v17.4S, \y4\l, z3
        smlsl\i         v18.4S, \y4\l, z7
        smlsl\i         v5.4S,  \y4\l, z1
        smlsl\i         v6.4S,  \y4\l, z5
.endm

.macro  idct_row4_neon y1, y2, y3, y4, pass
        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
        movi            v23.4S, #1<<2, lsl #8
        orr             v5.16B, \y1\().16B, \y2\().16B
        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
        orr             v6.16B, \y3\().16B, \y4\().16B
        orr             v5.16B, v5.16B, v6.16B
        mov             x3, v5.D[1]
        smlal           v23.4S, \y1\().4H, z4

        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H

        cmp             x3, #0
        b.eq            \pass\()f

        smull2          v7.4S,  \y1\().8H, z4
        smlal2          v17.4S, \y2\().8H, z5
        smlsl2          v18.4S, \y2\().8H, z1
        smull2          v16.4S, \y3\().8H, z2
        smlal2          v5.4S,  \y2\().8H, z7
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S
        smlal2          v6.4S,  \y2\().8H, z3
        smull2          v7.4S,  \y3\().8H, z6
        smlal2          v17.4S, \y4\().8H, z7
        smlsl2          v18.4S, \y4\().8H, z5
        smlal2          v5.4S,  \y4\().8H, z3
        smlsl2          v6.4S,  \y4\().8H, z1
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S
        sub             v22.4S, v22.4S, v7.4S

\pass:  add             \y3\().4S, v19.4S, v17.4S
        add             \y4\().4S, v20.4S, v18.4S
        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
        add             v7.4S,  v21.4S, v5.4S
        add             v16.4S, v22.4S, v6.4S
        shrn            \y3\().4H, v7.4S,  #ROW_SHIFT
        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
        sub             v22.4S, v22.4S, v6.4S
        sub             v19.4S, v19.4S, v17.4S
        sub             v21.4S, v21.4S, v5.4S
        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
        sub             v20.4S, v20.4S, v18.4S
        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT

        trn1            v16.8H, \y1\().8H, \y2\().8H
        trn2            v17.8H, \y1\().8H, \y2\().8H
        trn1            v18.8H, \y3\().8H, \y4\().8H
        trn2            v19.8H, \y3\().8H, \y4\().8H
        trn1            \y1\().4S, v16.4S, v18.4S
        trn1            \y2\().4S, v17.4S, v19.4S
        trn2            \y3\().4S, v16.4S, v18.4S
        trn2            \y4\().4S, v17.4S, v19.4S
.endm

.macro  declare_idct_col4_neon i, l
function idct_col4_neon\i
        dup             v23.4H, z4c
.if \i == 1
        add             v23.4H, v23.4H, v24.4H
.else
        mov             v5.D[0], v24.D[1]
        add             v23.4H, v23.4H, v5.4H
.endif
        smull           v23.4S, v23.4H, z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

        mov             x4, v28.D[\i - 1]
        mov             x5, v29.D[\i - 1]
        cmp             x4, #0
        b.eq            1f

        smull\i         v7.4S,  v28\l, z4
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S

1:      mov             x4, v30.D[\i - 1]
        cmp             x5, #0
        b.eq            2f

        smlal\i         v17.4S, v29\l, z5
        smlsl\i         v18.4S, v29\l, z1
        smlal\i         v5.4S,  v29\l, z7
        smlal\i         v6.4S,  v29\l, z3

2:      mov             x5, v31.D[\i - 1]
        cmp             x4, #0
        b.eq            3f

        smull\i         v7.4S,  v30\l, z6
        smull\i         v16.4S, v30\l, z2
        add             v19.4S, v19.4S, v7.4S
        sub             v22.4S, v22.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S

3:      cmp             x5, #0
        b.eq            4f

        smlal\i         v17.4S, v31\l, z7
        smlsl\i         v18.4S, v31\l, z5
        smlal\i         v5.4S,  v31\l, z3
        smlsl\i         v6.4S,  v31\l, z1

4:      addhn           v7.4H,  v19.4S, v17.4S
        addhn2          v7.8H,  v20.4S, v18.4S
        subhn           v18.4H, v20.4S, v18.4S
        subhn2          v18.8H, v19.4S, v17.4S

        addhn           v16.4H, v21.4S, v5.4S
        addhn2          v16.8H, v22.4S, v6.4S
        subhn           v17.4H, v22.4S, v6.4S
        subhn2          v17.8H, v21.4S, v5.4S

        ret
endfunc
.endm

declare_idct_col4_neon 1, .4H
declare_idct_col4_neon 2, .8H

function ff_simple_idct_put_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

        sqshrun         v1.8B,  v7.8H,  #COL_SHIFT-16
        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sqshrun         v2.8B,  v7.8H,  #COL_SHIFT-16
        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
|
||||
sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
|
||||
|
||||
zip1 v16.4S, v1.4S, v2.4S
|
||||
zip2 v17.4S, v1.4S, v2.4S
|
||||
|
||||
st1 {v16.D}[0], [x0], x1
|
||||
st1 {v16.D}[1], [x0], x1
|
||||
|
||||
zip1 v18.4S, v3.4S, v4.4S
|
||||
zip2 v19.4S, v3.4S, v4.4S
|
||||
|
||||
st1 {v17.D}[0], [x0], x1
|
||||
st1 {v17.D}[1], [x0], x1
|
||||
st1 {v18.D}[0], [x0], x1
|
||||
st1 {v18.D}[1], [x0], x1
|
||||
st1 {v19.D}[0], [x0], x1
|
||||
st1 {v19.D}[1], [x0], x1
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
|
||||
function ff_simple_idct_add_neon, export=1
|
||||
idct_start x2
|
||||
|
||||
idct_row4_neon v24, v25, v26, v27, 1
|
||||
idct_row4_neon v28, v29, v30, v31, 2
|
||||
bl idct_col4_neon1
|
||||
|
||||
sshr v1.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v2.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v3.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v4.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
bl idct_col4_neon2
|
||||
|
||||
sshr v7.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v16.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v17.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v18.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
mov x9, x0
|
||||
ld1 {v19.D}[0], [x0], x1
|
||||
zip1 v23.2D, v1.2D, v7.2D
|
||||
zip2 v24.2D, v1.2D, v7.2D
|
||||
ld1 {v19.D}[1], [x0], x1
|
||||
zip1 v25.2D, v2.2D, v16.2D
|
||||
zip2 v26.2D, v2.2D, v16.2D
|
||||
ld1 {v20.D}[0], [x0], x1
|
||||
zip1 v27.2D, v3.2D, v17.2D
|
||||
zip2 v28.2D, v3.2D, v17.2D
|
||||
ld1 {v20.D}[1], [x0], x1
|
||||
zip1 v29.2D, v4.2D, v18.2D
|
||||
zip2 v30.2D, v4.2D, v18.2D
|
||||
ld1 {v21.D}[0], [x0], x1
|
||||
uaddw v23.8H, v23.8H, v19.8B
|
||||
uaddw2 v24.8H, v24.8H, v19.16B
|
||||
ld1 {v21.D}[1], [x0], x1
|
||||
sqxtun v23.8B, v23.8H
|
||||
sqxtun2 v23.16B, v24.8H
|
||||
ld1 {v22.D}[0], [x0], x1
|
||||
uaddw v24.8H, v25.8H, v20.8B
|
||||
uaddw2 v25.8H, v26.8H, v20.16B
|
||||
ld1 {v22.D}[1], [x0], x1
|
||||
sqxtun v24.8B, v24.8H
|
||||
sqxtun2 v24.16B, v25.8H
|
||||
st1 {v23.D}[0], [x9], x1
|
||||
uaddw v25.8H, v27.8H, v21.8B
|
||||
uaddw2 v26.8H, v28.8H, v21.16B
|
||||
st1 {v23.D}[1], [x9], x1
|
||||
sqxtun v25.8B, v25.8H
|
||||
sqxtun2 v25.16B, v26.8H
|
||||
st1 {v24.D}[0], [x9], x1
|
||||
uaddw v26.8H, v29.8H, v22.8B
|
||||
uaddw2 v27.8H, v30.8H, v22.16B
|
||||
st1 {v24.D}[1], [x9], x1
|
||||
sqxtun v26.8B, v26.8H
|
||||
sqxtun2 v26.16B, v27.8H
|
||||
st1 {v25.D}[0], [x9], x1
|
||||
st1 {v25.D}[1], [x9], x1
|
||||
st1 {v26.D}[0], [x9], x1
|
||||
st1 {v26.D}[1], [x9], x1
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
|
||||
function ff_simple_idct_neon, export=1
|
||||
idct_start x0
|
||||
|
||||
mov x2, x0
|
||||
idct_row4_neon v24, v25, v26, v27, 1
|
||||
idct_row4_neon v28, v29, v30, v31, 2
|
||||
sub x2, x2, #128
|
||||
bl idct_col4_neon1
|
||||
|
||||
sshr v1.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v2.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v3.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v4.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
bl idct_col4_neon2
|
||||
|
||||
sshr v7.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v16.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v17.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v18.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
zip1 v23.2D, v1.2D, v7.2D
|
||||
zip2 v24.2D, v1.2D, v7.2D
|
||||
st1 {v23.2D,v24.2D}, [x2], #32
|
||||
zip1 v25.2D, v2.2D, v16.2D
|
||||
zip2 v26.2D, v2.2D, v16.2D
|
||||
st1 {v25.2D,v26.2D}, [x2], #32
|
||||
zip1 v27.2D, v3.2D, v17.2D
|
||||
zip2 v28.2D, v3.2D, v17.2D
|
||||
st1 {v27.2D,v28.2D}, [x2], #32
|
||||
zip1 v29.2D, v4.2D, v18.2D
|
||||
zip2 v30.2D, v4.2D, v18.2D
|
||||
st1 {v29.2D,v30.2D}, [x2], #32
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
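For reference when reading the three entry points above: the coefficient block is addressed through x2 (or x0 for the in-place variant) and pixels are stored through x0 with row stride x1. The matching C prototypes live in the idctdsp init file, which is not part of this excerpt, so take the exact signatures below as an assumption based on that register usage:

    /* assumed prototypes; not shown in this diff */
    void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
    void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
    void ff_simple_idct_neon(int16_t *data);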
47
externals/ffmpeg/libavcodec/aarch64/synth_filter_init.c
vendored
Executable file
@@ -0,0 +1,47 @@
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/fft.h"
#include "libavcodec/synth_filter.h"

#include "asm-offsets.h"

#if HAVE_NEON || HAVE_VFP
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
#endif

void ff_synth_filter_float_neon(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
                                float synth_buf2[32], const float window[512],
                                float out[32], const float in[32],
                                float scale);

av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags))
        s->synth_filter_float = ff_synth_filter_float_neon;
}
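The AV_CHECK_OFFSET line above is a compile-time guard: the NEON code reads the imdct_half function pointer straight out of the FFTContext using the hard-coded IMDCT_HALF offset from asm-offsets.h, so the build must break if the struct layout ever drifts. A self-contained C11 sketch of the same idea (the toy struct and offset value here are hypothetical):

    #include <stddef.h>

    struct toy_ctx {
        void (*imdct_half)(void); /* field read from assembly by offset */
    };

    #define TOY_IMDCT_HALF 0 /* hypothetical hand-maintained asm offset */

    _Static_assert(offsetof(struct toy_ctx, imdct_half) == TOY_IMDCT_HALF,
                   "asm offset out of sync with struct layout");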
119
externals/ffmpeg/libavcodec/aarch64/synth_filter_neon.S
vendored
Executable file
@@ -0,0 +1,119 @@
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm-offsets.h"

#include "libavutil/aarch64/asm.S"

.macro inner_loop
        ld1 {v29.4s}, [x9], x15
        ld1 {v28.4s}, [x8], x15
        ld1 {v30.4s}, [x10], x15
        ld1 {v31.4s}, [x11], x15
        rev64 v28.4s, v28.4s
        ld1 {v24.4s}, [x4], x15
        ld1 {v25.4s}, [x5], x15
        rev64 v31.4s, v31.4s
        ld1 {v26.4s}, [x6], x15
        fmla v5.4s, v25.4s, v29.4s
        ld1 {v27.4s}, [x7], x15
        ext v28.16b, v28.16b, v28.16b, #8
        ext v31.16b, v31.16b, v31.16b, #8
        fmla v6.4s, v26.4s, v30.4s
        fmls v4.4s, v24.4s, v28.4s
        fmla v7.4s, v27.4s, v31.4s
.endm

function ff_synth_filter_float_neon, export=1
        ldr w7, [x2]                // *synth_buf_offset
        ldr x9, [x0, #IMDCT_HALF]   // imdct_half function pointer
        sxtw x7, w7
        stp x3, x4, [sp, #-64]!
        add x1, x1, x7, lsl #2      // synth_buf
        sub w8, w7, #32
        stp x5, x1, [sp, #16]
        and x7, x7, #~63
        and w8, w8, #511
        stp x7, x30, [sp, #32]
        str w8, [x2]
        str s0, [sp, #48]

        mov x2, x6                  // in

        blr x9

        ldp x2, x4, [sp]            // synth_buf2, window
        ldp x13, x9, [sp, #16]      // out, synth_buf
        ldp x0, x30, [sp, #32]      // *synth_buf_offset
        ldr s0, [sp, #48]

        add x3, x2, #16*4           // synth_buf2 + 16
        add x14, x13, #16*4         // out + 16
        add x8, x9, #12*4
        mov x15, #64*4
        mov x1, #4
1:
        add x10, x9, #16*4          // synth_buf
        add x11, x8, #16*4
        add x5, x4, #16*4           // window
        add x6, x4, #32*4
        add x7, x4, #48*4

        ld1 {v4.4s}, [x2]           // a
        ld1 {v5.4s}, [x3]           // b
        movi v6.4s, #0              // c
        movi v7.4s, #0              // d

        mov x12, #512
2:
        sub x12, x12, #64
        cmp x12, x0
        inner_loop
        b.gt 2b

        sub x8, x8, #512*4
        sub x9, x9, #512*4
        cbz x12, 4f
        sub x10, x10, #512*4
        sub x11, x11, #512*4
3:
        subs x12, x12, #64
        inner_loop
        b.gt 3b
4:
        subs x1, x1, #1
        fmul v4.4s, v4.4s, v0.s[0]
        fmul v5.4s, v5.4s, v0.s[0]
        st1 {v6.4s}, [x2], #16
        st1 {v7.4s}, [x3], #16
        st1 {v4.4s}, [x13], #16
        st1 {v5.4s}, [x14], #16
        b.le 10f

        sub x4, x4, #508*4          // window
        add x9, x9, #4*4            // synth_buf
        sub x8, x8, #4*4            // synth_buf
        b 1b

10:
        add sp, sp, #64
        ret
endfunc
47
externals/ffmpeg/libavcodec/aarch64/vc1dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,47 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vc1dsp.h"

#include "config.h"

void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);

av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
        dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
    }
}
28
externals/ffmpeg/libavcodec/aarch64/videodsp.S
vendored
Executable file
@@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_prefetch_aarch64, export=1
        subs w2, w2, #2
        prfm pldl1strm, [x0]
        prfm pldl1strm, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt X(ff_prefetch_aarch64)
        ret
endfunc
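ff_prefetch_aarch64 streams two rows per iteration, advancing the base pointer by two strides and tail-branching back to its own entry point until the row counter is exhausted. A rough C equivalent of that loop (a sketch only; the real function is the assembly above):

    #include <stddef.h>
    #include <stdint.h>

    static void prefetch_sketch(const uint8_t *mem, ptrdiff_t stride, int h)
    {
        do {
            /* locality 0 approximates pldl1strm (streaming prefetch) */
            __builtin_prefetch(mem, 0, 0);
            __builtin_prefetch(mem + stride, 0, 0);
            mem += 2 * stride;
            h   -= 2;
        } while (h > 0);
    }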
32
externals/ffmpeg/libavcodec/aarch64/videodsp_init.c
vendored
Executable file
@@ -0,0 +1,32 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/videodsp.h"

void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);

av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_armv8(cpu_flags))
        ctx->prefetch = ff_prefetch_aarch64;
}
34
externals/ffmpeg/libavcodec/aarch64/vorbisdsp_init.c
vendored
Executable file
@@ -0,0 +1,34 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vorbisdsp.h"

void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
                                     intptr_t blocksize);

av_cold void ff_vorbisdsp_init_aarch64(VorbisDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
    }
}
82
externals/ffmpeg/libavcodec/aarch64/vorbisdsp_neon.S
vendored
Executable file
@@ -0,0 +1,82 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_vorbis_inverse_coupling_neon, export=1
        movi v20.4s, #1<<7, lsl #24
        subs x2, x2, #4
        mov x3, x0
        mov x4, x1
        b.eq 3f

        ld1 {v7.4s}, [x1], #16
        ld1 {v6.4s}, [x0], #16
        cmle v4.4s, v7.4s, #0
        and v5.16b, v6.16b, v20.16b
        eor v7.16b, v7.16b, v5.16b
        and v2.16b, v7.16b, v4.16b
        bic v3.16b, v7.16b, v4.16b
        fadd v7.4s, v6.4s, v2.4s
        fsub v6.4s, v6.4s, v3.4s
1:      ld1 {v1.4s}, [x1], #16
        ld1 {v0.4s}, [x0], #16
        cmle v4.4s, v1.4s, #0
        and v5.16b, v0.16b, v20.16b
        eor v1.16b, v1.16b, v5.16b
        st1 {v7.4s}, [x3], #16
        st1 {v6.4s}, [x4], #16
        and v2.16b, v1.16b, v4.16b
        bic v3.16b, v1.16b, v4.16b
        fadd v1.4s, v0.4s, v2.4s
        fsub v0.4s, v0.4s, v3.4s
        subs x2, x2, #8
        b.le 2f
        ld1 {v7.4s}, [x1], #16
        ld1 {v6.4s}, [x0], #16
        cmle v4.4s, v7.4s, #0
        and v5.16b, v6.16b, v20.16b
        eor v7.16b, v7.16b, v5.16b
        st1 {v1.4s}, [x3], #16
        st1 {v0.4s}, [x4], #16
        and v2.16b, v7.16b, v4.16b
        bic v3.16b, v7.16b, v4.16b
        fadd v7.4s, v6.4s, v2.4s
        fsub v6.4s, v6.4s, v3.4s
        b 1b

2:      st1 {v1.4s}, [x3], #16
        st1 {v0.4s}, [x4], #16
        b.lt ret

3:      ld1 {v1.4s}, [x1]
        ld1 {v0.4s}, [x0]
        cmle v4.4s, v1.4s, #0
        and v5.16b, v0.16b, v20.16b
        eor v1.16b, v1.16b, v5.16b
        and v2.16b, v1.16b, v4.16b
        bic v3.16b, v1.16b, v4.16b
        fadd v1.4s, v0.4s, v2.4s
        fsub v0.4s, v0.4s, v3.4s
        st1 {v1.4s}, [x0], #16
        st1 {v0.4s}, [x1], #16
ret:
        ret
endfunc
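The kernel above performs Vorbis square-polar channel decoupling four floats at a time: v20 holds the sign-bit mask 0x80000000, the eor flips the angle's sign by the magnitude's sign, and the cmle/and/bic pair selects which of the two results each lane feeds. A scalar C sketch of the same semantics (reconstructed as a reading aid, not code from this diff):

    #include <stdint.h>

    static void inverse_coupling_sketch(float *mag, float *ang, intptr_t blocksize)
    {
        for (intptr_t i = 0; i < blocksize; i++) {
            float m = mag[i], a = ang[i];
            if (m > 0) {
                if (a > 0) { ang[i] = m - a; }
                else       { ang[i] = m; mag[i] = m + a; }
            } else {
                if (a > 0) { ang[i] = m + a; }
                else       { ang[i] = m; mag[i] = m - a; }
            }
        }
    }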
75
externals/ffmpeg/libavcodec/aarch64/vp8dsp.h
vendored
Executable file
@@ -0,0 +1,75 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_VP8DSP_H
#define AVCODEC_AARCH64_VP8DSP_H

#include "libavcodec/vp8dsp.h"

#define VP8_LF_Y(hv, inner, opt)                                            \
    void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst,           \
                                                    ptrdiff_t stride,       \
                                                    int flim_E, int flim_I, \
                                                    int hev_thresh)

#define VP8_LF_UV(hv, inner, opt)                                            \
    void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU,          \
                                                     uint8_t *dstV,          \
                                                     ptrdiff_t stride,       \
                                                     int flim_E, int flim_I, \
                                                     int hev_thresh)

#define VP8_LF_SIMPLE(hv, opt)                                          \
    void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst,         \
                                                  ptrdiff_t stride,     \
                                                  int flim)

#define VP8_LF_HV(inner, opt)     \
    VP8_LF_Y(h,  inner, opt);     \
    VP8_LF_Y(v,  inner, opt);     \
    VP8_LF_UV(h, inner, opt);     \
    VP8_LF_UV(v, inner, opt)

#define VP8_LF(opt)               \
    VP8_LF_HV(,       opt);       \
    VP8_LF_HV(_inner, opt);       \
    VP8_LF_SIMPLE(h, opt);        \
    VP8_LF_SIMPLE(v, opt)

#define VP8_MC(n, opt)                                                  \
    void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride,      \
                                uint8_t *src, ptrdiff_t srcstride,      \
                                int h, int x, int y)

#define VP8_EPEL(w, opt)            \
    VP8_MC(pixels ## w, opt);       \
    VP8_MC(epel ## w ## _h4, opt);  \
    VP8_MC(epel ## w ## _h6, opt);  \
    VP8_MC(epel ## w ## _v4, opt);  \
    VP8_MC(epel ## w ## _h4v4, opt);\
    VP8_MC(epel ## w ## _h6v4, opt);\
    VP8_MC(epel ## w ## _v6, opt);  \
    VP8_MC(epel ## w ## _h4v6, opt);\
    VP8_MC(epel ## w ## _h6v6, opt)

#define VP8_BILIN(w, opt)           \
    VP8_MC(bilin ## w ## _h, opt);  \
    VP8_MC(bilin ## w ## _v, opt);  \
    VP8_MC(bilin ## w ## _hv, opt)

#endif /* AVCODEC_AARCH64_VP8DSP_H */
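As a concrete reading aid (easily checked by expanding the macros by hand): VP8_EPEL(8, neon) emits nine VP8_MC declarations, for example

    void ff_put_vp8_pixels8_neon(uint8_t *dst, ptrdiff_t dststride,
                                 uint8_t *src, ptrdiff_t srcstride,
                                 int h, int x, int y);
    void ff_put_vp8_epel8_h4_neon(uint8_t *dst, ptrdiff_t dststride,
                                  uint8_t *src, ptrdiff_t srcstride,
                                  int h, int x, int y);
    /* ..._h6, _v4, _h4v4, _h6v4, _v6, _h4v6 and _h6v6 follow the same pattern */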
124
externals/ffmpeg/libavcodec/aarch64/vp8dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,124 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vp8dsp.h"
#include "vp8dsp.h"

void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);

void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);

VP8_LF(neon);

VP8_EPEL(16, neon);
VP8_EPEL(8,  neon);
VP8_EPEL(4,  neon);

VP8_BILIN(16, neon);
VP8_BILIN(8,  neon);
VP8_BILIN(4,  neon);

av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp)
{
    if (!have_neon(av_get_cpu_flags()))
        return;
    dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
    dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
    dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
    dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;

    dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
    dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
    dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
    dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
    dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
    dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
    dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
    dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
    dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;

    dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
    dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
    dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
    dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
    dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
    dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
    dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
    dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;

    dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;

    dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;

    dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
}

av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
{
    if (!have_neon(av_get_cpu_flags()))
        return;
    dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;

    dsp->vp8_idct_add       = ff_vp8_idct_add_neon;
    dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_neon;
    dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_neon;
    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;

    dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
    dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
    dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
    dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;

    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;

    dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
    dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
}
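The triple indexing in these tables can be read off the assignments themselves: the first index selects the block width (0 = 16, 1 = 8, 2 = 4), the second the vertical filter (0 = none, 1 = 4-tap, 2 = 6-tap) and the third the horizontal filter with the same coding. A hypothetical call-site sketch of the lookup (vp8_mc_func is the table's function-pointer type from libavcodec/vp8dsp.h; this is not decoder code from this diff):

    vp8_mc_func mc = dsp->put_vp8_epel_pixels_tab[size_idx][ver_taps][hor_taps];
    mc(dst, dststride, src, srcstride, h, mx, my);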
1790
externals/ffmpeg/libavcodec/aarch64/vp8dsp_neon.S
vendored
Executable file
File diff suppressed because it is too large
29
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init.h
vendored
Executable file
@@ -0,0 +1,29 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
#define AVCODEC_AARCH64_VP9DSP_INIT_H

#include "libavcodec/vp9dsp.h"

void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);

#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
23
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
vendored
Executable file
@@ -0,0 +1,23 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPP 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"
23
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
vendored
Executable file
@@ -0,0 +1,23 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPP 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"
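These two stubs compile vp9dsp_init_16bpp_aarch64_template.c once per bit depth: every ##bpp## token in the template picks up the BPP value, so each depth gets its own symbol set and init function. For example, with BPP defined as 10, the template's init_itxfm(TX_16X16, 16x16, BPP) resolves to assignments of the form

    dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_10_neon;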
273
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
vendored
Executable file
@@ -0,0 +1,273 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/aarch64/cpu.h"
#include "vp9dsp_init.h"

#define declare_fpel(type, sz, suffix) \
void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                      const uint8_t *src, ptrdiff_t src_stride, \
                                      int h, int mx, int my)

#define decl_mc_func(op, filter, dir, sz, bpp) \
void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                                       const uint8_t *src, ptrdiff_t src_stride, \
                                                       int h, int mx, int my)

#define define_8tap_2d_fn(op, filter, sz, bpp) \
static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                                const uint8_t *src, \
                                                ptrdiff_t src_stride, \
                                                int h, int mx, int my) \
{ \
    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
    /* We only need h + 7 lines, but the horizontal filter assumes an \
     * even number of rows, so filter h + 8 lines here. */ \
    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
                                             src - 3 * src_stride, src_stride, \
                                             h + 8, mx, 0); \
    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
                                                temp + 3 * 2 * sz, 2 * sz, \
                                                h, 0, my); \
}

#define decl_filter_funcs(op, dir, sz, bpp) \
    decl_mc_func(op, regular, dir, sz, bpp); \
    decl_mc_func(op, sharp,   dir, sz, bpp); \
    decl_mc_func(op, smooth,  dir, sz, bpp)

#define decl_mc_funcs(sz, bpp) \
    decl_filter_funcs(put, h,  sz, bpp); \
    decl_filter_funcs(avg, h,  sz, bpp); \
    decl_filter_funcs(put, v,  sz, bpp); \
    decl_filter_funcs(avg, v,  sz, bpp); \
    decl_filter_funcs(put, hv, sz, bpp); \
    decl_filter_funcs(avg, hv, sz, bpp)

#define ff_vp9_copy32_neon  ff_vp9_copy32_aarch64
#define ff_vp9_copy64_neon  ff_vp9_copy64_aarch64
#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64

declare_fpel(copy, 128, );
declare_fpel(copy, 64,  );
declare_fpel(copy, 32,  );
declare_fpel(copy, 16,  );
declare_fpel(copy, 8,   );
declare_fpel(avg,  64, _16);
declare_fpel(avg,  32, _16);
declare_fpel(avg,  16, _16);
declare_fpel(avg,  8,  _16);
declare_fpel(avg,  4,  _16);

decl_mc_funcs(64, BPP);
decl_mc_funcs(32, BPP);
decl_mc_funcs(16, BPP);
decl_mc_funcs(8,  BPP);
decl_mc_funcs(4,  BPP);

#define define_8tap_2d_funcs(sz, bpp)        \
    define_8tap_2d_fn(put, regular, sz, bpp) \
    define_8tap_2d_fn(put, sharp,   sz, bpp) \
    define_8tap_2d_fn(put, smooth,  sz, bpp) \
    define_8tap_2d_fn(avg, regular, sz, bpp) \
    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
    define_8tap_2d_fn(avg, smooth,  sz, bpp)

define_8tap_2d_funcs(64, BPP)
define_8tap_2d_funcs(32, BPP)
define_8tap_2d_funcs(16, BPP)
define_8tap_2d_funcs(8,  BPP)
define_8tap_2d_funcs(4,  BPP)

static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

#define init_fpel(idx1, idx2, sz, type, suffix) \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix

#define init_copy(idx, sz, suffix) \
    init_fpel(idx, 0, sz, copy, suffix)

#define init_avg(idx, sz, suffix) \
    init_fpel(idx, 1, sz, avg, suffix)

#define init_copy_avg(idx, sz1, sz2) \
    init_copy(idx, sz2, _neon);      \
    init_avg (idx, sz1, _16_neon)

    if (have_armv8(cpu_flags)) {
        init_copy(0, 128, _aarch64);
        init_copy(1, 64,  _aarch64);
        init_copy(2, 32,  _aarch64);
    }

    if (have_neon(cpu_flags)) {
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon

#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp)

#define init_mc_funcs_dirs(idx, sz, bpp)            \
    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp)

        init_avg(0, 64, _16_neon);
        init_avg(1, 32, _16_neon);
        init_avg(2, 16, _16_neon);
        init_copy_avg(3, 8, 16);
        init_copy_avg(4, 4, 8);

        init_mc_funcs_dirs(0, 64, BPP);
        init_mc_funcs_dirs(1, 32, BPP);
        init_mc_funcs_dirs(2, 16, BPP);
        init_mc_funcs_dirs(3, 8,  BPP);
        init_mc_funcs_dirs(4, 4,  BPP);
    }
}

#define define_itxfm2(type_a, type_b, sz, bpp) \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
                                                                 ptrdiff_t stride, \
                                                                 int16_t *_block, int eob)
#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)

#define define_itxfm_funcs(sz, bpp)      \
    define_itxfm(idct,  idct,  sz, bpp); \
    define_itxfm(iadst, idct,  sz, bpp); \
    define_itxfm(idct,  iadst, sz, bpp); \
    define_itxfm(iadst, iadst, sz, bpp)

define_itxfm_funcs(4,  BPP);
define_itxfm_funcs(8,  BPP);
define_itxfm_funcs(16, BPP);
define_itxfm(idct, idct, 32, BPP);
define_itxfm(iwht, iwht, 4,  BPP);


static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_itxfm2(tx, sz, bpp) \
    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;   \
    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon;  \
    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon;  \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)

#define init_idct2(tx, nm, bpp)           \
    dsp->itxfm_add[tx][DCT_DCT]   =       \
    dsp->itxfm_add[tx][ADST_DCT]  =       \
    dsp->itxfm_add[tx][DCT_ADST]  =       \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)

        init_itxfm(TX_4X4,   4x4,   BPP);
        init_itxfm(TX_8X8,   8x8,   BPP);
        init_itxfm(TX_16X16, 16x16, BPP);
        init_idct(TX_32X32, idct_idct_32x32, BPP);
        init_idct(4, iwht_iwht_4x4, BPP);
    }
}

#define define_loop_filter(dir, wd, size, bpp) \
void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

#define define_loop_filters(wd, size, bpp) \
    define_loop_filter(h, wd, size, bpp);  \
    define_loop_filter(v, wd, size, bpp)

define_loop_filters(4,  8, BPP);
define_loop_filters(8,  8, BPP);
define_loop_filters(16, 8, BPP);

define_loop_filters(16, 16, BPP);

define_loop_filters(44, 16, BPP);
define_loop_filters(48, 16, BPP);
define_loop_filters(84, 16, BPP);
define_loop_filters(88, 16, BPP);

static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon

#define init_lpf_func_16(idx, dir, bpp) \
    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon

#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon

#define init_lpf_funcs_8_wd(idx, wd, bpp) \
    init_lpf_func_8(idx, 0, h, wd, bpp);  \
    init_lpf_func_8(idx, 1, v, wd, bpp)

#define init_lpf_funcs_16(bpp)    \
    init_lpf_func_16(0, h, bpp);  \
    init_lpf_func_16(1, v, bpp)

#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp)  \
    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);   \
    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)

#define init_lpf_funcs_8(bpp)         \
    init_lpf_funcs_8_wd(0, 4,  bpp);  \
    init_lpf_funcs_8_wd(1, 8,  bpp);  \
    init_lpf_funcs_8_wd(2, 16, bpp)

#define init_lpf_funcs_mix2(bpp)           \
    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
    init_lpf_funcs_mix2_wd(1, 1, 88, bpp)

        init_lpf_funcs_8(BPP);
        init_lpf_funcs_16(BPP);
        init_lpf_funcs_mix2(BPP);
    }
}

av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
    vp9dsp_mc_init_aarch64(dsp);
    vp9dsp_loopfilter_init_aarch64(dsp);
    vp9dsp_itxfm_init_aarch64(dsp);
}
258
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,258 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Google Inc.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/internal.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
#include "libavcodec/vp9dsp.h"
|
||||
#include "vp9dsp_init.h"
|
||||
|
||||
#define declare_fpel(type, sz) \
|
||||
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const uint8_t *src, ptrdiff_t src_stride, \
|
||||
int h, int mx, int my)
|
||||
|
||||
#define declare_copy_avg(sz) \
|
||||
declare_fpel(copy, sz); \
|
||||
declare_fpel(avg , sz)
|
||||
|
||||
#define decl_mc_func(op, filter, dir, sz) \
|
||||
void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const uint8_t *src, ptrdiff_t src_stride, \
|
||||
int h, int mx, int my)
|
||||
|
||||
#define define_8tap_2d_fn(op, filter, sz) \
|
||||
static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const uint8_t *src, ptrdiff_t src_stride, \
|
||||
int h, int mx, int my) \
|
||||
{ \
|
||||
LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \
|
||||
/* We only need h + 7 lines, but the horizontal filter assumes an \
|
||||
* even number of rows, so filter h + 8 lines here. */ \
|
||||
ff_vp9_put_##filter##sz##_h_neon(temp, sz, \
|
||||
src - 3 * src_stride, src_stride, \
|
||||
h + 8, mx, 0); \
|
||||
ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \
|
||||
temp + 3 * sz, sz, \
|
||||
h, 0, my); \
|
||||
}
|
||||
|
||||
#define decl_filter_funcs(op, dir, sz) \
|
||||
decl_mc_func(op, regular, dir, sz); \
|
||||
decl_mc_func(op, sharp, dir, sz); \
|
||||
decl_mc_func(op, smooth, dir, sz)
|
||||
|
||||
#define decl_mc_funcs(sz) \
|
||||
decl_filter_funcs(put, h, sz); \
|
||||
decl_filter_funcs(avg, h, sz); \
|
||||
decl_filter_funcs(put, v, sz); \
|
||||
decl_filter_funcs(avg, v, sz); \
|
||||
decl_filter_funcs(put, hv, sz); \
|
||||
decl_filter_funcs(avg, hv, sz)
|
||||
|
||||
#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
|
||||
#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
|
||||
|
||||
declare_copy_avg(64);
|
||||
declare_copy_avg(32);
|
||||
declare_copy_avg(16);
|
||||
declare_copy_avg(8);
|
||||
declare_copy_avg(4);
|
||||
|
||||
decl_mc_funcs(64);
|
||||
decl_mc_funcs(32);
|
||||
decl_mc_funcs(16);
|
||||
decl_mc_funcs(8);
|
||||
decl_mc_funcs(4);
|
||||
|
||||
#define define_8tap_2d_funcs(sz) \
|
||||
define_8tap_2d_fn(put, regular, sz) \
|
||||
define_8tap_2d_fn(put, sharp, sz) \
|
||||
define_8tap_2d_fn(put, smooth, sz) \
|
||||
define_8tap_2d_fn(avg, regular, sz) \
|
||||
define_8tap_2d_fn(avg, sharp, sz) \
|
||||
define_8tap_2d_fn(avg, smooth, sz)
|
||||
|
||||
define_8tap_2d_funcs(64)
|
||||
define_8tap_2d_funcs(32)
|
||||
define_8tap_2d_funcs(16)
|
||||
define_8tap_2d_funcs(8)
|
||||
define_8tap_2d_funcs(4)
|
||||
|
||||
static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
#define init_fpel(idx1, idx2, sz, type, suffix) \
|
||||
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
|
||||
|
||||
#define init_copy(idx, sz, suffix) \
|
||||
init_fpel(idx, 0, sz, copy, suffix)
|
||||
|
||||
#define init_avg(idx, sz, suffix) \
|
||||
init_fpel(idx, 1, sz, avg, suffix)
|
||||
|
||||
#define init_copy_avg(idx, sz) \
|
||||
init_copy(idx, sz, _neon); \
|
||||
init_avg (idx, sz, _neon)
|
||||
|
||||
if (have_armv8(cpu_flags)) {
|
||||
init_copy(0, 64, _aarch64);
|
||||
init_copy(1, 32, _aarch64);
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
|
||||
dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
|
||||
|
||||
#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
|
||||
init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx)
|
||||
|
||||
#define init_mc_funcs_dirs(idx, sz) \
|
||||
init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \
|
||||
init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \
|
||||
init_mc_funcs(idx, hv, 1, 1, sz,)
|
||||
|
||||
init_avg(0, 64, _neon);
|
||||
init_avg(1, 32, _neon);
|
||||
init_copy_avg(2, 16);
|
||||
init_copy_avg(3, 8);
|
||||
init_copy_avg(4, 4);
|
||||
|
||||
init_mc_funcs_dirs(0, 64);
|
||||
init_mc_funcs_dirs(1, 32);
|
||||
init_mc_funcs_dirs(2, 16);
|
||||
init_mc_funcs_dirs(3, 8);
|
||||
init_mc_funcs_dirs(4, 4);
|
||||
}
|
||||
}
|
||||
|
||||
#define define_itxfm(type_a, type_b, sz) \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
                                                         ptrdiff_t stride, \
                                                         int16_t *_block, int eob)

#define define_itxfm_funcs(sz) \
    define_itxfm(idct,  idct,  sz); \
    define_itxfm(iadst, idct,  sz); \
    define_itxfm(idct,  iadst, sz); \
    define_itxfm(iadst, iadst, sz)

define_itxfm_funcs(4);
define_itxfm_funcs(8);
define_itxfm_funcs(16);
define_itxfm(idct, idct, 32);
define_itxfm(iwht, iwht, 4);


static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_itxfm(tx, sz) \
        dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;   \
        dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon;  \
        dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon;  \
        dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon

#define init_idct(tx, nm) \
        dsp->itxfm_add[tx][DCT_DCT]   = \
        dsp->itxfm_add[tx][ADST_DCT]  = \
        dsp->itxfm_add[tx][DCT_ADST]  = \
        dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon

        init_itxfm(TX_4X4,   4x4);
        init_itxfm(TX_8X8,   8x8);
        init_itxfm(TX_16X16, 16x16);
        init_idct(TX_32X32,  idct_idct_32x32);
        init_idct(4,         iwht_iwht_4x4);
    }
}

#define define_loop_filter(dir, wd, len) \
void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

#define define_loop_filters(wd, len) \
    define_loop_filter(h, wd, len);  \
    define_loop_filter(v, wd, len)

define_loop_filters(4, 8);
define_loop_filters(8, 8);
define_loop_filters(16, 8);

define_loop_filters(16, 16);

define_loop_filters(44, 16);
define_loop_filters(48, 16);
define_loop_filters(84, 16);
define_loop_filters(88, 16);

static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
        dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
        dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;

        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;

        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
    }
}

av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
{
    if (bpp == 10) {
        ff_vp9dsp_init_10bpp_aarch64(dsp);
        return;
    } else if (bpp == 12) {
        ff_vp9dsp_init_12bpp_aarch64(dsp);
        return;
    } else if (bpp != 8)
        return;

    vp9dsp_mc_init_aarch64(dsp);
    vp9dsp_loopfilter_init_aarch64(dsp);
    vp9dsp_itxfm_init_aarch64(dsp);
}
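
For orientation, a minimal sketch of how the function pointers wired up above get exercised; the caller below is illustrative, not FFmpeg's exact decode path:

    // Hypothetical caller: apply the NEON 8x8 DCT_DCT inverse transform and
    // add the residual into dst. This slot was filled by init_itxfm(TX_8X8, 8x8)
    // above when NEON support was detected.
    static void add_residual_sketch(VP9DSPContext *dsp, uint8_t *dst,
                                    ptrdiff_t stride, int16_t *coeffs, int eob)
    {
        dsp->itxfm_add[TX_8X8][DCT_DCT](dst, stride, coeffs, eob);
    }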
2017
externals/ffmpeg/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
vendored
Executable file
File diff suppressed because it is too large
1580
externals/ffmpeg/libavcodec/aarch64/vp9itxfm_neon.S
vendored
Executable file
File diff suppressed because it is too large
873
externals/ffmpeg/libavcodec/aarch64/vp9lpf_16bpp_neon.S
vendored
Executable file
@@ -0,0 +1,873 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        trn1 \t4\().8h, \r0\().8h, \r1\().8h
        trn2 \t5\().8h, \r0\().8h, \r1\().8h
        trn1 \t6\().8h, \r2\().8h, \r3\().8h
        trn2 \t7\().8h, \r2\().8h, \r3\().8h

        trn1 \r0\().4s, \t4\().4s, \t6\().4s
        trn2 \r2\().4s, \t4\().4s, \t6\().4s
        trn1 \r1\().4s, \t5\().4s, \t7\().4s
        trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm

// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup v0.8h, w2 // E
        dup v2.8h, w3 // I
        dup v3.8h, w4 // H

        uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2)
        uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1)
        uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0)
        uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1)
        uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
        uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
        umax v4.8h, v4.8h, v5.8h
        umax v5.8h, v6.8h, v7.8h
        umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0)
        umax v4.8h, v4.8h, v5.8h
        add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2
        uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1)
        umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr v5.8h, v5.8h, #1
        cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I
        add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs v6.8h, v0.8h, v6.8h
        and v4.16b, v4.16b, v6.16b // fm

        // If no pixels need filtering, just exit as soon as possible
        mov x11, v4.d[0]
        mov x12, v4.d[1]
        adds x11, x11, x12
        b.ne 1f
        br x10
1:

.if \wd >= 8
        dup v0.8h, w5

        uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
        uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
        uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0)
        uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
        uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
        uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
        umax v6.8h, v6.8h, v2.8h
        umax v1.8h, v1.8h, \tmp1\().8h
        umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
.if \wd == 16
        uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0)
        umax v6.8h, v6.8h, v1.8h
        uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0)
        umax v6.8h, v6.8h, \tmp2\().8h
        uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0)
        cmhs v6.8h, v0.8h, v6.8h // flat8in
        uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0)
        and v6.16b, v6.16b, v4.16b // flat8in && fm
        uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0)
        bic v4.16b, v4.16b, v6.16b // fm && !flat8in
        uabd v10.8h, v29.8h, v24.8h // abs(q5 - q0)
        uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0)
        uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0)

        umax v7.8h, v7.8h, v2.8h
        umax v1.8h, v1.8h, v8.8h
        umax v9.8h, v9.8h, v10.8h
        umax v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0)
.if \wd == 16
        umax v7.8h, v7.8h, v1.8h
        umax v9.8h, v9.8h, v11.8h
.elseif \wd == 8
        umax v6.8h, v6.8h, v1.8h
.endif
        uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
.if \wd == 16
        umax v7.8h, v7.8h, v9.8h
.elseif \wd == 8
        umax v6.8h, v6.8h, \tmp2\().8h
.endif
        dup \tmp2\().8h, w6 // left shift for saturation
        sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1
        neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
        umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
        sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0
        movi \tmp5\().8h, #3
.if \wd == 8
        cmhs v6.8h, v0.8h, v6.8h // flat8in
.endif
        cmhs v5.8h, v3.8h, v5.8h // !hev
.if \wd == 8
        and v6.16b, v6.16b, v4.16b // flat8in && fm
.endif
        sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
.if \wd == 16
        cmhs v7.8h, v0.8h, v7.8h // flat8out
.elseif \wd == 8
        bic v4.16b, v4.16b, v6.16b // fm && !flat8in
.endif
        and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in
.if \wd == 16
        and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm
.endif
        sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
        bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
        movi v2.8h, #4
        add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi v3.8h, #3
        sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
        movi \tmp5\().8h, #0
        sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup \tmp6\().8h, w7 // max pixel value
.if \wd == 16
        bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out
.endif

        ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1

        add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
        add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
        smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr \tmp3\().8h, \tmp3\().8h, #3 // f1
        sshr \tmp4\().8h, \tmp4\().8h, #3 // f2

        add v0.8h, v23.8h, \tmp4\().8h // p0 + f2
        sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1
        smin v0.8h, v0.8h, \tmp6\().8h
        smin v2.8h, v2.8h, \tmp6\().8h
        srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
        smax v0.8h, v0.8h, \tmp5\().8h // out p0
        smax v2.8h, v2.8h, \tmp5\().8h // out q0
        bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
        bit v24.16b, v2.16b, v4.16b

        add v0.8h, v22.8h, \tmp3\().8h // p1 + f
        sub v2.8h, v25.8h, \tmp3\().8h // q1 - f
.if \wd >= 8
        mov x11, v6.d[0]
.endif
        smin v0.8h, v0.8h, \tmp6\().8h
        smin v2.8h, v2.8h, \tmp6\().8h
.if \wd >= 8
        mov x12, v6.d[1]
.endif
        smax v0.8h, v0.8h, \tmp5\().8h // out p1
        smax v2.8h, v2.8h, \tmp5\().8h // out q1
.if \wd >= 8
        adds x11, x11, x12
.endif
        bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
        bit v25.16b, v2.16b, v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq 6f
.else
        b.ne 1f
        br x13
1:
.endif

        // flat8in
        add \tmp1\().8h, v20.8h, v21.8h
        add \tmp3\().8h, v22.8h, v25.8h
        add \tmp5\().8h, v20.8h, v22.8h
        add \tmp7\().8h, v23.8h, v26.8h
        add v0.8h, \tmp1\().8h, \tmp1\().8h
        add v0.8h, v0.8h, v23.8h
        add v0.8h, v0.8h, v24.8h
        add v0.8h, v0.8h, \tmp5\().8h
        sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr v2.8h, v0.8h, #3 // out p2

        add v0.8h, v0.8h, \tmp3\().8h
        add \tmp1\().8h, v20.8h, v23.8h
        add \tmp3\().8h, v24.8h, v27.8h
        urshr v3.8h, v0.8h, #3 // out p1

        add v0.8h, v0.8h, \tmp7\().8h
        sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add \tmp5\().8h, v21.8h, v24.8h
        add \tmp7\().8h, v25.8h, v27.8h
        urshr v4.8h, v0.8h, #3 // out p0

        add v0.8h, v0.8h, \tmp3\().8h
        sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add \tmp1\().8h, v22.8h, v25.8h
        add \tmp3\().8h, v26.8h, v27.8h
        urshr v5.8h, v0.8h, #3 // out q0

        add v0.8h, v0.8h, \tmp7\().8h
        sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr \tmp5\().8h, v0.8h, #3 // out q1

        add v0.8h, v0.8h, \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8part below, since we only update those pixels
        // which won't be touched below.
        bit v21.16b, v2.16b, v6.16b
        bit v22.16b, v3.16b, v6.16b
        bit v23.16b, v4.16b, v6.16b
        urshr \tmp6\().8h, v0.8h, #3 // out q2
        bit v24.16b, v5.16b, v6.16b
        bit v25.16b, \tmp5\().16b, v6.16b
        bit v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
        orr v2.16b, v6.16b, v7.16b
        mov x11, v2.d[0]
        mov x12, v2.d[1]
        adds x11, x11, x12
        b.ne 1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        br x14
1:

        mov x11, v7.d[0]
        mov x12, v7.d[1]
        adds x11, x11, x12
        b.ne 1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        br x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        shl v0.8h, v16.8h, #3 // 8 * v16
        sub v0.8h, v0.8h, v16.8h // 7 * v16
        add v0.8h, v0.8h, v17.8h
        add v8.8h, v17.8h, v18.8h
        add v10.8h, v19.8h, v20.8h
        add v0.8h, v0.8h, v8.8h
        add v8.8h, v16.8h, v17.8h
        add v12.8h, v21.8h, v22.8h
        add v0.8h, v0.8h, v10.8h
        add v10.8h, v18.8h, v25.8h
        add v14.8h, v23.8h, v24.8h
        sub v10.8h, v10.8h, v8.8h
        add v0.8h, v0.8h, v12.8h
        add v0.8h, v0.8h, v14.8h
        add v12.8h, v16.8h, v18.8h
        add v14.8h, v19.8h, v26.8h
        urshr v2.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        add v8.8h, v16.8h, v19.8h
        add v10.8h, v20.8h, v27.8h
        sub v14.8h, v14.8h, v12.8h
        bif v2.16b, v17.16b, v7.16b
        urshr v3.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v12.8h, v16.8h, v20.8h
        add v14.8h, v21.8h, v28.8h
        sub v10.8h, v10.8h, v8.8h
        bif v3.16b, v18.16b, v7.16b
        urshr v4.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        add v8.8h, v16.8h, v21.8h
        add v10.8h, v22.8h, v29.8h
        sub v14.8h, v14.8h, v12.8h
        bif v4.16b, v19.16b, v7.16b
        urshr v5.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v12.8h, v16.8h, v22.8h
        add v14.8h, v23.8h, v30.8h
        sub v10.8h, v10.8h, v8.8h
        bif v5.16b, v20.16b, v7.16b
        urshr v6.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        add v10.8h, v16.8h, v23.8h
        sub v14.8h, v14.8h, v12.8h
        add v12.8h, v24.8h, v31.8h
        bif v6.16b, v21.16b, v7.16b
        urshr v8.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        sub v10.8h, v12.8h, v10.8h
        add v12.8h, v17.8h, v24.8h
        add v14.8h, v25.8h, v31.8h
        bif v8.16b, v22.16b, v7.16b
        urshr v9.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        sub v14.8h, v14.8h, v12.8h
        add v12.8h, v26.8h, v31.8h
        bif v9.16b, v23.16b, v7.16b
        urshr v10.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v14.8h, v18.8h, v25.8h
        add v18.8h, v19.8h, v26.8h
        sub v12.8h, v12.8h, v14.8h
        add v14.8h, v27.8h, v31.8h
        bif v10.16b, v24.16b, v7.16b
        urshr v11.8h, v0.8h, #4

        add v0.8h, v0.8h, v12.8h
        add v12.8h, v20.8h, v27.8h
        sub v14.8h, v14.8h, v18.8h
        add v18.8h, v28.8h, v31.8h
        bif v11.16b, v25.16b, v7.16b
        sub v18.8h, v18.8h, v12.8h
        urshr v12.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v14.8h, v21.8h, v28.8h
        add v20.8h, v29.8h, v31.8h
        bif v12.16b, v26.16b, v7.16b
        urshr v13.8h, v0.8h, #4

        add v0.8h, v0.8h, v18.8h
        sub v20.8h, v20.8h, v14.8h
        add v18.8h, v22.8h, v29.8h
        add v22.8h, v30.8h, v31.8h
        bif v13.16b, v27.16b, v7.16b
        urshr v14.8h, v0.8h, #4

        add v0.8h, v0.8h, v20.8h
        sub v22.8h, v22.8h, v18.8h
        bif v14.16b, v28.16b, v7.16b
        urshr v15.8h, v0.8h, #4

        add v0.8h, v0.8h, v22.8h
        bif v15.16b, v29.16b, v7.16b
        urshr v17.8h, v0.8h, #4
        bif v17.16b, v30.16b, v7.16b
.endif
.endm
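
The threshold logic the macro vectorizes can be written out in scalar C; this is a sketch following the comments above (p[0]/q[0] being the pixels nearest the edge), not code taken from FFmpeg:

    #include <stdlib.h>  /* abs() */

    static int imax(int a, int b) { return a > b ? a : b; }

    // fm: the per-pixel "needs filtering" mask that the uabd/umax/cmhs
    // sequence above computes for 8 pixels at once.
    static int vp9_fm_sketch(const int p[4], const int q[4], int E, int I)
    {
        int m = imax(imax(abs(p[3] - p[2]), abs(p[2] - p[1])),
                     imax(abs(p[1] - p[0]), abs(q[1] - q[0])));
        m = imax(m, imax(abs(q[2] - q[1]), abs(q[3] - q[2])));
        return m <= I &&
               abs(p[0] - q[0]) * 2 + (abs(p[1] - q[1]) >> 1) <= E;
    }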

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
function vp9_loop_filter_4
        loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro loop_filter_4
        bl vp9_loop_filter_4
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr x13, 6f
        bl vp9_loop_filter_8
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr x14, 7f
        adr x15, 8f
        bl vp9_loop_filter_16
.endm


// The public functions in this file have got the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov x16, x30
        stp d14, d15, [sp, #-0x10]!
        stp d12, d13, [sp, #-0x10]!
        stp d10, d11, [sp, #-0x10]!
        stp d8, d9, [sp, #-0x10]!
.endif
        lsl w2, w2, #\bpp - 8
        lsl w3, w3, #\bpp - 8
        lsl w4, w4, #\bpp - 8
        mov x5, #1 << (\bpp - 8)
        mov x6, #16 - \bpp
        mov x7, #((1 << \bpp) - 1)
.if \push
        bl \func\()_16_neon
        ldp d8, d9, [sp], 0x10
        ldp d10, d11, [sp], 0x10
        ldp d12, d13, [sp], 0x10
        ldp d14, d15, [sp], 0x10
        br x16
.else
        b \func\()_16_neon
.endif
endfunc
.endm

.macro bpp_frontends func, push=0
        bpp_frontend \func, 10, \push
        bpp_frontend \func, 12, \push
.endm
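
In scalar terms, the lsl #(\bpp - 8) sequence above rescales the E/I/H limits, which are specified at 8-bit scale, to the pixel range of the actual bit depth, and x7 receives the max pixel value; a sketch (assuming bpp is 10 or 12, as instantiated here):

    static void scale_limits_sketch(int bpp, int *E, int *I, int *H, int *max_px)
    {
        *E <<= bpp - 8;
        *I <<= bpp - 8;
        *H <<= bpp - 8;
        *max_px = (1 << bpp) - 1;  // what 'mov x7, #((1 << bpp) - 1)' loads
    }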

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov x16, x30
.if \push
        stp d14, d15, [sp, #-0x10]!
        stp d12, d13, [sp, #-0x10]!
        stp d10, d11, [sp, #-0x10]!
        stp d8, d9, [sp, #-0x10]!
.endif
        lsl w2, w2, #\bpp - 8
        lsl w3, w3, #\bpp - 8
        lsl w4, w4, #\bpp - 8
        mov x5, #1 << (\bpp - 8)
        mov x6, #16 - \bpp
        mov x7, #((1 << \bpp) - 1)
        bl \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add x0, x0, x1, lsl #3
.else
        add x0, x0, #16
.endif
        bl \func\()_\int_suffix\()_16_neon
.if \push
        ldp d8, d9, [sp], 0x10
        ldp d10, d11, [sp], 0x10
        ldp d12, d13, [sp], 0x10
        ldp d14, d15, [sp], 0x10
.endif
        br x16
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov x16, x30
        lsr w8, w2, #8
        lsr w14, w3, #8
        lsr w15, w4, #8
        and w2, w2, #0xff
        and w3, w3, #0xff
        and w4, w4, #0xff
        lsl w2, w2, #\bpp - 8
        lsl w3, w3, #\bpp - 8
        lsl w4, w4, #\bpp - 8
        mov x5, #1 << (\bpp - 8)
        mov x6, #16 - \bpp
        mov x7, #((1 << \bpp) - 1)
        bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add x0, x0, x1, lsl #3
.else
        add x0, x0, #16
.endif
        lsl w2, w8, #\bpp - 8
        lsl w3, w14, #\bpp - 8
        lsl w4, w15, #\bpp - 8
        bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        br x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #2
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v27.8h}, [x0], x1 // q3
        sub x0, x0, x1, lsl #2
        sub x9, x9, x1, lsl #1

        loop_filter_4

        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1

        br x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov x10, x30
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        ld1 {v20.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1

        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add x9, x9, #4
        add x0, x9, x1, lsl #2

        // We only will write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4

        br x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #2
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v27.8h}, [x0], x1 // q3
        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #2
        add x9, x9, x1

        loop_filter_8

        st1 {v21.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        sub x0, x0, x1

        br x10
6:
        sub x9, x0, x1, lsl #1
        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        br x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov x10, x30
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        ld1 {v20.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1

        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add x0, x9, x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1 {v20.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v21.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v27.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        br x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add x9, x9, #4
        add x0, x9, x1, lsl #2
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4
        br x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #3
        ld1 {v16.8h}, [x9], x1 // p7
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v17.8h}, [x9], x1 // p6
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v18.8h}, [x9], x1 // p5
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v19.8h}, [x9], x1 // p4
        ld1 {v27.8h}, [x0], x1 // q3
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v28.8h}, [x0], x1 // q4
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v29.8h}, [x0], x1 // q5
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v30.8h}, [x0], x1 // q6
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v31.8h}, [x0], x1 // q7
        sub x9, x9, x1, lsl #3
        sub x0, x0, x1, lsl #3
        add x9, x9, x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1 {v2.8h}, [x9], x1
        st1 {v10.8h}, [x0], x1
        st1 {v3.8h}, [x9], x1
        st1 {v11.8h}, [x0], x1
        st1 {v4.8h}, [x9], x1
        st1 {v12.8h}, [x0], x1
        st1 {v5.8h}, [x9], x1
        st1 {v13.8h}, [x0], x1
        st1 {v6.8h}, [x9], x1
        st1 {v14.8h}, [x0], x1
        st1 {v8.8h}, [x9], x1
        st1 {v15.8h}, [x0], x1
        st1 {v9.8h}, [x9], x1
        st1 {v17.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, x1

        br x10
8:
        add x9, x9, x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1 {v21.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        sub x0, x0, x1
        br x10
7:
        sub x9, x0, x1, lsl #1
        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        br x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov x10, x30
        sub x9, x0, #16
        ld1 {v16.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v17.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v18.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v19.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1
        ld1 {v20.8h}, [x9], x1
        ld1 {v28.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v29.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v30.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v31.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        sub x9, x9, x1, lsl #3

        // The 16x8 pixels read above is in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
        transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1 {v16.8h}, [x9], x1
        st1 {v10.8h}, [x0], x1
        st1 {v2.8h}, [x9], x1
        st1 {v11.8h}, [x0], x1
        st1 {v3.8h}, [x9], x1
        st1 {v12.8h}, [x0], x1
        st1 {v4.8h}, [x9], x1
        st1 {v13.8h}, [x0], x1
        st1 {v5.8h}, [x9], x1
        st1 {v14.8h}, [x0], x1
        st1 {v6.8h}, [x9], x1
        st1 {v15.8h}, [x0], x1
        st1 {v8.8h}, [x9], x1
        st1 {v17.8h}, [x0], x1
        st1 {v9.8h}, [x9], x1
        st1 {v31.8h}, [x0], x1
        sub x0, x0, x1, lsl #3

        br x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1 {v20.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v21.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v27.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #8
        br x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub x9, x0, #4
        add x0, x9, x1, lsl #2
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4
        br x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
1334
externals/ffmpeg/libavcodec/aarch64/vp9lpf_neon.S
vendored
Executable file
File diff suppressed because it is too large
606
externals/ffmpeg/libavcodec/aarch64/vp9mc_16bpp_neon.S
vendored
Executable file
@@ -0,0 +1,606 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

function ff_vp9_avg64_16_neon, export=1
        mov x5, x0
        sub x1, x1, #64
        sub x3, x3, #64
1:
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v4.8h
        urhadd v1.8h, v1.8h, v5.8h
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd v2.8h, v2.8h, v6.8h
        urhadd v3.8h, v3.8h, v7.8h
        subs w4, w4, #1
        urhadd v16.8h, v16.8h, v20.8h
        urhadd v17.8h, v17.8h, v21.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
        urhadd v18.8h, v18.8h, v22.8h
        urhadd v19.8h, v19.8h, v23.8h
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg32_16_neon, export=1
        mov x5, x0
1:
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v4.8h
        urhadd v1.8h, v1.8h, v5.8h
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd v2.8h, v2.8h, v6.8h
        urhadd v3.8h, v3.8h, v7.8h
        subs w4, w4, #2
        urhadd v16.8h, v16.8h, v20.8h
        urhadd v17.8h, v17.8h, v21.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
        urhadd v18.8h, v18.8h, v22.8h
        urhadd v19.8h, v19.8h, v23.8h
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg16_16_neon, export=1
1:
        ld1 {v2.8h, v3.8h}, [x2], x3
        ld1 {v0.8h, v1.8h}, [x0]
        urhadd v0.8h, v0.8h, v2.8h
        urhadd v1.8h, v1.8h, v3.8h
        subs w4, w4, #1
        st1 {v0.8h, v1.8h}, [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg8_16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.8h}, [x2], x3
        ld1 {v0.8h}, [x0], x1
        ld1 {v3.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v2.8h
        ld1 {v1.8h}, [x0], x1
        urhadd v1.8h, v1.8h, v3.8h
        subs w4, w4, #2
        st1 {v0.8h}, [x5], x1
        st1 {v1.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg4_16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.4h}, [x2], x3
        ld1 {v0.4h}, [x0], x1
        ld1 {v3.4h}, [x2], x3
        urhadd v0.4h, v0.4h, v2.4h
        ld1 {v1.4h}, [x0], x1
        urhadd v1.4h, v1.4h, v3.4h
        subs w4, w4, #2
        st1 {v0.4h}, [x5], x1
        st1 {v1.4h}, [x5], x1
        b.ne 1b
        ret
endfunc
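
Each urhadd above is an unsigned rounding halving add; per pixel, the avg functions compute the following (scalar sketch):

    #include <stdint.h>

    static uint16_t urhadd_px(uint16_t dst, uint16_t ref)
    {
        // Rounding average, matching the NEON urhadd instruction.
        return (uint16_t)(((uint32_t)dst + ref + 1) >> 1);
    }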


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
// for size >= 16)
.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
        smlal \dst1\().4s, v20.4h, v0.h[\offset]
        smlal \dst5\().4s, v22.4h, v0.h[\offset]
.if \size >= 16
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
.endif
.if \size >= 8
        smlal2 \dst2\().4s, v20.8h, v0.h[\offset]
        smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
.endif
.if \size >= 16
        smlal \dst3\().4s, v21.4h, v0.h[\offset]
        smlal \dst7\().4s, v23.4h, v0.h[\offset]
        smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
        smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
.endif
.endm
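
Per output pixel, one extmlal invocation accumulates a single filter tap: ext slides the source window by 'offset' halfwords and smlal/smlal2 do a widening multiply-accumulate into 32-bit sums. A scalar sketch, where coeff stands for v0.h[offset]:

    #include <stdint.h>

    static void extmlal_px_sketch(int32_t *acc, const int16_t *src,
                                  int i, int offset, int16_t coeff)
    {
        // acc[i] accumulates tap 'offset' of the 8-tap filter for pixel i.
        acc[i] += src[i + offset] * (int32_t)coeff;
    }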


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width (in bytes) is passed in x5, the height in w4 and
// the filter coefficients in x9.
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        sub x2, x2, #6
        add x6, x0, x1
        add x7, x2, x3
        add x1, x1, x1
        add x3, x3, x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub x1, x1, x5
.endif
        // size >= 16 loads two qwords and increments r2,
        // for size 4/8 it's enough with one qword and no
        // postincrement
.if \size >= 16
        sub x3, x3, x5
        sub x3, x3, #16
.endif
        // Load the filter vector
        ld1 {v0.8h}, [x9]
1:
.if \size >= 16
        mov x9, x5
.endif
        // Load src
.if \size >= 16
        ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
        ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
.else
        ld1 {v5.8h, v6.8h}, [x2]
        ld1 {v16.8h, v17.8h}, [x7]
.endif
2:

        smull v1.4s, v5.4h, v0.h[0]
        smull v24.4s, v16.4h, v0.h[0]
.if \size >= 8
        smull2 v2.4s, v5.8h, v0.h[0]
        smull2 v25.4s, v16.8h, v0.h[0]
.endif
.if \size >= 16
        smull v3.4s, v6.4h, v0.h[0]
        smull v26.4s, v17.4h, v0.h[0]
        smull2 v4.4s, v6.8h, v0.h[0]
        smull2 v27.4s, v17.8h, v0.h[0]
.endif
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size

        // Round, shift and saturate
        // The sqrshrun takes care of clamping negative values to zero, but
        // we manually need to do umin with the max pixel value.
        sqrshrun v1.4h, v1.4s, #7
        sqrshrun v24.4h, v24.4s, #7
.if \size >= 8
        sqrshrun2 v1.8h, v2.4s, #7
        sqrshrun2 v24.8h, v25.4s, #7
        umin v1.8h, v1.8h, v31.8h
        umin v24.8h, v24.8h, v31.8h
.if \size >= 16
        sqrshrun v2.4h, v3.4s, #7
        sqrshrun v25.4h, v26.4s, #7
        sqrshrun2 v2.8h, v4.4s, #7
        sqrshrun2 v25.8h, v27.4s, #7
        umin v2.8h, v2.8h, v31.8h
        umin v25.8h, v25.8h, v31.8h
.endif
.else
        umin v1.4h, v1.4h, v31.4h
        umin v24.4h, v24.4h, v31.4h
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1 {v3.8h, v4.8h}, [x0]
        ld1 {v29.8h, v30.8h}, [x6]
        urhadd v1.8h, v1.8h, v3.8h
        urhadd v2.8h, v2.8h, v4.8h
        urhadd v24.8h, v24.8h, v29.8h
        urhadd v25.8h, v25.8h, v30.8h
.elseif \size >= 8
        ld1 {v3.8h}, [x0]
        ld1 {v4.8h}, [x6]
        urhadd v1.8h, v1.8h, v3.8h
        urhadd v24.8h, v24.8h, v4.8h
.else
        ld1 {v3.4h}, [x0]
        ld1 {v4.4h}, [x6]
        urhadd v1.4h, v1.4h, v3.4h
        urhadd v24.4h, v24.4h, v4.4h
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs x9, x9, #32
        st1 {v1.8h, v2.8h}, [x0], #32
        st1 {v24.8h, v25.8h}, [x6], #32
        b.eq 3f
        mov v5.16b, v7.16b
        mov v16.16b, v18.16b
        ld1 {v6.8h, v7.8h}, [x2], #32
        ld1 {v17.8h, v18.8h}, [x7], #32
        b 2b
.elseif \size == 8
        st1 {v1.8h}, [x0]
        st1 {v24.8h}, [x6]
.else // \size == 4
        st1 {v1.4h}, [x0]
        st1 {v24.4h}, [x6]
.endif
3:
        // Loop vertically
        add x0, x0, x1
        add x6, x6, x1
        add x2, x2, x3
        add x7, x7, x3
        subs w4, w4, #2
        b.ne 1b
        ret
endfunc
.endm
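
The sqrshrun #7 + umin pair in the macro above narrows each 32-bit sum back to a pixel; per pixel this amounts to the following (scalar sketch, max_pixel being what v31 holds):

    #include <stdint.h>

    static uint16_t narrow_px_sketch(int32_t acc, uint16_t max_pixel)
    {
        int32_t v = (acc + 64) >> 7;   // round, drop the filter scale of 128
        if (v < 0)
            v = 0;                     // sqrshrun saturates negatives to zero
        return v > max_pixel ? max_pixel : (uint16_t)v;  // the umin clamp
    }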

.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel x6, X(ff_vp9_subpel_filters), 256*\offset
        cmp w5, #8
        add x9, x6, w5, uxtw #4
        mov x5, #2*\size
.if \size >= 16
        b \type\()_8tap_16h
.else
        b \type\()_8tap_\size\()h
.endif
endfunc
.endm

.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp, 2, \size, \bpp
do_8tap_h_func avg, sharp, 2, \size, \bpp
do_8tap_h_func put, smooth, 0, \size, \bpp
do_8tap_h_func avg, smooth, 0, \size, \bpp
.endm

.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8, \bpp
do_8tap_h_filters 4, \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12


// Vertical filters

// Round, shift and saturate and store reg1-reg4
.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        sqrshrun \reg1\().4h, \reg1\().4s, #7
        sqrshrun \reg2\().4h, \reg2\().4s, #7
        sqrshrun \reg3\().4h, \reg3\().4s, #7
        sqrshrun \reg4\().4h, \reg4\().4s, #7
.ifc \type,avg
        ld1 {\tmp1\().4h}, [x7], x1
        ld1 {\tmp2\().4h}, [x7], x1
        ld1 {\tmp3\().4h}, [x7], x1
        ld1 {\tmp4\().4h}, [x7], x1
.endif
        umin \reg1\().4h, \reg1\().4h, \minreg\().4h
        umin \reg2\().4h, \reg2\().4h, \minreg\().4h
        umin \reg3\().4h, \reg3\().4h, \minreg\().4h
        umin \reg4\().4h, \reg4\().4h, \minreg\().4h
.ifc \type,avg
        urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h
        urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
        urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
        urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
.endif
        st1 {\reg1\().4h}, [x0], x1
        st1 {\reg2\().4h}, [x0], x1
        st1 {\reg3\().4h}, [x0], x1
        st1 {\reg4\().4h}, [x0], x1
.endm

// Round, shift and saturate and store reg1-8, where
// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
        sqrshrun \reg1\().4h, \reg1\().4s, #7
        sqrshrun2 \reg1\().8h, \reg2\().4s, #7
        sqrshrun \reg2\().4h, \reg3\().4s, #7
        sqrshrun2 \reg2\().8h, \reg4\().4s, #7
        sqrshrun \reg3\().4h, \reg5\().4s, #7
        sqrshrun2 \reg3\().8h, \reg6\().4s, #7
        sqrshrun \reg4\().4h, \reg7\().4s, #7
        sqrshrun2 \reg4\().8h, \reg8\().4s, #7
.ifc \type,avg
        ld1 {\reg5\().8h}, [x7], x1
        ld1 {\reg6\().8h}, [x7], x1
        ld1 {\reg7\().8h}, [x7], x1
        ld1 {\reg8\().8h}, [x7], x1
.endif
        umin \reg1\().8h, \reg1\().8h, \minreg\().8h
        umin \reg2\().8h, \reg2\().8h, \minreg\().8h
        umin \reg3\().8h, \reg3\().8h, \minreg\().8h
        umin \reg4\().8h, \reg4\().8h, \minreg\().8h
.ifc \type,avg
        urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h
        urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
        urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
        urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
.endif
        st1 {\reg1\().8h}, [x0], x1
        st1 {\reg2\().8h}, [x0], x1
        st1 {\reg3\().8h}, [x0], x1
        st1 {\reg4\().8h}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2).
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        smull \dst1\().4s, \src1\().4h, v0.h[0]
        smull \dst2\().4s, \src2\().4h, v0.h[0]
        smull \tmp1\().4s, \src2\().4h, v0.h[1]
        smull \tmp2\().4s, \src3\().4h, v0.h[1]
        smlal \dst1\().4s, \src3\().4h, v0.h[2]
        smlal \dst2\().4s, \src4\().4h, v0.h[2]
        smlal \tmp1\().4s, \src4\().4h, v0.h[3]
        smlal \tmp2\().4s, \src5\().4h, v0.h[3]
        smlal \dst1\().4s, \src5\().4h, v0.h[4]
        smlal \dst2\().4s, \src6\().4h, v0.h[4]
        smlal \tmp1\().4s, \src6\().4h, v0.h[5]
        smlal \tmp2\().4s, \src7\().4h, v0.h[5]
        smlal \dst1\().4s, \src7\().4h, v0.h[6]
        smlal \dst2\().4s, \src8\().4h, v0.h[6]
        smlal \tmp1\().4s, \src8\().4h, v0.h[7]
        smlal \tmp2\().4s, \src9\().4h, v0.h[7]
        add \dst1\().4s, \dst1\().4s, \tmp1\().4s
        add \dst2\().4s, \dst2\().4s, \tmp2\().4s
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
        smull \dst1\().4s, \src1\().4h, v0.h[0]
        smull2 \dst2\().4s, \src1\().8h, v0.h[0]
        smull \dst3\().4s, \src2\().4h, v0.h[0]
        smull2 \dst4\().4s, \src2\().8h, v0.h[0]
        smlal \dst1\().4s, \src2\().4h, v0.h[1]
        smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
        smlal \dst3\().4s, \src3\().4h, v0.h[1]
        smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
        smlal \dst1\().4s, \src3\().4h, v0.h[2]
        smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
        smlal \dst3\().4s, \src4\().4h, v0.h[2]
        smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
        smlal \dst1\().4s, \src4\().4h, v0.h[3]
        smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
        smlal \dst3\().4s, \src5\().4h, v0.h[3]
        smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
        smlal \dst1\().4s, \src5\().4h, v0.h[4]
        smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
        smlal \dst3\().4s, \src6\().4h, v0.h[4]
        smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
        smlal \dst1\().4s, \src6\().4h, v0.h[5]
        smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
        smlal \dst3\().4s, \src7\().4h, v0.h[5]
        smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
        smlal \dst1\().4s, \src7\().4h, v0.h[6]
        smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
        smlal \dst3\().4s, \src8\().4h, v0.h[6]
        smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
        smlal \dst1\().4s, \src8\().4h, v0.h[7]
        smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
        smlal \dst3\().4s, \src9\().4h, v0.h[7]
        smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6.
.macro do_8tap_8v type
function \type\()_8tap_8v
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
1:
.ifc \type,avg
        mov x7, x0
.endif
        mov x6, x4

        ld1 {v17.8h}, [x2], x3
        ld1 {v18.8h}, [x2], x3
        ld1 {v19.8h}, [x2], x3
        ld1 {v20.8h}, [x2], x3
        ld1 {v21.8h}, [x2], x3
        ld1 {v22.8h}, [x2], x3
        ld1 {v23.8h}, [x2], x3
2:
        ld1 {v24.8h}, [x2], x3
        ld1 {v25.8h}, [x2], x3
        ld1 {v26.8h}, [x2], x3
        ld1 {v27.8h}, [x2], x3

        convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
        convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
        do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type

        subs x6, x6, #4
        b.eq 8f

        ld1 {v16.8h}, [x2], x3
        ld1 {v17.8h}, [x2], x3
        ld1 {v18.8h}, [x2], x3
        ld1 {v19.8h}, [x2], x3
        convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
        convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
        do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type

        subs x6, x6, #4
        b.eq 8f

        ld1 {v20.8h}, [x2], x3
        ld1 {v21.8h}, [x2], x3
        ld1 {v22.8h}, [x2], x3
        ld1 {v23.8h}, [x2], x3
        convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
        convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
        do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type

        subs x6, x6, #4
        b.ne 2b

8:
        subs x5, x5, #8
        b.eq 9f
        // x0 -= h * dst_stride
        msub x0, x1, x4, x0
        // x2 -= h * src_stride
        msub x2, x3, x4, x2
        // x2 -= 8 * src_stride
        sub x2, x2, x3, lsl #3
        // x2 += 1 * src_stride
        add x2, x2, x3
        add x2, x2, #16
        add x0, x0, #16
        b 1b
9:
        ret
endfunc
.endm
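
The pointer bookkeeping in the '8:' block above reads as follows in C terms (a sketch of my reading of the msub/sub/add sequence): after finishing a column of height h, both pointers rewind vertically and step one 8-pixel (16-byte) column to the right, with the source also backing up over its 7-row filter lead-in:

    #include <stddef.h>
    #include <stdint.h>

    static void advance_column_sketch(uint8_t **dst, ptrdiff_t dst_stride,
                                      const uint8_t **src, ptrdiff_t src_stride,
                                      int h)
    {
        *dst += 16 - h * dst_stride;        // x0: msub, then add #16
        *src += 16 - (h + 7) * src_stride;  // x2: msub, -8 rows, +1 row, +16
    }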

do_8tap_8v put
do_8tap_8v avg


// Instantiate a vertical filter function for filtering a 4 pixels wide
// slice. This only is designed to work for 4 or 8 output lines.
.macro do_8tap_4v type
function \type\()_8tap_4v
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
.ifc \type,avg
        mov x7, x0
.endif

        ld1 {v16.4h}, [x2], x3
        ld1 {v17.4h}, [x2], x3
        ld1 {v18.4h}, [x2], x3
        ld1 {v19.4h}, [x2], x3
        ld1 {v20.4h}, [x2], x3
        ld1 {v21.4h}, [x2], x3
        ld1 {v22.4h}, [x2], x3
        ld1 {v23.4h}, [x2], x3
        ld1 {v24.4h}, [x2], x3
        ld1 {v25.4h}, [x2], x3
        ld1 {v26.4h}, [x2], x3

        convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
        convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
        do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type

        subs x4, x4, #4
        b.eq 9f

        ld1 {v27.4h}, [x2], x3
        ld1 {v28.4h}, [x2], x3
        ld1 {v29.4h}, [x2], x3
        ld1 {v30.4h}, [x2], x3

        convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
        convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
        do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type

9:
        ret
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg


.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        uxtw x4, w4
        mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel x5, X(ff_vp9_subpel_filters), 256*\offset
        add x6, x5, w6, uxtw #4
        mov x5, #\size
.if \size >= 8
        b \type\()_8tap_8v
.else
        b \type\()_8tap_4v
.endif
endfunc
.endm

.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp, 2, \size, \bpp
do_8tap_v_func avg, sharp, 2, \size, \bpp
do_8tap_v_func put, smooth, 0, \size, \bpp
do_8tap_v_func avg, smooth, 0, \size, \bpp
.endm

.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8, \bpp
do_8tap_v_filters 4, \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12
81
externals/ffmpeg/libavcodec/aarch64/vp9mc_aarch64.S
vendored
Executable file
@@ -0,0 +1,81 @@
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

function ff_vp9_copy128_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        ldp x9, x10, [x2, #32]
        stp x7, x8, [x0, #16]
        subs w4, w4, #1
        ldp x11, x12, [x2, #48]
        stp x9, x10, [x0, #32]
        stp x11, x12, [x0, #48]
        ldp x5, x6, [x2, #64]
        ldp x7, x8, [x2, #80]
        stp x5, x6, [x0, #64]
        ldp x9, x10, [x2, #96]
        stp x7, x8, [x0, #80]
        ldp x11, x12, [x2, #112]
        stp x9, x10, [x0, #96]
        stp x11, x12, [x0, #112]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc
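
For reference, the copy functions implement a plain row-by-row block copy of the vp9_mc_func signature quoted above (mx/my are unused in the unfiltered copy case); a scalar sketch:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void vp9_copy_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *ref, ptrdiff_t ref_stride,
                                int h, int w /* 128, 64 or 32 bytes */)
    {
        while (h--) {
            memcpy(dst, ref, w);
            dst += dst_stride;
            ref += ref_stride;
        }
    }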

function ff_vp9_copy64_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        ldp x9, x10, [x2, #32]
        stp x7, x8, [x0, #16]
        subs w4, w4, #1
        ldp x11, x12, [x2, #48]
        stp x9, x10, [x0, #32]
        stp x11, x12, [x0, #48]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy32_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        subs w4, w4, #1
        stp x7, x8, [x0, #16]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc
657
externals/ffmpeg/libavcodec/aarch64/vp9mc_neon.S
vendored
Executable file
@@ -0,0 +1,657 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Google Inc.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// All public functions in this file have the following signature:
|
||||
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
// const uint8_t *ref, ptrdiff_t ref_stride,
|
||||
// int h, int mx, int my);
|
||||
|
||||
function ff_vp9_avg64_neon, export=1
|
||||
mov x5, x0
|
||||
1:
|
||||
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
|
||||
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
|
||||
urhadd v0.16b, v0.16b, v4.16b
|
||||
urhadd v1.16b, v1.16b, v5.16b
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
|
||||
urhadd v2.16b, v2.16b, v6.16b
|
||||
urhadd v3.16b, v3.16b, v7.16b
|
||||
subs w4, w4, #2
|
||||
urhadd v16.16b, v16.16b, v20.16b
|
||||
urhadd v17.16b, v17.16b, v21.16b
|
||||
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
|
||||
urhadd v18.16b, v18.16b, v22.16b
|
||||
urhadd v19.16b, v19.16b, v23.16b
|
||||
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_vp9_avg32_neon, export=1
|
||||
1:
|
||||
ld1 {v2.16b, v3.16b}, [x2], x3
|
||||
ld1 {v0.16b, v1.16b}, [x0]
|
||||
urhadd v0.16b, v0.16b, v2.16b
|
||||
urhadd v1.16b, v1.16b, v3.16b
|
||||
subs w4, w4, #1
|
||||
st1 {v0.16b, v1.16b}, [x0], x1
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_vp9_copy16_neon, export=1
        add x5, x0, x1
        lsl x1, x1, #1
        add x6, x2, x3
        lsl x3, x3, #1
1:
        ld1 {v0.16b}, [x2], x3
        ld1 {v1.16b}, [x6], x3
        ld1 {v2.16b}, [x2], x3
        ld1 {v3.16b}, [x6], x3
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v1.16b}, [x5], x1
        st1 {v2.16b}, [x0], x1
        st1 {v3.16b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.16b}, [x2], x3
        ld1 {v0.16b}, [x0], x1
        ld1 {v3.16b}, [x2], x3
        urhadd v0.16b, v0.16b, v2.16b
        ld1 {v1.16b}, [x0], x1
        urhadd v1.16b, v1.16b, v3.16b
        subs w4, w4, #2
        st1 {v0.16b}, [x5], x1
        st1 {v1.16b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy8_neon, export=1
1:
        ld1 {v0.8b}, [x2], x3
        ld1 {v1.8b}, [x2], x3
        subs w4, w4, #2
        st1 {v0.8b}, [x0], x1
        st1 {v1.8b}, [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg8_neon, export=1
        mov x5, x0
1:
        ld1 {v2.8b}, [x2], x3
        ld1 {v0.8b}, [x0], x1
        ld1 {v3.8b}, [x2], x3
        urhadd v0.8b, v0.8b, v2.8b
        ld1 {v1.8b}, [x0], x1
        urhadd v1.8b, v1.8b, v3.8b
        subs w4, w4, #2
        st1 {v0.8b}, [x5], x1
        st1 {v1.8b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy4_neon, export=1
1:
        ld1 {v0.s}[0], [x2], x3
        ld1 {v1.s}[0], [x2], x3
        st1 {v0.s}[0], [x0], x1
        ld1 {v2.s}[0], [x2], x3
        st1 {v1.s}[0], [x0], x1
        ld1 {v3.s}[0], [x2], x3
        subs w4, w4, #4
        st1 {v2.s}[0], [x0], x1
        st1 {v3.s}[0], [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg4_neon, export=1
        mov x5, x0
1:
        ld1 {v2.s}[0], [x2], x3
        ld1 {v0.s}[0], [x0], x1
        ld1 {v2.s}[1], [x2], x3
        ld1 {v0.s}[1], [x0], x1
        ld1 {v3.s}[0], [x2], x3
        ld1 {v1.s}[0], [x0], x1
        ld1 {v3.s}[1], [x2], x3
        ld1 {v1.s}[1], [x0], x1
        subs w4, w4, #4
        urhadd v0.8b, v0.8b, v2.8b
        urhadd v1.8b, v1.8b, v3.8b
        st1 {v0.s}[0], [x5], x1
        st1 {v0.s}[1], [x5], x1
        st1 {v1.s}[0], [x5], x1
        st1 {v1.s}[1], [x5], x1
        b.ne 1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
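// For example, with offset 1 the first ext below yields the 16 bytes
// src1[2..15]:src2[0..1], i.e. the same row of widened pixels shifted
// along by one position - the sliding window needed for tap \offset.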
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mla \dst1\().8h, v20.8h, v0.h[\offset]
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mla \dst3\().8h, v22.8h, v0.h[\offset]
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mla \dst2\().8h, v21.8h, v0.h[\offset]
        mla \dst4\().8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mla \dst1\().8h, v20.8h, v0.h[\offset]
        mla \dst3\().8h, v22.8h, v0.h[\offset]
.else
        mla \dst1\().4h, v20.4h, v0.h[\offset]
        mla \dst3\().4h, v22.4h, v0.h[\offset]
.endif
.endm
// The same as above, but instead of accumulating straight into the
// destination, use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mul v20.8h, v20.8h, v0.h[\offset]
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mul v22.8h, v22.8h, v0.h[\offset]
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mul v21.8h, v21.8h, v0.h[\offset]
        mul v23.8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mul v20.8h, v20.8h, v0.h[\offset]
        mul v22.8h, v22.8h, v0.h[\offset]
.else
        mul v20.4h, v20.4h, v0.h[\offset]
        mul v22.4h, v22.4h, v0.h[\offset]
.endif
.if \size == 4
        sqadd \dst1\().4h, \dst1\().4h, v20.4h
        sqadd \dst3\().4h, \dst3\().4h, v22.4h
.else
        sqadd \dst1\().8h, \dst1\().8h, v20.8h
        sqadd \dst3\().8h, \dst3\().8h, v22.8h
.if \size >= 16
        sqadd \dst2\().8h, \dst2\().8h, v21.8h
        sqadd \dst4\().8h, \dst4\().8h, v23.8h
.endif
.endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one of them.
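// For instance, "do_8tap_h put, 16, 3, 4" expands to put_8tap_16h_34,
// the variant used when tap 4 holds the largest coefficient.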
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub x2, x2, #3
        add x6, x0, x1
        add x7, x2, x3
        add x1, x1, x1
        add x3, x3, x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub x1, x1, x5
.endif
        // size >= 16 loads two qwords and increments x2;
        // for size 4/8 a single qword with no postincrement
        // is enough
.if \size >= 16
        sub x3, x3, x5
        sub x3, x3, #8
.endif
        // Load the filter vector
        ld1 {v0.8h}, [x9]
1:
.if \size >= 16
        mov x9, x5
.endif
        // Load src
.if \size >= 16
        ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24
        ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24
.else
        ld1 {v4.8b, v5.8b}, [x2]
        ld1 {v16.8b, v17.8b}, [x7]
.endif
        uxtl v4.8h, v4.8b
        uxtl v5.8h, v5.8b
        uxtl v16.8h, v16.8b
        uxtl v17.8h, v17.8b
.if \size >= 16
        uxtl v6.8h, v6.8b
        uxtl v18.8h, v18.8b
.endif
2:

        // Accumulate, adding idx2 last with a separate
        // saturating add. The positive filter coefficients
        // for all indices except idx2 must add up to less
        // than 127 for this not to overflow.
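        // (Editor's worked bound: pixels are at most 255, so the plain
        // mla sums stay below 255*127 = 32385 < 32767 and cannot wrap a
        // signed 16-bit lane; only the idx2 tap needs the saturating add.)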
        mul v1.8h, v4.8h, v0.h[0]
        mul v24.8h, v16.8h, v0.h[0]
.if \size >= 16
        mul v2.8h, v5.8h, v0.h[0]
        mul v25.8h, v17.8h, v0.h[0]
.endif
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
        extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size

        // Round, shift and saturate
        sqrshrun v1.8b, v1.8h, #7
        sqrshrun v24.8b, v24.8h, #7
.if \size >= 16
        sqrshrun2 v1.16b, v2.8h, #7
        sqrshrun2 v24.16b, v25.8h, #7
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1 {v2.16b}, [x0]
        ld1 {v3.16b}, [x6]
        urhadd v1.16b, v1.16b, v2.16b
        urhadd v24.16b, v24.16b, v3.16b
.elseif \size == 8
        ld1 {v2.8b}, [x0]
        ld1 {v3.8b}, [x6]
        urhadd v1.8b, v1.8b, v2.8b
        urhadd v24.8b, v24.8b, v3.8b
.else
        ld1 {v2.s}[0], [x0]
        ld1 {v3.s}[0], [x6]
        urhadd v1.8b, v1.8b, v2.8b
        urhadd v24.8b, v24.8b, v3.8b
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs x9, x9, #16
        st1 {v1.16b}, [x0], #16
        st1 {v24.16b}, [x6], #16
        b.eq 3f
        mov v4.16b, v6.16b
        mov v16.16b, v18.16b
        ld1 {v6.16b}, [x2], #16
        ld1 {v18.16b}, [x7], #16
        uxtl v5.8h, v6.8b
        uxtl2 v6.8h, v6.16b
        uxtl v17.8h, v18.8b
        uxtl2 v18.8h, v18.16b
        b 2b
.elseif \size == 8
        st1 {v1.8b}, [x0]
        st1 {v24.8b}, [x6]
.else // \size == 4
        st1 {v1.s}[0], [x0]
        st1 {v24.s}[0], [x6]
.endif
3:
        // Loop vertically
        add x0, x0, x1
        add x6, x6, x1
        add x2, x2, x3
        add x7, x7, x3
        subs w4, w4, #2
        b.ne 1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16
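
// Editor's note on the dispatch below: ff_vp9_subpel_filters holds one
// 256-byte bank per filter type (smooth, regular, sharp - hence the
// 256*\offset), each bank holding 16 subpel positions of eight 16-bit
// taps, so the mx index is scaled by 16 bytes via "uxtw #4".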

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrel x6, X(ff_vp9_subpel_filters), 256*\offset
        cmp w5, #8
        add x9, x6, w5, uxtw #4
        mov x5, #\size
.if \size >= 16
        b.ge \type\()_8tap_16h_34
        b \type\()_8tap_16h_43
.else
        b.ge \type\()_8tap_\size\()h_34
        b \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp, 2, \size
do_8tap_h_func avg, sharp, 2, \size
do_8tap_h_func put, smooth, 0, \size
do_8tap_h_func avg, smooth, 0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4


// Vertical filters

// Round, shift and saturate and store reg1-reg2 over 4 lines
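// (sqrshrun #7 is a rounding shift right by 7 with unsigned saturating
// narrow; the eight taps sum to 128, so (x + 64) >> 7 returns the
// accumulator to the 8-bit pixel range.)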
.macro do_store4 reg1, reg2, tmp1, tmp2, type
        sqrshrun \reg1\().8b, \reg1\().8h, #7
        sqrshrun \reg2\().8b, \reg2\().8h, #7
.ifc \type,avg
        ld1 {\tmp1\().s}[0], [x7], x1
        ld1 {\tmp2\().s}[0], [x7], x1
        ld1 {\tmp1\().s}[1], [x7], x1
        ld1 {\tmp2\().s}[1], [x7], x1
        urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
.endif
        st1 {\reg1\().s}[0], [x0], x1
        st1 {\reg2\().s}[0], [x0], x1
        st1 {\reg1\().s}[1], [x0], x1
        st1 {\reg2\().s}[1], [x0], x1
.endm

// Round, shift and saturate and store reg1-4
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
        sqrshrun \reg1\().8b, \reg1\().8h, #7
        sqrshrun \reg2\().8b, \reg2\().8h, #7
        sqrshrun \reg3\().8b, \reg3\().8h, #7
        sqrshrun \reg4\().8b, \reg4\().8h, #7
.ifc \type,avg
        ld1 {\tmp1\().8b}, [x7], x1
        ld1 {\tmp2\().8b}, [x7], x1
        ld1 {\tmp3\().8b}, [x7], x1
        ld1 {\tmp4\().8b}, [x7], x1
        urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
        urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b
        urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b
.endif
        st1 {\reg1\().8b}, [x0], x1
        st1 {\reg2\().8b}, [x0], x1
        st1 {\reg3\().8b}, [x0], x1
        st1 {\reg4\().8b}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
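// (Editor's reading: dst1-dst2 then hold only the moderate positive
// taps, which cannot overflow by themselves, while pairing the largest
// tap with the two non-positive ones in tmp1-tmp2 only shrinks tmp's
// magnitude, leaving the final sqadd as the one place that saturates.)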
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        mul \dst1\().8h, \src2\().8h, v0.h[1]
        mul \dst2\().8h, \src3\().8h, v0.h[1]
        mul \tmp1\().8h, \src1\().8h, v0.h[0]
        mul \tmp2\().8h, \src2\().8h, v0.h[0]
        mla \dst1\().8h, \src3\().8h, v0.h[2]
        mla \dst2\().8h, \src4\().8h, v0.h[2]
.if \idx1 == 3
        mla \dst1\().8h, \src4\().8h, v0.h[3]
        mla \dst2\().8h, \src5\().8h, v0.h[3]
.else
        mla \dst1\().8h, \src5\().8h, v0.h[4]
        mla \dst2\().8h, \src6\().8h, v0.h[4]
.endif
        mla \dst1\().8h, \src6\().8h, v0.h[5]
        mla \dst2\().8h, \src7\().8h, v0.h[5]
        mla \tmp1\().8h, \src8\().8h, v0.h[7]
        mla \tmp2\().8h, \src9\().8h, v0.h[7]
        mla \dst1\().8h, \src7\().8h, v0.h[6]
        mla \dst2\().8h, \src8\().8h, v0.h[6]
.if \idx2 == 3
        mla \tmp1\().8h, \src4\().8h, v0.h[3]
        mla \tmp2\().8h, \src5\().8h, v0.h[3]
.else
        mla \tmp1\().8h, \src5\().8h, v0.h[4]
        mla \tmp2\().8h, \src6\().8h, v0.h[4]
.endif
        sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h
        sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm

// Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        ld1 {v1.8b}, [x2], x3
        ld1 {v2.8b}, [x2], x3
        ld1 {v3.8b}, [x2], x3
.ifnb \dst4
        ld1 {v4.8b}, [x2], x3
.endif
        uxtl \dst1\().8h, v1.8b
        uxtl \dst2\().8h, v2.8b
        uxtl \dst3\().8h, v3.8b
.ifnb \dst4
        uxtl \dst4\().8h, v4.8b
.endif
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other one of them.
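// The inner loop consumes rows in a rolling pattern: its three 4-row
// steps load into different parts of v16-v27 in turn, so the seven
// rows of context the 8-tap filter needs are never reloaded.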
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
1:
.ifc \type,avg
        mov x7, x0
.endif
        mov x6, x4

        loadl v17, v18, v19

        loadl v20, v21, v22, v23
2:
        loadl v24, v25, v26, v27
        convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
        convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.eq 8f

        loadl v16, v17, v18, v19
        convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
        convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.eq 8f

        loadl v20, v21, v22, v23
        convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
        convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.ne 2b

8:
        subs x5, x5, #8
        b.eq 9f
        // x0 -= h * dst_stride
        msub x0, x1, x4, x0
        // x2 -= h * src_stride
        msub x2, x3, x4, x2
        // x2 -= 8 * src_stride
        sub x2, x2, x3, lsl #3
        // x2 += 1 * src_stride
        add x2, x2, x3
        add x2, x2, #8
        add x0, x0, #8
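        // Net effect: both pointers return to the first row of the block
        // (x2 keeping its 3-row lead-in), moved 8 pixels to the right for
        // the next vertical stripe.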
        b 1b
9:
        ret
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3


// Instantiate a vertical filter function for filtering a 4 pixels wide
// slice. The first half of the registers contains one row, while the second
// half of a register contains the second-next row (also stored in the first
// half of the register two steps ahead). The convolution does two outputs
// at a time; the output of v17-v24 into one, and v18-v25 into another one.
// The first half of the first output is the first output row, the first
// half of the other output is the second output row. The second halves of
// the registers are rows 3 and 4.
// This is only designed to work for 4 or 8 output lines.
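// Concretely, after the trn1/uxtl setup below, v17 = {row0 | row2} and
// v18 = {row1 | row3} as 4-pixel halves, so the v17-v24 convolve
// produces output rows 0 and 2 and the v18-v25 convolve rows 1 and 3.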
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
.ifc \type,avg
        mov x7, x0
.endif

        ld1 {v1.s}[0], [x2], x3
        ld1 {v2.s}[0], [x2], x3
        ld1 {v3.s}[0], [x2], x3
        ld1 {v4.s}[0], [x2], x3
        ld1 {v5.s}[0], [x2], x3
        ld1 {v6.s}[0], [x2], x3
        trn1 v1.2s, v1.2s, v3.2s
        ld1 {v7.s}[0], [x2], x3
        trn1 v2.2s, v2.2s, v4.2s
        ld1 {v26.s}[0], [x2], x3
        uxtl v17.8h, v1.8b
        trn1 v3.2s, v3.2s, v5.2s
        ld1 {v27.s}[0], [x2], x3
        uxtl v18.8h, v2.8b
        trn1 v4.2s, v4.2s, v6.2s
        ld1 {v28.s}[0], [x2], x3
        uxtl v19.8h, v3.8b
        trn1 v5.2s, v5.2s, v7.2s
        ld1 {v29.s}[0], [x2], x3
        uxtl v20.8h, v4.8b
        trn1 v6.2s, v6.2s, v26.2s
        uxtl v21.8h, v5.8b
        trn1 v7.2s, v7.2s, v27.2s
        uxtl v22.8h, v6.8b
        trn1 v26.2s, v26.2s, v28.2s
        uxtl v23.8h, v7.8b
        trn1 v27.2s, v27.2s, v29.2s
        uxtl v24.8h, v26.8b
        uxtl v25.8h, v27.8b

        convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
        do_store4 v1, v2, v5, v6, \type

        subs x4, x4, #4
        b.eq 9f

        ld1 {v1.s}[0], [x2], x3
        ld1 {v2.s}[0], [x2], x3
        trn1 v28.2s, v28.2s, v1.2s
        trn1 v29.2s, v29.2s, v2.2s
        ld1 {v1.s}[1], [x2], x3
        uxtl v26.8h, v28.8b
        ld1 {v2.s}[1], [x2], x3
        uxtl v27.8h, v29.8b
        uxtl v28.8h, v1.8b
        uxtl v29.8h, v2.8b

        convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
        do_store4 v1, v2, v5, v6, \type

9:
        ret
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3


.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        uxtw x4, w4
        movrel x5, X(ff_vp9_subpel_filters), 256*\offset
        cmp w6, #8
        add x6, x5, w6, uxtw #4
        mov x5, #\size
.if \size >= 8
        b.ge \type\()_8tap_8v_34
        b \type\()_8tap_8v_43
.else
        b.ge \type\()_8tap_4v_34
        b \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp, 2, \size
do_8tap_v_func avg, sharp, 2, \size
do_8tap_v_func put, smooth, 0, \size
do_8tap_v_func avg, smooth, 0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4