early-access version 1432
This commit is contained in:
158
externals/ffmpeg/libavcodec/arm/Makefile
vendored
Executable file
158
externals/ffmpeg/libavcodec/arm/Makefile
vendored
Executable file
@@ -0,0 +1,158 @@
|
||||
ARCH_HEADERS = mathops.h
|
||||
|
||||
# subsystems
|
||||
OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
|
||||
arm/ac3dsp_arm.o
|
||||
OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o
|
||||
OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o
|
||||
OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o \
|
||||
arm/fft_fixed_init_arm.o
|
||||
OBJS-$(CONFIG_FLACDSP) += arm/flacdsp_init_arm.o \
|
||||
arm/flacdsp_arm.o
|
||||
OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_init_arm.o
|
||||
OBJS-$(CONFIG_G722DSP) += arm/g722dsp_init_arm.o
|
||||
OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o
|
||||
OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
|
||||
OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
|
||||
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
|
||||
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
|
||||
arm/hpeldsp_arm.o
|
||||
OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \
|
||||
arm/idctdsp_arm.o \
|
||||
arm/jrevdct_arm.o \
|
||||
arm/simple_idct_arm.o
|
||||
OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o
|
||||
OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o
|
||||
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
|
||||
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
|
||||
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o
|
||||
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
|
||||
OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o
|
||||
OBJS-$(CONFIG_RDFT) += arm/rdft_init_arm.o
|
||||
OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o
|
||||
OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_arm.o
|
||||
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o
|
||||
OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
|
||||
OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o
|
||||
|
||||
# decoders/encoders
|
||||
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
|
||||
arm/sbrdsp_init_arm.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
|
||||
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
|
||||
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
|
||||
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
|
||||
OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
|
||||
OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_init_arm.o
|
||||
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
|
||||
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
|
||||
OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \
|
||||
arm/vp9dsp_init_12bpp_arm.o \
|
||||
arm/vp9dsp_init_arm.o
|
||||
|
||||
|
||||
# ARMv5 optimizations
|
||||
# subsystems
|
||||
ARMV5TE-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv5te.o \
|
||||
arm/simple_idct_armv5te.o
|
||||
ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \
|
||||
arm/mpegvideo_armv5te_s.o
|
||||
ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \
|
||||
arm/videodsp_armv5te.o
|
||||
|
||||
# decoders/encoders
|
||||
ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o
|
||||
ARMV5TE-OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_armv5te.o
|
||||
|
||||
|
||||
# ARMv6 optimizations
|
||||
# subsystems
|
||||
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
|
||||
arm/hpeldsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \
|
||||
arm/idctdsp_armv6.o \
|
||||
arm/simple_idct_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_STARTCODE) += arm/startcode_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_VP8DSP) += arm/vp8_armv6.o \
|
||||
arm/vp8dsp_init_armv6.o \
|
||||
arm/vp8dsp_armv6.o
|
||||
|
||||
# decoders/encoders
|
||||
ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_armv6.o
|
||||
|
||||
|
||||
# VFP optimizations
|
||||
|
||||
# subsystems
|
||||
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
|
||||
VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
|
||||
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
|
||||
|
||||
# decoders/encoders
|
||||
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
|
||||
|
||||
|
||||
# NEON optimizations
|
||||
|
||||
# subsystems
|
||||
NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_neon.o \
|
||||
arm/audiodsp_neon.o \
|
||||
arm/int_neon.o
|
||||
NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \
|
||||
arm/blockdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
|
||||
arm/fft_fixed_neon.o
|
||||
NEON-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_neon.o
|
||||
NEON-OBJS-$(CONFIG_G722DSP) += arm/g722dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_neon.o \
|
||||
arm/h264idct_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \
|
||||
arm/hpeldsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \
|
||||
arm/hpeldsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_neon.o \
|
||||
arm/idctdsp_neon.o \
|
||||
arm/simple_idct_neon.o
|
||||
NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
|
||||
arm/mdct_fixed_neon.o
|
||||
NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
|
||||
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o
|
||||
NEON-OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_neon.o \
|
||||
arm/vc1dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
|
||||
arm/vp8dsp_neon.o
|
||||
|
||||
# decoders/encoders
|
||||
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
|
||||
arm/sbrdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
|
||||
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
|
||||
arm/hevcdsp_deblock_neon.o \
|
||||
arm/hevcdsp_idct_neon.o \
|
||||
arm/hevcdsp_qpel_neon.o \
|
||||
arm/hevcdsp_sao_neon.o
|
||||
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
|
||||
arm/rv40dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \
|
||||
arm/vp9itxfm_neon.o \
|
||||
arm/vp9lpf_16bpp_neon.o \
|
||||
arm/vp9lpf_neon.o \
|
||||
arm/vp9mc_16bpp_neon.o \
|
||||
arm/vp9mc_neon.o
|
||||
143
externals/ffmpeg/libavcodec/arm/aac.h
vendored
Executable file
143
externals/ffmpeg/libavcodec/arm/aac.h
vendored
Executable file
@@ -0,0 +1,143 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_AAC_H
|
||||
#define AVCODEC_ARM_AAC_H
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#if HAVE_NEON_INLINE
|
||||
|
||||
#define VMUL2 VMUL2
|
||||
static inline float *VMUL2(float *dst, const float *v, unsigned idx,
|
||||
const float *scale)
|
||||
{
|
||||
unsigned v0, v1;
|
||||
__asm__ ("ubfx %0, %6, #0, #4 \n\t"
|
||||
"ubfx %1, %6, #4, #4 \n\t"
|
||||
"ldr %0, [%5, %0, lsl #2] \n\t"
|
||||
"ldr %1, [%5, %1, lsl #2] \n\t"
|
||||
"vld1.32 {d1[]}, [%7,:32] \n\t"
|
||||
"vmov d0, %0, %1 \n\t"
|
||||
"vmul.f32 d0, d0, d1 \n\t"
|
||||
"vst1.32 {d0}, [%2,:64]! \n\t"
|
||||
: "=&r"(v0), "=&r"(v1), "+r"(dst), "=m"(dst[0]), "=m"(dst[1])
|
||||
: "r"(v), "r"(idx), "r"(scale)
|
||||
: "d0", "d1");
|
||||
return dst;
|
||||
}
|
||||
|
||||
#define VMUL4 VMUL4
|
||||
static inline float *VMUL4(float *dst, const float *v, unsigned idx,
|
||||
const float *scale)
|
||||
{
|
||||
unsigned v0, v1, v2, v3;
|
||||
__asm__ ("ubfx %0, %10, #0, #2 \n\t"
|
||||
"ubfx %1, %10, #2, #2 \n\t"
|
||||
"ldr %0, [%9, %0, lsl #2] \n\t"
|
||||
"ubfx %2, %10, #4, #2 \n\t"
|
||||
"ldr %1, [%9, %1, lsl #2] \n\t"
|
||||
"ubfx %3, %10, #6, #2 \n\t"
|
||||
"ldr %2, [%9, %2, lsl #2] \n\t"
|
||||
"vmov d0, %0, %1 \n\t"
|
||||
"ldr %3, [%9, %3, lsl #2] \n\t"
|
||||
"vld1.32 {d2[],d3[]},[%11,:32] \n\t"
|
||||
"vmov d1, %2, %3 \n\t"
|
||||
"vmul.f32 q0, q0, q1 \n\t"
|
||||
"vst1.32 {q0}, [%4,:128]! \n\t"
|
||||
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
|
||||
"=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
|
||||
: "r"(v), "r"(idx), "r"(scale)
|
||||
: "d0", "d1", "d2", "d3");
|
||||
return dst;
|
||||
}
|
||||
|
||||
#define VMUL2S VMUL2S
|
||||
static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
|
||||
unsigned sign, const float *scale)
|
||||
{
|
||||
unsigned v0, v1, v2, v3;
|
||||
__asm__ ("ubfx %0, %8, #0, #4 \n\t"
|
||||
"ubfx %1, %8, #4, #4 \n\t"
|
||||
"ldr %0, [%7, %0, lsl #2] \n\t"
|
||||
"lsl %2, %10, #30 \n\t"
|
||||
"ldr %1, [%7, %1, lsl #2] \n\t"
|
||||
"lsl %3, %10, #31 \n\t"
|
||||
"vmov d0, %0, %1 \n\t"
|
||||
"bic %2, %2, #1<<30 \n\t"
|
||||
"vld1.32 {d1[]}, [%9,:32] \n\t"
|
||||
"vmov d2, %2, %3 \n\t"
|
||||
"veor d0, d0, d2 \n\t"
|
||||
"vmul.f32 d0, d0, d1 \n\t"
|
||||
"vst1.32 {d0}, [%4,:64]! \n\t"
|
||||
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
|
||||
"=m"(dst[0]), "=m"(dst[1])
|
||||
: "r"(v), "r"(idx), "r"(scale), "r"(sign)
|
||||
: "d0", "d1", "d2");
|
||||
return dst;
|
||||
}
|
||||
|
||||
#define VMUL4S VMUL4S
|
||||
static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
|
||||
unsigned sign, const float *scale)
|
||||
{
|
||||
unsigned v0, v1, v2, v3, nz;
|
||||
__asm__ ("vld1.32 {d2[],d3[]},[%13,:32] \n\t"
|
||||
"ubfx %0, %12, #0, #2 \n\t"
|
||||
"ubfx %1, %12, #2, #2 \n\t"
|
||||
"ldr %0, [%11,%0, lsl #2] \n\t"
|
||||
"ubfx %2, %12, #4, #2 \n\t"
|
||||
"ldr %1, [%11,%1, lsl #2] \n\t"
|
||||
"ubfx %3, %12, #6, #2 \n\t"
|
||||
"ldr %2, [%11,%2, lsl #2] \n\t"
|
||||
"vmov d0, %0, %1 \n\t"
|
||||
"ldr %3, [%11,%3, lsl #2] \n\t"
|
||||
"lsr %6, %12, #12 \n\t"
|
||||
"rbit %6, %6 \n\t"
|
||||
"vmov d1, %2, %3 \n\t"
|
||||
"lsls %6, %6, #1 \n\t"
|
||||
"and %0, %5, #1<<31 \n\t"
|
||||
"it cs \n\t"
|
||||
"lslcs %5, %5, #1 \n\t"
|
||||
"lsls %6, %6, #1 \n\t"
|
||||
"and %1, %5, #1<<31 \n\t"
|
||||
"it cs \n\t"
|
||||
"lslcs %5, %5, #1 \n\t"
|
||||
"lsls %6, %6, #1 \n\t"
|
||||
"and %2, %5, #1<<31 \n\t"
|
||||
"it cs \n\t"
|
||||
"lslcs %5, %5, #1 \n\t"
|
||||
"vmov d4, %0, %1 \n\t"
|
||||
"and %3, %5, #1<<31 \n\t"
|
||||
"vmov d5, %2, %3 \n\t"
|
||||
"veor q0, q0, q2 \n\t"
|
||||
"vmul.f32 q0, q0, q1 \n\t"
|
||||
"vst1.32 {q0}, [%4,:128]! \n\t"
|
||||
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
|
||||
"+r"(sign), "=r"(nz),
|
||||
"=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
|
||||
: "r"(v), "r"(idx), "r"(scale)
|
||||
: "cc", "d0", "d1", "d2", "d3", "d4", "d5");
|
||||
return dst;
|
||||
}
|
||||
|
||||
#endif /* HAVE_NEON_INLINE */
|
||||
|
||||
#endif /* AVCODEC_ARM_AAC_H */
|
||||
57
externals/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
vendored
Executable file
57
externals/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/aacpsdsp.h"
|
||||
|
||||
void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
|
||||
void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
|
||||
float *src1, int n);
|
||||
void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
|
||||
const float (*filter)[8][2],
|
||||
ptrdiff_t stride, int n);
|
||||
void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64],
|
||||
int i, int len);
|
||||
void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2],
|
||||
int i, int len);
|
||||
void ff_ps_decorrelate_neon(float (*out)[2], float (*delay)[2],
|
||||
float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
|
||||
const float phi_fract[2], float (*Q_fract)[2],
|
||||
const float *transient_gain, float g_decay_slope,
|
||||
int len);
|
||||
void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
|
||||
float h[2][4], float h_step[2][4],
|
||||
int len);
|
||||
|
||||
av_cold void ff_psdsp_init_arm(PSDSPContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
s->add_squares = ff_ps_add_squares_neon;
|
||||
s->mul_pair_single = ff_ps_mul_pair_single_neon;
|
||||
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_neon;
|
||||
s->hybrid_analysis = ff_ps_hybrid_analysis_neon;
|
||||
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
|
||||
}
|
||||
}
|
||||
273
externals/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
vendored
Executable file
273
externals/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
vendored
Executable file
@@ -0,0 +1,273 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_ps_add_squares_neon, export=1
|
||||
mov r3, r0
|
||||
sub r2, r2, #4
|
||||
vld1.32 {q0}, [r1,:128]!
|
||||
vmul.f32 q0, q0, q0
|
||||
vld1.32 {q2}, [r1,:128]!
|
||||
vmul.f32 q2, q2, q2
|
||||
vld1.32 {q1}, [r0,:128]!
|
||||
1:
|
||||
vpadd.f32 d6, d0, d1
|
||||
vld1.32 {q0}, [r1,:128]!
|
||||
vpadd.f32 d7, d4, d5
|
||||
vmul.f32 q0, q0, q0
|
||||
vld1.32 {q2}, [r1,:128]!
|
||||
vadd.f32 q3, q1, q3
|
||||
vld1.32 {q1}, [r0,:128]!
|
||||
vmul.f32 q2, q2, q2
|
||||
vst1.32 {q3}, [r3,:128]!
|
||||
subs r2, r2, #4
|
||||
bgt 1b
|
||||
vpadd.f32 d6, d0, d1
|
||||
vpadd.f32 d7, d4, d5
|
||||
vadd.f32 q1, q1, q3
|
||||
vst1.32 {q1}, [r3,:128]!
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_ps_mul_pair_single_neon, export=1
|
||||
sub r3, r3, #4
|
||||
tst r1, #8
|
||||
bne 2f
|
||||
vld1.32 {q0}, [r1,:128]!
|
||||
1:
|
||||
vld1.32 {q3}, [r2,:128]!
|
||||
vmul.f32 d4, d0, d6[0]
|
||||
vmul.f32 d5, d1, d6[1]
|
||||
vld1.32 {q1}, [r1,:128]!
|
||||
vmul.f32 d6, d2, d7[0]
|
||||
vmul.f32 d7, d3, d7[1]
|
||||
vld1.32 {q0}, [r1,:128]!
|
||||
vst1.32 {q2,q3}, [r0,:128]!
|
||||
subs r3, r3, #4
|
||||
bgt 1b
|
||||
vld1.32 {q3}, [r2,:128]!
|
||||
vmul.f32 d4, d0, d6[0]
|
||||
vmul.f32 d5, d1, d6[1]
|
||||
vld1.32 {q1}, [r1,:128]!
|
||||
vmul.f32 d6, d2, d7[0]
|
||||
vmul.f32 d7, d3, d7[1]
|
||||
vst1.32 {q2,q3}, [r0,:128]!
|
||||
bx lr
|
||||
2:
|
||||
vld1.32 {d0}, [r1,:64]!
|
||||
vld1.32 {d1,d2}, [r1,:128]!
|
||||
1:
|
||||
vld1.32 {q3}, [r2,:128]!
|
||||
vmul.f32 d4, d0, d6[0]
|
||||
vmul.f32 d5, d1, d6[1]
|
||||
vld1.32 {d0,d1}, [r1,:128]!
|
||||
vmul.f32 d6, d2, d7[0]
|
||||
vmul.f32 d7, d0, d7[1]
|
||||
vmov d0, d1
|
||||
vld1.32 {d1,d2}, [r1,:128]!
|
||||
vst1.32 {q2,q3}, [r0,:128]!
|
||||
subs r3, r3, #4
|
||||
bgt 1b
|
||||
vld1.32 {q3}, [r2,:128]!
|
||||
vmul.f32 d4, d0, d6[0]
|
||||
vmul.f32 d5, d1, d6[1]
|
||||
vld1.32 {d0}, [r1,:64]!
|
||||
vmul.f32 d6, d2, d7[0]
|
||||
vmul.f32 d7, d0, d7[1]
|
||||
vst1.32 {q2,q3}, [r0,:128]!
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_ps_hybrid_synthesis_deint_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
add r0, r0, r2, lsl #2
|
||||
add r1, r1, r2, lsl #5+1+2
|
||||
rsb r2, r2, #64
|
||||
mov r5, #64*4
|
||||
mov lr, r0
|
||||
add r4, r0, #38*64*4
|
||||
mov r12, r3
|
||||
2:
|
||||
vld1.32 {d0,d1}, [r1,:128]!
|
||||
vst1.32 {d0[0]}, [lr,:32], r5
|
||||
vst1.32 {d0[1]}, [r4,:32], r5
|
||||
vst1.32 {d1[0]}, [lr,:32], r5
|
||||
vst1.32 {d1[1]}, [r4,:32], r5
|
||||
subs r12, r12, #2
|
||||
bgt 2b
|
||||
add r0, r0, #4
|
||||
sub r2, r2, #1
|
||||
tst r2, #2
|
||||
bne 6f
|
||||
1:
|
||||
mov lr, r0
|
||||
add r4, r0, #38*64*4
|
||||
add r6, r1, # 32*2*4
|
||||
add r7, r1, #2*32*2*4
|
||||
add r8, r1, #3*32*2*4
|
||||
mov r12, r3
|
||||
2:
|
||||
vld1.32 {d0,d1}, [r1,:128]!
|
||||
vld1.32 {d2,d3}, [r6,:128]!
|
||||
vld1.32 {d4,d5}, [r7,:128]!
|
||||
vld1.32 {d6,d7}, [r8,:128]!
|
||||
vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
|
||||
vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
|
||||
vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
|
||||
vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
|
||||
subs r12, r12, #2
|
||||
bgt 2b
|
||||
add r0, r0, #16
|
||||
add r1, r1, #3*32*2*4
|
||||
subs r2, r2, #4
|
||||
bgt 1b
|
||||
pop {r4-r8,pc}
|
||||
6:
|
||||
mov lr, r0
|
||||
add r4, r0, #38*64*4
|
||||
add r6, r1, #32*2*4
|
||||
mov r12, r3
|
||||
2:
|
||||
vld1.32 {d0,d1}, [r1,:128]!
|
||||
vld1.32 {d2,d3}, [r6,:128]!
|
||||
vst2.32 {d0[0],d2[0]}, [lr,:64], r5
|
||||
vst2.32 {d0[1],d2[1]}, [r4,:64], r5
|
||||
vst2.32 {d1[0],d3[0]}, [lr,:64], r5
|
||||
vst2.32 {d1[1],d3[1]}, [r4,:64], r5
|
||||
subs r12, r12, #2
|
||||
bgt 2b
|
||||
add r0, r0, #8
|
||||
add r1, r1, #32*2*4
|
||||
sub r2, r2, #2
|
||||
b 1b
|
||||
endfunc
|
||||
|
||||
function ff_ps_hybrid_analysis_neon, export=1
|
||||
vldm r1, {d19-d31}
|
||||
ldr r12, [sp]
|
||||
lsl r3, r3, #3
|
||||
vadd.f32 d16, d19, d31
|
||||
vadd.f32 d17, d20, d30
|
||||
vsub.f32 d18, d19, d31
|
||||
vsub.f32 d19, d20, d30
|
||||
vsub.f32 d0, d21, d29
|
||||
vsub.f32 d1, d22, d28
|
||||
vadd.f32 d2, d21, d29
|
||||
vadd.f32 d3, d22, d28
|
||||
vadd.f32 d20, d23, d27
|
||||
vadd.f32 d21, d24, d26
|
||||
vsub.f32 d22, d23, d27
|
||||
vsub.f32 d23, d24, d26
|
||||
vmov.i32 d6, #1<<31
|
||||
vmov.i32 d7, #0
|
||||
vmov.f32 q14, #0.0
|
||||
vmov.f32 q15, #0.0
|
||||
vtrn.32 d6, d7
|
||||
vrev64.32 q9, q9
|
||||
vrev64.32 q0, q0
|
||||
vrev64.32 q11, q11
|
||||
veor q9, q9, q3
|
||||
veor q0, q0, q3
|
||||
veor q11, q11, q3
|
||||
vld1.32 {q13}, [r2,:128]!
|
||||
vtrn.32 q8, q9
|
||||
vtrn.32 q1, q0
|
||||
vtrn.32 q10, q11
|
||||
sub r12, r12, #1
|
||||
vmla.f32 q14, q8, q13
|
||||
vld1.32 {q2}, [r2,:128]!
|
||||
vmla.f32 q15, q9, q13
|
||||
1:
|
||||
vmla.f32 q14, q1, q2
|
||||
vld1.32 {q13}, [r2,:128]!
|
||||
vmla.f32 q15, q0, q2
|
||||
vmla.f32 q14, q10, q13
|
||||
vld1.32 {q2}, [r2,:128]!
|
||||
vmla.f32 q15, q11, q13
|
||||
vld1.32 {q13}, [r2,:128]!
|
||||
vadd.f32 d6, d28, d29
|
||||
vadd.f32 d7, d30, d31
|
||||
vmov.f32 q14, #0.0
|
||||
vmov.f32 q15, #0.0
|
||||
vmla.f32 q14, q8, q13
|
||||
vpadd.f32 d6, d6, d7
|
||||
vmla.f32 q15, q9, q13
|
||||
vmla.f32 d6, d25, d4[0]
|
||||
vld1.32 {q2}, [r2,:128]!
|
||||
vst1.32 {d6}, [r0,:64], r3
|
||||
subs r12, r12, #1
|
||||
bgt 1b
|
||||
vmla.f32 q14, q1, q2
|
||||
vld1.32 {q13}, [r2,:128]!
|
||||
vmla.f32 q15, q0, q2
|
||||
vmla.f32 q14, q10, q13
|
||||
vld1.32 {q2}, [r2,:128]!
|
||||
vmla.f32 q15, q11, q13
|
||||
vadd.f32 d6, d28, d29
|
||||
vadd.f32 d7, d30, d31
|
||||
vpadd.f32 d6, d6, d7
|
||||
vmla.f32 d6, d25, d4[0]
|
||||
vst1.32 {d6}, [r0,:64], r3
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_ps_stereo_interpolate_neon, export=1
|
||||
vld1.32 {q0}, [r2]
|
||||
vld1.32 {q14}, [r3]
|
||||
mov r2, r0
|
||||
mov r3, r1
|
||||
ldr r12, [sp]
|
||||
vadd.f32 q1, q0, q14
|
||||
vadd.f32 q0, q1, q14
|
||||
vld1.32 {q2}, [r0,:64]!
|
||||
vld1.32 {q3}, [r1,:64]!
|
||||
subs r12, r12, #1
|
||||
beq 2f
|
||||
1:
|
||||
vmul.f32 d16, d4, d2[0]
|
||||
vmul.f32 d17, d5, d0[0]
|
||||
vmul.f32 d18, d4, d2[1]
|
||||
vmul.f32 d19, d5, d0[1]
|
||||
vmla.f32 d16, d6, d3[0]
|
||||
vmla.f32 d17, d7, d1[0]
|
||||
vmla.f32 d18, d6, d3[1]
|
||||
vmla.f32 d19, d7, d1[1]
|
||||
vadd.f32 q1, q1, q14
|
||||
vadd.f32 q0, q0, q14
|
||||
vadd.f32 q1, q1, q14
|
||||
vadd.f32 q0, q0, q14
|
||||
vld1.32 {q2}, [r0,:64]!
|
||||
vld1.32 {q3}, [r1,:64]!
|
||||
vst1.32 {q8}, [r2,:64]!
|
||||
vst1.32 {q9}, [r3,:64]!
|
||||
subs r12, r12, #2
|
||||
bgt 1b
|
||||
it lt
|
||||
bxlt lr
|
||||
2:
|
||||
vmul.f32 d16, d4, d2[0]
|
||||
vmul.f32 d18, d4, d2[1]
|
||||
vmla.f32 d16, d6, d3[0]
|
||||
vmla.f32 d18, d6, d3[1]
|
||||
vst1.32 {d16}, [r2,:64]!
|
||||
vst1.32 {d18}, [r3,:64]!
|
||||
bx lr
|
||||
endfunc
|
||||
36
externals/ffmpeg/libavcodec/arm/ac3dsp_arm.S
vendored
Executable file
36
externals/ffmpeg/libavcodec/arm/ac3dsp_arm.S
vendored
Executable file
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_ac3_update_bap_counts_arm, export=1
|
||||
push {lr}
|
||||
ldrb lr, [r1], #1
|
||||
1:
|
||||
lsl r3, lr, #1
|
||||
ldrh r12, [r0, r3]
|
||||
subs r2, r2, #1
|
||||
it gt
|
||||
ldrbgt lr, [r1], #1
|
||||
add r12, r12, #1
|
||||
strh r12, [r0, r3]
|
||||
bgt 1b
|
||||
pop {pc}
|
||||
endfunc
|
||||
84
externals/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
vendored
Executable file
84
externals/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
vendored
Executable file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_ac3_bit_alloc_calc_bap_armv6, export=1
|
||||
ldr r12, [sp]
|
||||
cmp r12, #-960
|
||||
beq 4f
|
||||
push {r4-r11,lr}
|
||||
add r5, sp, #40
|
||||
movrelx r4, X(ff_ac3_bin_to_band_tab), r11
|
||||
movrelx lr, X(ff_ac3_band_start_tab)
|
||||
ldm r5, {r5-r7}
|
||||
ldrb r4, [r4, r2]
|
||||
add r1, r1, r2, lsl #1 @ psd + start
|
||||
add r0, r0, r4, lsl #1 @ mask + band
|
||||
add r4, r4, lr
|
||||
add r7, r7, r2 @ bap + start
|
||||
1:
|
||||
ldrsh r9, [r0], #2 @ mask[band]
|
||||
mov r8, #0xff0
|
||||
sub r9, r9, r12 @ - snr_offset
|
||||
ldrb r10, [r4, #1]! @ band_start_tab[++band]
|
||||
subs r9, r9, r5 @ - floor
|
||||
it lt
|
||||
movlt r9, #0
|
||||
cmp r10, r3 @ - end
|
||||
and r9, r9, r8, lsl #1 @ & 0x1fe0
|
||||
ite gt
|
||||
subgt r8, r3, r2
|
||||
suble r8, r10, r2
|
||||
mov r2, r10
|
||||
add r9, r9, r5 @ + floor => m
|
||||
tst r8, #1
|
||||
add r11, r7, r8
|
||||
bne 3f
|
||||
b 5f
|
||||
2:
|
||||
ldrsh r8, [r1], #2
|
||||
ldrsh lr, [r1], #2
|
||||
sub r8, r8, r9
|
||||
sub lr, lr, r9
|
||||
usat r8, #6, r8, asr #5 @ address
|
||||
usat lr, #6, lr, asr #5
|
||||
ldrb r8, [r6, r8] @ bap_tab[address]
|
||||
ldrb lr, [r6, lr]
|
||||
strb r8, [r7], #1 @ bap[bin]
|
||||
strb lr, [r7], #1
|
||||
5: cmp r7, r11
|
||||
blo 2b
|
||||
cmp r3, r10
|
||||
bgt 1b
|
||||
pop {r4-r11,pc}
|
||||
3:
|
||||
ldrsh r8, [r1], #2 @ psd[bin]
|
||||
sub r8, r8, r9 @ - m
|
||||
usat r8, #6, r8, asr #5 @ address
|
||||
ldrb r8, [r6, r8] @ bap_tab[address]
|
||||
strb r8, [r7], #1 @ bap[bin]
|
||||
b 5b
|
||||
4:
|
||||
ldr r0, [sp, #12]
|
||||
mov r1, #0
|
||||
mov r2, #256
|
||||
b X(memset)
|
||||
endfunc
|
||||
73
externals/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
vendored
Executable file
73
externals/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/ac3dsp.h"
|
||||
#include "config.h"
|
||||
|
||||
void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
|
||||
int ff_ac3_max_msb_abs_int16_neon(const int16_t *src, int len);
|
||||
void ff_ac3_lshift_int16_neon(int16_t *src, unsigned len, unsigned shift);
|
||||
void ff_ac3_rshift_int32_neon(int32_t *src, unsigned len, unsigned shift);
|
||||
void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
|
||||
void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
|
||||
void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
|
||||
const int16_t *window, unsigned n);
|
||||
void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
|
||||
const int32_t *coef0,
|
||||
const int32_t *coef1,
|
||||
int len);
|
||||
void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
|
||||
const float *coef0,
|
||||
const float *coef1,
|
||||
int len);
|
||||
|
||||
void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
|
||||
int start, int end,
|
||||
int snr_offset, int floor,
|
||||
const uint8_t *bap_tab, uint8_t *bap);
|
||||
|
||||
void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len);
|
||||
|
||||
av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
c->update_bap_counts = ff_ac3_update_bap_counts_arm;
|
||||
|
||||
if (have_armv6(cpu_flags)) {
|
||||
c->bit_alloc_calc_bap = ff_ac3_bit_alloc_calc_bap_armv6;
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
c->ac3_exponent_min = ff_ac3_exponent_min_neon;
|
||||
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_neon;
|
||||
c->ac3_lshift_int16 = ff_ac3_lshift_int16_neon;
|
||||
c->ac3_rshift_int32 = ff_ac3_rshift_int32_neon;
|
||||
c->float_to_fixed24 = ff_float_to_fixed24_neon;
|
||||
c->extract_exponents = ff_ac3_extract_exponents_neon;
|
||||
c->apply_window_int16 = ff_apply_window_int16_neon;
|
||||
c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
|
||||
c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
|
||||
}
|
||||
}
|
||||
177
externals/ffmpeg/libavcodec/arm/ac3dsp_neon.S
vendored
Executable file
177
externals/ffmpeg/libavcodec/arm/ac3dsp_neon.S
vendored
Executable file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_ac3_max_msb_abs_int16_neon, export=1
|
||||
vmov.i16 q0, #0
|
||||
vmov.i16 q2, #0
|
||||
1: vld1.16 {q1}, [r0,:128]!
|
||||
vabs.s16 q1, q1
|
||||
vld1.16 {q3}, [r0,:128]!
|
||||
vabs.s16 q3, q3
|
||||
vorr q0, q0, q1
|
||||
vorr q2, q2, q3
|
||||
subs r1, r1, #16
|
||||
bgt 1b
|
||||
vorr q0, q0, q2
|
||||
vorr d0, d0, d1
|
||||
vpmax.u16 d0, d0, d0
|
||||
vpmax.u16 d0, d0, d0
|
||||
vmov.u16 r0, d0[0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_ac3_exponent_min_neon, export=1
|
||||
cmp r1, #0
|
||||
it eq
|
||||
bxeq lr
|
||||
push {lr}
|
||||
mov r12, #256
|
||||
1:
|
||||
vld1.8 {q0}, [r0,:128]
|
||||
mov lr, r1
|
||||
add r3, r0, #256
|
||||
2: vld1.8 {q1}, [r3,:128], r12
|
||||
subs lr, lr, #1
|
||||
vmin.u8 q0, q0, q1
|
||||
bgt 2b
|
||||
subs r2, r2, #16
|
||||
vst1.8 {q0}, [r0,:128]!
|
||||
bgt 1b
|
||||
pop {pc}
|
||||
endfunc
|
||||
|
||||
function ff_ac3_lshift_int16_neon, export=1
|
||||
vdup.16 q0, r2
|
||||
1: vld1.16 {q1}, [r0,:128]
|
||||
vshl.s16 q1, q1, q0
|
||||
vst1.16 {q1}, [r0,:128]!
|
||||
subs r1, r1, #8
|
||||
bgt 1b
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_ac3_rshift_int32_neon, export=1
|
||||
rsb r2, r2, #0
|
||||
vdup.32 q0, r2
|
||||
1: vld1.32 {q1}, [r0,:128]
|
||||
vshl.s32 q1, q1, q0
|
||||
vst1.32 {q1}, [r0,:128]!
|
||||
subs r1, r1, #4
|
||||
bgt 1b
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_float_to_fixed24_neon, export=1
|
||||
1: vld1.32 {q0-q1}, [r1,:128]!
|
||||
vcvt.s32.f32 q0, q0, #24
|
||||
vld1.32 {q2-q3}, [r1,:128]!
|
||||
vcvt.s32.f32 q1, q1, #24
|
||||
vcvt.s32.f32 q2, q2, #24
|
||||
vst1.32 {q0-q1}, [r0,:128]!
|
||||
vcvt.s32.f32 q3, q3, #24
|
||||
vst1.32 {q2-q3}, [r0,:128]!
|
||||
subs r2, r2, #16
|
||||
bgt 1b
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_ac3_extract_exponents_neon, export=1
|
||||
vmov.i32 q15, #8
|
||||
1:
|
||||
vld1.32 {q0}, [r1,:128]!
|
||||
vabs.s32 q1, q0
|
||||
vclz.i32 q3, q1
|
||||
vsub.i32 q3, q3, q15
|
||||
vmovn.i32 d6, q3
|
||||
vmovn.i16 d6, q3
|
||||
vst1.32 {d6[0]}, [r0,:32]!
|
||||
subs r2, r2, #4
|
||||
bgt 1b
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_apply_window_int16_neon, export=1
|
||||
push {r4,lr}
|
||||
add r4, r1, r3, lsl #1
|
||||
add lr, r0, r3, lsl #1
|
||||
sub r4, r4, #16
|
||||
sub lr, lr, #16
|
||||
mov r12, #-16
|
||||
1:
|
||||
vld1.16 {q0}, [r1,:128]!
|
||||
vld1.16 {q2}, [r2,:128]!
|
||||
vld1.16 {q1}, [r4,:128], r12
|
||||
vrev64.16 q3, q2
|
||||
vqrdmulh.s16 q0, q0, q2
|
||||
vqrdmulh.s16 d2, d2, d7
|
||||
vqrdmulh.s16 d3, d3, d6
|
||||
vst1.16 {q0}, [r0,:128]!
|
||||
vst1.16 {q1}, [lr,:128], r12
|
||||
subs r3, r3, #16
|
||||
bgt 1b
|
||||
|
||||
pop {r4,pc}
|
||||
endfunc
|
||||
|
||||
function ff_ac3_sum_square_butterfly_int32_neon, export=1
|
||||
vmov.i64 q0, #0
|
||||
vmov.i64 q1, #0
|
||||
vmov.i64 q2, #0
|
||||
vmov.i64 q3, #0
|
||||
1:
|
||||
vld1.32 {d16}, [r1]!
|
||||
vld1.32 {d17}, [r2]!
|
||||
vadd.s32 d18, d16, d17
|
||||
vsub.s32 d19, d16, d17
|
||||
vmlal.s32 q0, d16, d16
|
||||
vmlal.s32 q1, d17, d17
|
||||
vmlal.s32 q2, d18, d18
|
||||
vmlal.s32 q3, d19, d19
|
||||
subs r3, r3, #2
|
||||
bgt 1b
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.s64 d1, d2, d3
|
||||
vadd.s64 d2, d4, d5
|
||||
vadd.s64 d3, d6, d7
|
||||
vst1.64 {q0-q1}, [r0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_ac3_sum_square_butterfly_float_neon, export=1
|
||||
vmov.f32 q0, #0.0
|
||||
vmov.f32 q1, #0.0
|
||||
1:
|
||||
vld1.32 {d16}, [r1]!
|
||||
vld1.32 {d17}, [r2]!
|
||||
vadd.f32 d18, d16, d17
|
||||
vsub.f32 d19, d16, d17
|
||||
vmla.f32 d0, d16, d16
|
||||
vmla.f32 d1, d17, d17
|
||||
vmla.f32 d2, d18, d18
|
||||
vmla.f32 d3, d19, d19
|
||||
subs r3, r3, #2
|
||||
bgt 1b
|
||||
vpadd.f32 d0, d0, d1
|
||||
vpadd.f32 d1, d2, d3
|
||||
vst1.32 {q0}, [r0]
|
||||
bx lr
|
||||
endfunc
|
||||
32
externals/ffmpeg/libavcodec/arm/asm-offsets.h
vendored
Executable file
32
externals/ffmpeg/libavcodec/arm/asm-offsets.h
vendored
Executable file
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_ASM_OFFSETS_H
|
||||
#define AVCODEC_ARM_ASM_OFFSETS_H
|
||||
|
||||
/* MpegEncContext */
|
||||
#define Y_DC_SCALE 0x04
|
||||
#define C_DC_SCALE 0x08
|
||||
#define AC_PRED 0x0c
|
||||
#define BLOCK_LAST_INDEX 0x10
|
||||
#define H263_AIC 0x40
|
||||
#define INTER_SCANTAB_RASTER_END 0x88
|
||||
|
||||
#endif /* AVCODEC_ARM_ASM_OFFSETS_H */
|
||||
26
externals/ffmpeg/libavcodec/arm/audiodsp_arm.h
vendored
Executable file
26
externals/ffmpeg/libavcodec/arm/audiodsp_arm.h
vendored
Executable file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_AUDIODSP_ARM_H
|
||||
#define AVCODEC_ARM_AUDIODSP_ARM_H
|
||||
|
||||
#include "libavcodec/audiodsp.h"
|
||||
|
||||
void ff_audiodsp_init_neon(AudioDSPContext *c);
|
||||
|
||||
#endif /* AVCODEC_ARM_AUDIODSP_ARM_H */
|
||||
33
externals/ffmpeg/libavcodec/arm/audiodsp_init_arm.c
vendored
Executable file
33
externals/ffmpeg/libavcodec/arm/audiodsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* ARM optimized audio functions
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/audiodsp.h"
|
||||
#include "audiodsp_arm.h"
|
||||
|
||||
av_cold void ff_audiodsp_init_arm(AudioDSPContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags))
|
||||
ff_audiodsp_init_neon(c);
|
||||
}
|
||||
40
externals/ffmpeg/libavcodec/arm/audiodsp_init_neon.c
vendored
Executable file
40
externals/ffmpeg/libavcodec/arm/audiodsp_init_neon.c
vendored
Executable file
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* ARM NEON optimised audio functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/audiodsp.h"
|
||||
#include "audiodsp_arm.h"
|
||||
|
||||
void ff_vector_clipf_neon(float *dst, const float *src, int len, float min, float max);
|
||||
void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
|
||||
int32_t max, unsigned int len);
|
||||
|
||||
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
|
||||
|
||||
av_cold void ff_audiodsp_init_neon(AudioDSPContext *c)
|
||||
{
|
||||
c->vector_clip_int32 = ff_vector_clip_int32_neon;
|
||||
c->vector_clipf = ff_vector_clipf_neon;
|
||||
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
|
||||
}
|
||||
63
externals/ffmpeg/libavcodec/arm/audiodsp_neon.S
vendored
Executable file
63
externals/ffmpeg/libavcodec/arm/audiodsp_neon.S
vendored
Executable file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* ARM NEON optimised audio functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_vector_clipf_neon, export=1
|
||||
VFP vdup.32 q1, d0[1]
|
||||
VFP vdup.32 q0, d0[0]
|
||||
NOVFP vdup.32 q0, r3
|
||||
NOVFP vld1.32 {d2[],d3[]}, [sp]
|
||||
vld1.f32 {q2},[r1,:128]!
|
||||
vmin.f32 q10, q2, q1
|
||||
vld1.f32 {q3},[r1,:128]!
|
||||
vmin.f32 q11, q3, q1
|
||||
1: vmax.f32 q8, q10, q0
|
||||
vmax.f32 q9, q11, q0
|
||||
subs r2, r2, #8
|
||||
beq 2f
|
||||
vld1.f32 {q2},[r1,:128]!
|
||||
vmin.f32 q10, q2, q1
|
||||
vld1.f32 {q3},[r1,:128]!
|
||||
vmin.f32 q11, q3, q1
|
||||
vst1.f32 {q8},[r0,:128]!
|
||||
vst1.f32 {q9},[r0,:128]!
|
||||
b 1b
|
||||
2: vst1.f32 {q8},[r0,:128]!
|
||||
vst1.f32 {q9},[r0,:128]!
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_vector_clip_int32_neon, export=1
|
||||
vdup.32 q0, r2
|
||||
vdup.32 q1, r3
|
||||
ldr r2, [sp]
|
||||
1:
|
||||
vld1.32 {q2-q3}, [r1,:128]!
|
||||
vmin.s32 q2, q2, q1
|
||||
vmin.s32 q3, q3, q1
|
||||
vmax.s32 q2, q2, q0
|
||||
vmax.s32 q3, q3, q0
|
||||
vst1.32 {q2-q3}, [r0,:128]!
|
||||
subs r2, r2, #8
|
||||
bgt 1b
|
||||
bx lr
|
||||
endfunc
|
||||
26
externals/ffmpeg/libavcodec/arm/blockdsp_arm.h
vendored
Executable file
26
externals/ffmpeg/libavcodec/arm/blockdsp_arm.h
vendored
Executable file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_BLOCKDSP_ARM_H
|
||||
#define AVCODEC_ARM_BLOCKDSP_ARM_H
|
||||
|
||||
#include "libavcodec/blockdsp.h"
|
||||
|
||||
void ff_blockdsp_init_neon(BlockDSPContext *c);
|
||||
|
||||
#endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */
|
||||
33
externals/ffmpeg/libavcodec/arm/blockdsp_init_arm.c
vendored
Executable file
33
externals/ffmpeg/libavcodec/arm/blockdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* ARM optimized block operations
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/blockdsp.h"
|
||||
#include "blockdsp_arm.h"
|
||||
|
||||
av_cold void ff_blockdsp_init_arm(BlockDSPContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags))
|
||||
ff_blockdsp_init_neon(c);
|
||||
}
|
||||
35
externals/ffmpeg/libavcodec/arm/blockdsp_init_neon.c
vendored
Executable file
35
externals/ffmpeg/libavcodec/arm/blockdsp_init_neon.c
vendored
Executable file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* ARM NEON optimised block operations
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/blockdsp.h"
|
||||
#include "blockdsp_arm.h"
|
||||
|
||||
void ff_clear_block_neon(int16_t *block);
|
||||
void ff_clear_blocks_neon(int16_t *blocks);
|
||||
|
||||
av_cold void ff_blockdsp_init_neon(BlockDSPContext *c)
|
||||
{
|
||||
c->clear_block = ff_clear_block_neon;
|
||||
c->clear_blocks = ff_clear_blocks_neon;
|
||||
}
|
||||
38
externals/ffmpeg/libavcodec/arm/blockdsp_neon.S
vendored
Executable file
38
externals/ffmpeg/libavcodec/arm/blockdsp_neon.S
vendored
Executable file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* ARM NEON optimised block functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_clear_block_neon, export=1
|
||||
vmov.i16 q0, #0
|
||||
.rept 8
|
||||
vst1.16 {q0}, [r0,:128]!
|
||||
.endr
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_clear_blocks_neon, export=1
|
||||
vmov.i16 q0, #0
|
||||
.rept 8*6
|
||||
vst1.16 {q0}, [r0,:128]!
|
||||
.endr
|
||||
bx lr
|
||||
endfunc
|
||||
108
externals/ffmpeg/libavcodec/arm/cabac.h
vendored
Executable file
108
externals/ffmpeg/libavcodec/arm/cabac.h
vendored
Executable file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_CABAC_H
|
||||
#define AVCODEC_ARM_CABAC_H
|
||||
|
||||
#include "config.h"
|
||||
#if HAVE_ARMV6T2_INLINE
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/internal.h"
|
||||
#include "libavcodec/cabac.h"
|
||||
|
||||
#define get_cabac_inline get_cabac_inline_arm
|
||||
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
|
||||
uint8_t *const state)
|
||||
{
|
||||
int bit;
|
||||
void *reg_b, *reg_c, *tmp;
|
||||
|
||||
__asm__ volatile(
|
||||
"ldrb %[bit] , [%[state]] \n\t"
|
||||
"add %[r_b] , %[tables] , %[lps_off] \n\t"
|
||||
"mov %[tmp] , %[range] \n\t"
|
||||
"and %[range] , %[range] , #0xC0 \n\t"
|
||||
"add %[r_b] , %[r_b] , %[bit] \n\t"
|
||||
"ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t"
|
||||
"add %[r_b] , %[tables] , %[norm_off] \n\t"
|
||||
"sub %[r_c] , %[tmp] , %[range] \n\t"
|
||||
"lsl %[tmp] , %[r_c] , #17 \n\t"
|
||||
"cmp %[tmp] , %[low] \n\t"
|
||||
"it gt \n\t"
|
||||
"movgt %[range] , %[r_c] \n\t"
|
||||
"itt cc \n\t"
|
||||
"mvncc %[bit] , %[bit] \n\t"
|
||||
"subcc %[low] , %[low] , %[tmp] \n\t"
|
||||
"add %[r_c] , %[tables] , %[mlps_off] \n\t"
|
||||
"ldrb %[tmp] , [%[r_b], %[range]] \n\t"
|
||||
"ldrb %[r_b] , [%[r_c], %[bit]] \n\t"
|
||||
"lsl %[low] , %[low] , %[tmp] \n\t"
|
||||
"lsl %[range] , %[range] , %[tmp] \n\t"
|
||||
"uxth %[r_c] , %[low] \n\t"
|
||||
"strb %[r_b] , [%[state]] \n\t"
|
||||
"tst %[r_c] , %[r_c] \n\t"
|
||||
"bne 2f \n\t"
|
||||
"ldr %[r_c] , [%[c], %[byte]] \n\t"
|
||||
#if UNCHECKED_BITSTREAM_READER
|
||||
"ldrh %[tmp] , [%[r_c]] \n\t"
|
||||
"add %[r_c] , %[r_c] , #2 \n\t"
|
||||
"str %[r_c] , [%[c], %[byte]] \n\t"
|
||||
#else
|
||||
"ldr %[r_b] , [%[c], %[end]] \n\t"
|
||||
"ldrh %[tmp] , [%[r_c]] \n\t"
|
||||
"cmp %[r_c] , %[r_b] \n\t"
|
||||
"itt lt \n\t"
|
||||
"addlt %[r_c] , %[r_c] , #2 \n\t"
|
||||
"strlt %[r_c] , [%[c], %[byte]] \n\t"
|
||||
#endif
|
||||
"sub %[r_c] , %[low] , #1 \n\t"
|
||||
"add %[r_b] , %[tables] , %[norm_off] \n\t"
|
||||
"eor %[r_c] , %[low] , %[r_c] \n\t"
|
||||
"rev %[tmp] , %[tmp] \n\t"
|
||||
"lsr %[r_c] , %[r_c] , #15 \n\t"
|
||||
"lsr %[tmp] , %[tmp] , #15 \n\t"
|
||||
"ldrb %[r_c] , [%[r_b], %[r_c]] \n\t"
|
||||
"movw %[r_b] , #0xFFFF \n\t"
|
||||
"sub %[tmp] , %[tmp] , %[r_b] \n\t"
|
||||
"rsb %[r_c] , %[r_c] , #7 \n\t"
|
||||
"lsl %[tmp] , %[tmp] , %[r_c] \n\t"
|
||||
"add %[low] , %[low] , %[tmp] \n\t"
|
||||
"2: \n\t"
|
||||
: [bit]"=&r"(bit),
|
||||
[low]"+&r"(c->low),
|
||||
[range]"+&r"(c->range),
|
||||
[r_b]"=&r"(reg_b),
|
||||
[r_c]"=&r"(reg_c),
|
||||
[tmp]"=&r"(tmp)
|
||||
: [c]"r"(c),
|
||||
[state]"r"(state),
|
||||
[tables]"r"(ff_h264_cabac_tables),
|
||||
[byte]"M"(offsetof(CABACContext, bytestream)),
|
||||
[end]"M"(offsetof(CABACContext, bytestream_end)),
|
||||
[norm_off]"I"(H264_NORM_SHIFT_OFFSET),
|
||||
[lps_off]"I"(H264_LPS_RANGE_OFFSET),
|
||||
[mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
|
||||
: "memory", "cc"
|
||||
);
|
||||
|
||||
return bit & 1;
|
||||
}
|
||||
#endif /* HAVE_ARMV6T2_INLINE */
|
||||
|
||||
#endif /* AVCODEC_ARM_CABAC_H */
|
||||
81
externals/ffmpeg/libavcodec/arm/dca.h
vendored
Executable file
81
externals/ffmpeg/libavcodec/arm/dca.h
vendored
Executable file
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_DCA_H
|
||||
#define AVCODEC_ARM_DCA_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "config.h"
|
||||
#include "libavcodec/mathops.h"
|
||||
|
||||
#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
|
||||
|
||||
#define decode_blockcodes decode_blockcodes
|
||||
static inline int decode_blockcodes(int code1, int code2, int levels,
|
||||
int32_t *values)
|
||||
{
|
||||
int32_t v0, v1, v2, v3, v4, v5;
|
||||
|
||||
__asm__ ("smmul %0, %6, %10 \n"
|
||||
"smmul %3, %7, %10 \n"
|
||||
"smlabb %6, %0, %9, %6 \n"
|
||||
"smlabb %7, %3, %9, %7 \n"
|
||||
"smmul %1, %0, %10 \n"
|
||||
"smmul %4, %3, %10 \n"
|
||||
"sub %6, %6, %8, lsr #1 \n"
|
||||
"sub %7, %7, %8, lsr #1 \n"
|
||||
"smlabb %0, %1, %9, %0 \n"
|
||||
"smlabb %3, %4, %9, %3 \n"
|
||||
"smmul %2, %1, %10 \n"
|
||||
"smmul %5, %4, %10 \n"
|
||||
"str %6, [%11, #0] \n"
|
||||
"str %7, [%11, #16] \n"
|
||||
"sub %0, %0, %8, lsr #1 \n"
|
||||
"sub %3, %3, %8, lsr #1 \n"
|
||||
"smlabb %1, %2, %9, %1 \n"
|
||||
"smlabb %4, %5, %9, %4 \n"
|
||||
"smmul %6, %2, %10 \n"
|
||||
"smmul %7, %5, %10 \n"
|
||||
"str %0, [%11, #4] \n"
|
||||
"str %3, [%11, #20] \n"
|
||||
"sub %1, %1, %8, lsr #1 \n"
|
||||
"sub %4, %4, %8, lsr #1 \n"
|
||||
"smlabb %2, %6, %9, %2 \n"
|
||||
"smlabb %5, %7, %9, %5 \n"
|
||||
"str %1, [%11, #8] \n"
|
||||
"str %4, [%11, #24] \n"
|
||||
"sub %2, %2, %8, lsr #1 \n"
|
||||
"sub %5, %5, %8, lsr #1 \n"
|
||||
"str %2, [%11, #12] \n"
|
||||
"str %5, [%11, #28] \n"
|
||||
: "=&r"(v0), "=&r"(v1), "=&r"(v2),
|
||||
"=&r"(v3), "=&r"(v4), "=&r"(v5),
|
||||
"+&r"(code1), "+&r"(code2)
|
||||
: "r"(levels - 1), "r"(-levels),
|
||||
"r"(ff_inverse[levels]), "r"(values)
|
||||
: "memory");
|
||||
|
||||
return code1 | code2;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* AVCODEC_ARM_DCA_H */
|
||||
50
externals/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c
vendored
Executable file
50
externals/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c
vendored
Executable file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
|
||||
#define FFT_FLOAT 0
|
||||
#include "libavcodec/fft.h"
|
||||
|
||||
void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
|
||||
void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
|
||||
void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
|
||||
|
||||
av_cold void ff_fft_fixed_init_arm(FFTContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
|
||||
#if CONFIG_FFT
|
||||
s->fft_calc = ff_fft_fixed_calc_neon;
|
||||
#endif
|
||||
|
||||
#if CONFIG_MDCT
|
||||
if (!s->inverse && s->nbits >= 3) {
|
||||
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
|
||||
s->mdct_calc = ff_mdct_fixed_calc_neon;
|
||||
s->mdct_calcw = ff_mdct_fixed_calcw_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
261
externals/ffmpeg/libavcodec/arm/fft_fixed_neon.S
vendored
Executable file
261
externals/ffmpeg/libavcodec/arm/fft_fixed_neon.S
vendored
Executable file
@@ -0,0 +1,261 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
.macro bflies d0, d1, r0, r1
|
||||
vrev64.32 \r0, \d1 @ t5, t6, t1, t2
|
||||
vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2
|
||||
vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2
|
||||
vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5
|
||||
vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1
|
||||
@ t5, t6, t4, t3
|
||||
vhsub.s16 \d1, \d0, \r0
|
||||
vhadd.s16 \d0, \d0, \r0
|
||||
.endm
|
||||
|
||||
.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1
|
||||
vrev32.16 \r0, \d3
|
||||
vmull.s16 \w0, \d3, \c0
|
||||
vmlal.s16 \w0, \r0, \c1
|
||||
vshrn.s32 \d3, \w0, #15
|
||||
bflies \q0, \q1, \w0, \w1
|
||||
.endm
|
||||
|
||||
.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \
|
||||
r0, r1, w0, w1
|
||||
vrev32.16 \r0, \d1
|
||||
vrev32.16 \r1, \d3
|
||||
vmull.s16 \w0, \d1, \c0
|
||||
vmlal.s16 \w0, \r0, \c1
|
||||
vmull.s16 \w1, \d3, \c2
|
||||
vmlal.s16 \w1, \r1, \c3
|
||||
vshrn.s32 \d1, \w0, #15
|
||||
vshrn.s32 \d3, \w1, #15
|
||||
bflies \q0, \q1, \w0, \w1
|
||||
.endm
|
||||
|
||||
.macro fft4 d0, d1, r0, r1
|
||||
vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7
|
||||
vhsub.s16 \r1, \d1, \d0
|
||||
vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5
|
||||
vmov.i64 \d1, #0xffff00000000
|
||||
vbit \r0, \r1, \d1
|
||||
vrev64.16 \r1, \r0 @ t7, t8, t4, t3
|
||||
vtrn.32 \r0, \r1 @ t3, t4, t7, t8
|
||||
vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7
|
||||
vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1
|
||||
vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3
|
||||
.endm
|
||||
|
||||
.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
|
||||
fft4 \d0, \d1, \r0, \r1
|
||||
vtrn.32 \d0, \d1 @ z0, z2, z1, z3
|
||||
vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4
|
||||
vhsub.s16 \d3, \d2, \d3 @ z5, z7
|
||||
vmov \d2, \r0
|
||||
transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
|
||||
.endm
|
||||
|
||||
function fft4_neon
|
||||
vld1.16 {d0-d1}, [r0]
|
||||
fft4 d0, d1, d2, d3
|
||||
vst1.16 {d0-d1}, [r0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft8_neon
|
||||
vld1.16 {d0-d3}, [r0,:128]
|
||||
movrel r1, coefs
|
||||
vld1.16 {d30}, [r1,:64]
|
||||
vdup.16 d31, d30[0]
|
||||
fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9
|
||||
vtrn.32 d0, d1
|
||||
vtrn.32 d2, d3
|
||||
vst1.16 {d0-d3}, [r0,:128]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft16_neon
|
||||
vld1.16 {d0-d3}, [r0,:128]!
|
||||
vld1.16 {d4-d7}, [r0,:128]
|
||||
movrel r1, coefs
|
||||
sub r0, r0, #32
|
||||
vld1.16 {d28-d31},[r1,:128]
|
||||
vdup.16 d31, d28[0]
|
||||
fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9
|
||||
vswp d5, d6
|
||||
fft4 q2, q3, q8, q9
|
||||
vswp d5, d6
|
||||
vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7
|
||||
vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15
|
||||
vswp d1, d2
|
||||
vdup.16 d31, d28[0]
|
||||
transform01 q0, q2, d5, d31, d28, d20, q8, q9
|
||||
vdup.16 d26, d29[0]
|
||||
vdup.16 d27, d30[0]
|
||||
transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \
|
||||
d20, d21, q8, q9
|
||||
vtrn.32 q0, q1
|
||||
vtrn.32 q2, q3
|
||||
vst1.16 {d0-d3}, [r0,:128]!
|
||||
vst1.16 {d4-d7}, [r0,:128]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft_pass_neon
|
||||
push {r4,lr}
|
||||
movrel lr, coefs + 24
|
||||
vld1.16 {d30}, [lr,:64]
|
||||
lsl r12, r2, #3
|
||||
vmov d31, d30
|
||||
add r3, r1, r2, lsl #2
|
||||
mov lr, #-8
|
||||
sub r3, r3, #2
|
||||
mov r4, r0
|
||||
vld1.16 {d27[]}, [r3,:16]
|
||||
sub r3, r3, #6
|
||||
vld1.16 {q0}, [r4,:128], r12
|
||||
vld1.16 {q1}, [r4,:128], r12
|
||||
vld1.16 {q2}, [r4,:128], r12
|
||||
vld1.16 {q3}, [r4,:128], r12
|
||||
vld1.16 {d28}, [r1,:64]!
|
||||
vld1.16 {d29}, [r3,:64], lr
|
||||
vswp d1, d2
|
||||
vswp d5, d6
|
||||
vtrn.32 d0, d1
|
||||
vtrn.32 d4, d5
|
||||
vdup.16 d25, d28[1]
|
||||
vmul.s16 d27, d27, d31
|
||||
transform01 q0, q2, d5, d25, d27, d20, q8, q9
|
||||
b 2f
|
||||
1:
|
||||
mov r4, r0
|
||||
vdup.16 d26, d29[0]
|
||||
vld1.16 {q0}, [r4,:128], r12
|
||||
vld1.16 {q1}, [r4,:128], r12
|
||||
vld1.16 {q2}, [r4,:128], r12
|
||||
vld1.16 {q3}, [r4,:128], r12
|
||||
vld1.16 {d28}, [r1,:64]!
|
||||
vld1.16 {d29}, [r3,:64], lr
|
||||
vswp d1, d2
|
||||
vswp d5, d6
|
||||
vtrn.32 d0, d1
|
||||
vtrn.32 d4, d5
|
||||
vdup.16 d24, d28[0]
|
||||
vdup.16 d25, d28[1]
|
||||
vdup.16 d27, d29[3]
|
||||
vmul.s16 q13, q13, q15
|
||||
transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \
|
||||
d16, d17, q9, q10
|
||||
2:
|
||||
vtrn.32 d2, d3
|
||||
vtrn.32 d6, d7
|
||||
vdup.16 d24, d28[2]
|
||||
vdup.16 d26, d29[2]
|
||||
vdup.16 d25, d28[3]
|
||||
vdup.16 d27, d29[1]
|
||||
vmul.s16 q13, q13, q15
|
||||
transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \
|
||||
d16, d17, q9, q10
|
||||
vtrn.32 d0, d1
|
||||
vtrn.32 d2, d3
|
||||
vtrn.32 d4, d5
|
||||
vtrn.32 d6, d7
|
||||
vswp d1, d2
|
||||
vswp d5, d6
|
||||
mov r4, r0
|
||||
vst1.16 {q0}, [r4,:128], r12
|
||||
vst1.16 {q1}, [r4,:128], r12
|
||||
vst1.16 {q2}, [r4,:128], r12
|
||||
vst1.16 {q3}, [r4,:128], r12
|
||||
add r0, r0, #16
|
||||
subs r2, r2, #2
|
||||
bgt 1b
|
||||
pop {r4,pc}
|
||||
endfunc
|
||||
|
||||
#define F_SQRT1_2 23170
|
||||
#define F_COS_16_1 30274
|
||||
#define F_COS_16_3 12540
|
||||
|
||||
const coefs, align=4
|
||||
.short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2
|
||||
.short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
|
||||
.short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
|
||||
.short 1, -1, -1, 1
|
||||
endconst
|
||||
|
||||
.macro def_fft n, n2, n4
|
||||
function fft\n\()_neon
|
||||
push {r4, lr}
|
||||
mov r4, r0
|
||||
bl fft\n2\()_neon
|
||||
add r0, r4, #\n4*2*4
|
||||
bl fft\n4\()_neon
|
||||
add r0, r4, #\n4*3*4
|
||||
bl fft\n4\()_neon
|
||||
mov r0, r4
|
||||
pop {r4, lr}
|
||||
movrelx r1, X(ff_cos_\n\()_fixed)
|
||||
mov r2, #\n4/2
|
||||
b fft_pass_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
||||
|
||||
function ff_fft_fixed_calc_neon, export=1
|
||||
ldr r2, [r0]
|
||||
sub r2, r2, #2
|
||||
movrel r3, fft_fixed_tab_neon
|
||||
ldr r3, [r3, r2, lsl #2]
|
||||
mov r0, r1
|
||||
bx r3
|
||||
endfunc
|
||||
|
||||
const fft_fixed_tab_neon, relocate=1
|
||||
.word fft4_neon
|
||||
.word fft8_neon
|
||||
.word fft16_neon
|
||||
.word fft32_neon
|
||||
.word fft64_neon
|
||||
.word fft128_neon
|
||||
.word fft256_neon
|
||||
.word fft512_neon
|
||||
.word fft1024_neon
|
||||
.word fft2048_neon
|
||||
.word fft4096_neon
|
||||
.word fft8192_neon
|
||||
.word fft16384_neon
|
||||
.word fft32768_neon
|
||||
.word fft65536_neon
|
||||
endconst
|
||||
61
externals/ffmpeg/libavcodec/arm/fft_init_arm.c
vendored
Executable file
61
externals/ffmpeg/libavcodec/arm/fft_init_arm.c
vendored
Executable file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
|
||||
#include "libavcodec/fft.h"
|
||||
|
||||
void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
|
||||
|
||||
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
|
||||
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
|
||||
|
||||
void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
av_cold void ff_fft_init_arm(FFTContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_vfp_vm(cpu_flags)) {
|
||||
s->fft_calc = ff_fft_calc_vfp;
|
||||
#if CONFIG_MDCT
|
||||
s->imdct_half = ff_imdct_half_vfp;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
#if CONFIG_FFT
|
||||
s->fft_permute = ff_fft_permute_neon;
|
||||
s->fft_calc = ff_fft_calc_neon;
|
||||
#endif
|
||||
#if CONFIG_MDCT
|
||||
s->imdct_calc = ff_imdct_calc_neon;
|
||||
s->imdct_half = ff_imdct_half_neon;
|
||||
s->mdct_calc = ff_mdct_calc_neon;
|
||||
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
375
externals/ffmpeg/libavcodec/arm/fft_neon.S
vendored
Executable file
375
externals/ffmpeg/libavcodec/arm/fft_neon.S
vendored
Executable file
@@ -0,0 +1,375 @@
|
||||
/*
|
||||
* ARM NEON optimised FFT
|
||||
*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2009 Naotoshi Nojiri
|
||||
*
|
||||
* This algorithm (though not any of the implementation details) is
|
||||
* based on libdjbfft by D. J. Bernstein.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
#define M_SQRT1_2 0.70710678118654752440
|
||||
|
||||
|
||||
function fft4_neon
|
||||
vld1.32 {d0-d3}, [r0,:128]
|
||||
|
||||
vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2
|
||||
vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
|
||||
vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
|
||||
vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
|
||||
vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
|
||||
vadd.f32 d1, d6, d7
|
||||
vsub.f32 d3, d6, d7
|
||||
vadd.f32 d0, d4, d5
|
||||
vsub.f32 d2, d4, d5
|
||||
|
||||
vst1.32 {d0-d3}, [r0,:128]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft8_neon
|
||||
mov r1, r0
|
||||
vld1.32 {d0-d3}, [r1,:128]!
|
||||
vld1.32 {d16-d19}, [r1,:128]
|
||||
|
||||
movw r2, #0x04f3 @ sqrt(1/2)
|
||||
movt r2, #0x3f35
|
||||
eor r3, r2, #1<<31
|
||||
vdup.32 d31, r2
|
||||
|
||||
vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
|
||||
vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
|
||||
vmov d28, r3, r2
|
||||
vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
|
||||
vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
|
||||
vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
|
||||
vrev64.32 d29, d28
|
||||
vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
|
||||
vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
|
||||
vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
|
||||
vext.32 q3, q2, q2, #1
|
||||
vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
|
||||
vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
|
||||
vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
|
||||
vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
|
||||
vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
|
||||
vadd.f32 d0, d20, d21
|
||||
vsub.f32 d2, d20, d21
|
||||
vadd.f32 d1, d22, d23
|
||||
vrev64.32 q13, q13
|
||||
vsub.f32 d3, d22, d23
|
||||
vsub.f32 d6, d6, d7
|
||||
vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
|
||||
vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
|
||||
vadd.f32 d7, d4, d5
|
||||
vsub.f32 d18, d2, d6
|
||||
vext.32 q13, q12, q12, #1
|
||||
vadd.f32 d2, d2, d6
|
||||
vsub.f32 d16, d0, d7
|
||||
vadd.f32 d5, d25, d24
|
||||
vsub.f32 d4, d26, d27
|
||||
vadd.f32 d0, d0, d7
|
||||
vsub.f32 d17, d1, d5
|
||||
vsub.f32 d19, d3, d4
|
||||
vadd.f32 d3, d3, d4
|
||||
vadd.f32 d1, d1, d5
|
||||
|
||||
vst1.32 {d16-d19}, [r1,:128]
|
||||
vst1.32 {d0-d3}, [r0,:128]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft16_neon
|
||||
movrel r1, mppm
|
||||
vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
|
||||
pld [r0, #32]
|
||||
vld1.32 {d2-d3}, [r1,:128]
|
||||
vext.32 q13, q9, q9, #1
|
||||
vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7}
|
||||
vadd.f32 d4, d16, d17
|
||||
vsub.f32 d5, d16, d17
|
||||
vadd.f32 d18, d18, d19
|
||||
vsub.f32 d19, d26, d27
|
||||
|
||||
vadd.f32 d20, d22, d23
|
||||
vsub.f32 d22, d22, d23
|
||||
vsub.f32 d23, d24, d25
|
||||
vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
|
||||
vadd.f32 d21, d24, d25
|
||||
vmul.f32 d24, d22, d2
|
||||
vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
|
||||
vmul.f32 d25, d23, d3
|
||||
vuzp.32 d16, d17 @ {r0,r1,i0,i1}
|
||||
vmul.f32 q1, q11, d2[1]
|
||||
vuzp.32 d18, d19 @ {r2,r3,i2,i3}
|
||||
vrev64.32 q12, q12
|
||||
vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
|
||||
vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
|
||||
vzip.32 q10, q11
|
||||
vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
|
||||
vadd.f32 d0, d22, d20
|
||||
vadd.f32 d1, d21, d23
|
||||
vsub.f32 d2, d21, d23
|
||||
vsub.f32 d3, d22, d20
|
||||
sub r0, r0, #96
|
||||
vext.32 q13, q13, q13, #1
|
||||
vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
|
||||
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
|
||||
vext.32 q15, q15, q15, #1
|
||||
vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
|
||||
vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
|
||||
vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
|
||||
vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
|
||||
vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
|
||||
vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
|
||||
movrelx r2, X(ff_cos_16)
|
||||
vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
|
||||
vrev64.32 d1, d1
|
||||
vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
|
||||
vrev64.32 d3, d3
|
||||
movrel r3, pmmp
|
||||
vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
|
||||
vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
|
||||
vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
|
||||
vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
|
||||
vld1.32 {d4-d5}, [r2,:64]
|
||||
vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
|
||||
vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
|
||||
vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
|
||||
vld1.32 {d6-d7}, [r3,:128]
|
||||
vrev64.32 q1, q14
|
||||
vmul.f32 q14, q14, d4[1]
|
||||
vmul.f32 q1, q1, q3
|
||||
vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
|
||||
vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
|
||||
vzip.32 q12, q14
|
||||
vadd.f32 d0, d28, d24
|
||||
vadd.f32 d1, d25, d29
|
||||
vsub.f32 d2, d25, d29
|
||||
vsub.f32 d3, d28, d24
|
||||
vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
|
||||
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
|
||||
vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
|
||||
mov r1, #32
|
||||
vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
|
||||
vrev64.32 q0, q13
|
||||
vmul.f32 q13, q13, d5[0]
|
||||
vrev64.32 q1, q15
|
||||
vmul.f32 q15, q15, d5[1]
|
||||
vst2.32 {d16-d17},[r0,:128], r1
|
||||
vmul.f32 q0, q0, q3
|
||||
vst2.32 {d20-d21},[r0,:128], r1
|
||||
vmul.f32 q1, q1, q3
|
||||
vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
|
||||
vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
|
||||
vst2.32 {d24-d25},[r0,:128], r1
|
||||
vst2.32 {d28-d29},[r0,:128]
|
||||
vzip.32 q13, q15
|
||||
sub r0, r0, #80
|
||||
vadd.f32 d0, d30, d26
|
||||
vadd.f32 d1, d27, d31
|
||||
vsub.f32 d2, d27, d31
|
||||
vsub.f32 d3, d30, d26
|
||||
vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
|
||||
vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
|
||||
vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
|
||||
vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
|
||||
vst2.32 {d18-d19},[r0,:128], r1
|
||||
vst2.32 {d22-d23},[r0,:128], r1
|
||||
vst2.32 {d26-d27},[r0,:128], r1
|
||||
vst2.32 {d30-d31},[r0,:128]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft_pass_neon
|
||||
push {r4-r6,lr}
|
||||
mov r6, r2 @ n
|
||||
lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
|
||||
lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
|
||||
lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
|
||||
add r3, r2, r4
|
||||
add r4, r4, r0 @ &z[o1]
|
||||
add r2, r2, r0 @ &z[o2]
|
||||
add r3, r3, r0 @ &z[o3]
|
||||
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
|
||||
movrel r12, pmmp
|
||||
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
|
||||
add r5, r5, r1 @ wim
|
||||
vld1.32 {d6-d7}, [r12,:128] @ pmmp
|
||||
vswp d21, d22
|
||||
vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
|
||||
sub r5, r5, #4 @ wim--
|
||||
vrev64.32 q1, q11
|
||||
vmul.f32 q11, q11, d4[1]
|
||||
vmul.f32 q1, q1, q3
|
||||
vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
|
||||
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
|
||||
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
|
||||
sub r6, r6, #1 @ n--
|
||||
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
|
||||
vzip.32 q10, q11
|
||||
vadd.f32 d0, d22, d20
|
||||
vadd.f32 d1, d21, d23
|
||||
vsub.f32 d2, d21, d23
|
||||
vsub.f32 d3, d22, d20
|
||||
vsub.f32 q10, q8, q0
|
||||
vadd.f32 q8, q8, q0
|
||||
vsub.f32 q11, q9, q1
|
||||
vadd.f32 q9, q9, q1
|
||||
vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
|
||||
vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
|
||||
vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
|
||||
vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
|
||||
sub r5, r5, #8 @ wim -= 2
|
||||
1:
|
||||
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
|
||||
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
|
||||
vswp d21, d22
|
||||
vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
|
||||
vrev64.32 q0, q10
|
||||
vmul.f32 q10, q10, d4[0]
|
||||
vrev64.32 q1, q11
|
||||
vmul.f32 q11, q11, d4[1]
|
||||
vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
|
||||
vmul.f32 q0, q0, q3
|
||||
sub r5, r5, #8 @ wim -= 2
|
||||
vmul.f32 q1, q1, q3
|
||||
vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
|
||||
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
|
||||
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
|
||||
subs r6, r6, #1 @ n--
|
||||
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
|
||||
vzip.32 q10, q11
|
||||
vadd.f32 d0, d22, d20
|
||||
vadd.f32 d1, d21, d23
|
||||
vsub.f32 d2, d21, d23
|
||||
vsub.f32 d3, d22, d20
|
||||
vsub.f32 q10, q8, q0
|
||||
vadd.f32 q8, q8, q0
|
||||
vsub.f32 q11, q9, q1
|
||||
vadd.f32 q9, q9, q1
|
||||
vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
|
||||
vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
|
||||
vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
|
||||
vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
|
||||
bne 1b
|
||||
|
||||
pop {r4-r6,pc}
|
||||
endfunc
|
||||
|
||||
.macro def_fft n, n2, n4
|
||||
.align 6
|
||||
function fft\n\()_neon
|
||||
push {r4, lr}
|
||||
mov r4, r0
|
||||
bl fft\n2\()_neon
|
||||
add r0, r4, #\n4*2*8
|
||||
bl fft\n4\()_neon
|
||||
add r0, r4, #\n4*3*8
|
||||
bl fft\n4\()_neon
|
||||
mov r0, r4
|
||||
pop {r4, lr}
|
||||
movrelx r1, X(ff_cos_\n)
|
||||
mov r2, #\n4/2
|
||||
b fft_pass_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
||||
|
||||
function ff_fft_calc_neon, export=1
|
||||
ldr r2, [r0]
|
||||
sub r2, r2, #2
|
||||
movrel r3, fft_tab_neon
|
||||
ldr r3, [r3, r2, lsl #2]
|
||||
mov r0, r1
|
||||
bx r3
|
||||
endfunc
|
||||
|
||||
function ff_fft_permute_neon, export=1
|
||||
push {r4,lr}
|
||||
mov r12, #1
|
||||
ldr r2, [r0] @ nbits
|
||||
ldr r3, [r0, #12] @ tmp_buf
|
||||
ldr r0, [r0, #8] @ revtab
|
||||
lsl r12, r12, r2
|
||||
mov r2, r12
|
||||
1:
|
||||
vld1.32 {d0-d1}, [r1,:128]!
|
||||
ldr r4, [r0], #4
|
||||
uxth lr, r4
|
||||
uxth r4, r4, ror #16
|
||||
add lr, r3, lr, lsl #3
|
||||
add r4, r3, r4, lsl #3
|
||||
vst1.32 {d0}, [lr,:64]
|
||||
vst1.32 {d1}, [r4,:64]
|
||||
subs r12, r12, #2
|
||||
bgt 1b
|
||||
|
||||
sub r1, r1, r2, lsl #3
|
||||
1:
|
||||
vld1.32 {d0-d3}, [r3,:128]!
|
||||
vst1.32 {d0-d3}, [r1,:128]!
|
||||
subs r2, r2, #4
|
||||
bgt 1b
|
||||
|
||||
pop {r4,pc}
|
||||
endfunc
|
||||
|
||||
const fft_tab_neon, relocate=1
|
||||
.word fft4_neon
|
||||
.word fft8_neon
|
||||
.word fft16_neon
|
||||
.word fft32_neon
|
||||
.word fft64_neon
|
||||
.word fft128_neon
|
||||
.word fft256_neon
|
||||
.word fft512_neon
|
||||
.word fft1024_neon
|
||||
.word fft2048_neon
|
||||
.word fft4096_neon
|
||||
.word fft8192_neon
|
||||
.word fft16384_neon
|
||||
.word fft32768_neon
|
||||
.word fft65536_neon
|
||||
endconst
|
||||
|
||||
const pmmp, align=4
|
||||
.float +1.0, -1.0, -1.0, +1.0
|
||||
endconst
|
||||
|
||||
const mppm, align=4
|
||||
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
|
||||
endconst
|
||||
530
externals/ffmpeg/libavcodec/arm/fft_vfp.S
vendored
Executable file
530
externals/ffmpeg/libavcodec/arm/fft_vfp.S
vendored
Executable file
@@ -0,0 +1,530 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
|
||||
@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and
|
||||
@ all single-precision VFP registers may be corrupted on exit. The a2
|
||||
@ register may not be clobbered in these functions, as it holds the
|
||||
@ stored original FPSCR.
|
||||
|
||||
function ff_fft_calc_vfp, export=1
|
||||
ldr ip, [a1, #0] @ nbits
|
||||
mov a1, a2
|
||||
movrel a2, (fft_tab_vfp - 8)
|
||||
ldr pc, [a2, ip, lsl #2]
|
||||
endfunc
|
||||
const fft_tab_vfp, relocate=1
|
||||
.word fft4_vfp
|
||||
.word fft8_vfp
|
||||
.word X(ff_fft16_vfp) @ this one alone is exported
|
||||
.word fft32_vfp
|
||||
.word fft64_vfp
|
||||
.word fft128_vfp
|
||||
.word fft256_vfp
|
||||
.word fft512_vfp
|
||||
.word fft1024_vfp
|
||||
.word fft2048_vfp
|
||||
.word fft4096_vfp
|
||||
.word fft8192_vfp
|
||||
.word fft16384_vfp
|
||||
.word fft32768_vfp
|
||||
.word fft65536_vfp
|
||||
endconst
|
||||
|
||||
function fft4_vfp
|
||||
vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
|
||||
vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
|
||||
vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
|
||||
vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
|
||||
@ stall
|
||||
vadd.f s12, s0, s8 @ i0
|
||||
vadd.f s13, s1, s9 @ i1
|
||||
vadd.f s14, s2, s10 @ i2
|
||||
vadd.f s15, s3, s11 @ i3
|
||||
vsub.f s8, s0, s8 @ i4
|
||||
vsub.f s9, s1, s9 @ i5
|
||||
vsub.f s10, s2, s10 @ i6
|
||||
vsub.f s11, s3, s11 @ i7
|
||||
@ stall
|
||||
@ stall
|
||||
vadd.f s0, s12, s14 @ z[0].re
|
||||
vsub.f s4, s12, s14 @ z[2].re
|
||||
vadd.f s1, s13, s15 @ z[0].im
|
||||
vsub.f s5, s13, s15 @ z[2].im
|
||||
vadd.f s7, s9, s10 @ z[3].im
|
||||
vsub.f s3, s9, s10 @ z[1].im
|
||||
vadd.f s2, s8, s11 @ z[1].re
|
||||
vsub.f s6, s8, s11 @ z[3].re
|
||||
@ stall
|
||||
@ stall
|
||||
vstr d0, [a1, #0*2*4]
|
||||
vstr d2, [a1, #2*2*4]
|
||||
@ stall
|
||||
@ stall
|
||||
vstr d1, [a1, #1*2*4]
|
||||
vstr d3, [a1, #3*2*4]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
.macro macro_fft8_head
|
||||
@ FFT4
|
||||
vldr d4, [a1, #0 * 2*4]
|
||||
vldr d6, [a1, #1 * 2*4]
|
||||
vldr d5, [a1, #2 * 2*4]
|
||||
vldr d7, [a1, #3 * 2*4]
|
||||
@ BF
|
||||
vldr d12, [a1, #4 * 2*4]
|
||||
vadd.f s16, s8, s12 @ vector op
|
||||
vldr d14, [a1, #5 * 2*4]
|
||||
vldr d13, [a1, #6 * 2*4]
|
||||
vldr d15, [a1, #7 * 2*4]
|
||||
vsub.f s20, s8, s12 @ vector op
|
||||
vadd.f s0, s16, s18
|
||||
vsub.f s2, s16, s18
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s3, s17, s19
|
||||
vadd.f s7, s21, s22
|
||||
vsub.f s5, s21, s22
|
||||
vadd.f s4, s20, s23
|
||||
vsub.f s6, s20, s23
|
||||
vsub.f s20, s24, s28 @ vector op
|
||||
vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
|
||||
vstr d1, [a1, #1 * 2*4]
|
||||
vldr s0, cos1pi4
|
||||
vadd.f s16, s24, s28 @ vector op
|
||||
vstr d2, [a1, #2 * 2*4]
|
||||
vstr d3, [a1, #3 * 2*4]
|
||||
vldr d12, [a1, #0 * 2*4]
|
||||
@ TRANSFORM
|
||||
vmul.f s20, s20, s0 @ vector x scalar op
|
||||
vldr d13, [a1, #1 * 2*4]
|
||||
vldr d14, [a1, #2 * 2*4]
|
||||
vldr d15, [a1, #3 * 2*4]
|
||||
@ BUTTERFLIES
|
||||
vadd.f s0, s18, s16
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s2, s17, s19
|
||||
vsub.f s3, s18, s16
|
||||
vadd.f s4, s21, s20
|
||||
vsub.f s5, s21, s20
|
||||
vadd.f s6, s22, s23
|
||||
vsub.f s7, s22, s23
|
||||
vadd.f s8, s0, s24 @ vector op
|
||||
vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
|
||||
vstr d1, [a1, #1 * 2*4]
|
||||
vldr d6, [a1, #0 * 2*4]
|
||||
vldr d7, [a1, #1 * 2*4]
|
||||
vadd.f s1, s5, s6
|
||||
vadd.f s0, s7, s4
|
||||
vsub.f s2, s5, s6
|
||||
vsub.f s3, s7, s4
|
||||
vsub.f s12, s24, s12 @ vector op
|
||||
vsub.f s5, s29, s1
|
||||
vsub.f s4, s28, s0
|
||||
vsub.f s6, s30, s2
|
||||
vsub.f s7, s31, s3
|
||||
vadd.f s16, s0, s28 @ vector op
|
||||
vstr d6, [a1, #4 * 2*4]
|
||||
vstr d7, [a1, #6 * 2*4]
|
||||
vstr d4, [a1, #0 * 2*4]
|
||||
vstr d5, [a1, #2 * 2*4]
|
||||
vstr d2, [a1, #5 * 2*4]
|
||||
vstr d3, [a1, #7 * 2*4]
|
||||
.endm
|
||||
|
||||
.macro macro_fft8_tail
|
||||
vstr d8, [a1, #1 * 2*4]
|
||||
vstr d9, [a1, #3 * 2*4]
|
||||
.endm
|
||||
|
||||
function .Lfft8_internal_vfp
|
||||
macro_fft8_head
|
||||
macro_fft8_tail
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft8_vfp
|
||||
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
|
||||
fmrx a2, FPSCR
|
||||
fmxr FPSCR, a3
|
||||
vpush {s16-s31}
|
||||
mov ip, lr
|
||||
bl .Lfft8_internal_vfp
|
||||
vpop {s16-s31}
|
||||
fmxr FPSCR, a2
|
||||
bx ip
|
||||
endfunc
|
||||
|
||||
.align 3
|
||||
cos1pi4: @ cos(1*pi/4) = sqrt(2)
|
||||
.float 0.707106769084930419921875
|
||||
cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
|
||||
.float 0.92387950420379638671875
|
||||
cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
|
||||
.float 0.3826834261417388916015625
|
||||
|
||||
function .Lfft16_internal_vfp
|
||||
macro_fft8_head
|
||||
@ FFT4(z+8)
|
||||
vldr d10, [a1, #8 * 2*4]
|
||||
vldr d12, [a1, #9 * 2*4]
|
||||
vldr d11, [a1, #10 * 2*4]
|
||||
vldr d13, [a1, #11 * 2*4]
|
||||
macro_fft8_tail
|
||||
vadd.f s16, s20, s24 @ vector op
|
||||
@ FFT4(z+12)
|
||||
vldr d4, [a1, #12 * 2*4]
|
||||
vldr d6, [a1, #13 * 2*4]
|
||||
vldr d5, [a1, #14 * 2*4]
|
||||
vsub.f s20, s20, s24 @ vector op
|
||||
vldr d7, [a1, #15 * 2*4]
|
||||
vadd.f s0, s16, s18
|
||||
vsub.f s4, s16, s18
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s5, s17, s19
|
||||
vadd.f s7, s21, s22
|
||||
vsub.f s3, s21, s22
|
||||
vadd.f s2, s20, s23
|
||||
vsub.f s6, s20, s23
|
||||
vadd.f s16, s8, s12 @ vector op
|
||||
vstr d0, [a1, #8 * 2*4]
|
||||
vstr d2, [a1, #10 * 2*4]
|
||||
vstr d1, [a1, #9 * 2*4]
|
||||
vsub.f s20, s8, s12
|
||||
vstr d3, [a1, #11 * 2*4]
|
||||
@ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
|
||||
vldr d12, [a1, #10 * 2*4]
|
||||
vadd.f s0, s16, s18
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s6, s16, s18
|
||||
vsub.f s7, s17, s19
|
||||
vsub.f s3, s21, s22
|
||||
vadd.f s2, s20, s23
|
||||
vadd.f s5, s21, s22
|
||||
vsub.f s4, s20, s23
|
||||
vstr d0, [a1, #12 * 2*4]
|
||||
vmov s0, s6
|
||||
@ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
|
||||
vldr d6, [a1, #9 * 2*4]
|
||||
vstr d1, [a1, #13 * 2*4]
|
||||
vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
|
||||
vstr d2, [a1, #15 * 2*4]
|
||||
vldr d7, [a1, #13 * 2*4]
|
||||
vadd.f s4, s25, s24
|
||||
vsub.f s5, s25, s24
|
||||
vsub.f s6, s0, s7
|
||||
vadd.f s7, s0, s7
|
||||
vmul.f s20, s12, s3 @ vector op
|
||||
@ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
|
||||
vldr d4, [a1, #11 * 2*4]
|
||||
vldr d5, [a1, #15 * 2*4]
|
||||
vldr s1, cos3pi8
|
||||
vmul.f s24, s4, s2 @ vector * scalar op
|
||||
vmul.f s28, s12, s1 @ vector * scalar op
|
||||
vmul.f s12, s8, s1 @ vector * scalar op
|
||||
vadd.f s4, s20, s29
|
||||
vsub.f s5, s21, s28
|
||||
vsub.f s6, s22, s31
|
||||
vadd.f s7, s23, s30
|
||||
vmul.f s8, s8, s3 @ vector * scalar op
|
||||
vldr d8, [a1, #1 * 2*4]
|
||||
vldr d9, [a1, #5 * 2*4]
|
||||
vldr d10, [a1, #3 * 2*4]
|
||||
vldr d11, [a1, #7 * 2*4]
|
||||
vldr d14, [a1, #2 * 2*4]
|
||||
vadd.f s0, s6, s4
|
||||
vadd.f s1, s5, s7
|
||||
vsub.f s2, s5, s7
|
||||
vsub.f s3, s6, s4
|
||||
vadd.f s4, s12, s9
|
||||
vsub.f s5, s13, s8
|
||||
vsub.f s6, s14, s11
|
||||
vadd.f s7, s15, s10
|
||||
vadd.f s12, s0, s16 @ vector op
|
||||
vstr d0, [a1, #1 * 2*4]
|
||||
vstr d1, [a1, #5 * 2*4]
|
||||
vldr d4, [a1, #1 * 2*4]
|
||||
vldr d5, [a1, #5 * 2*4]
|
||||
vadd.f s0, s6, s4
|
||||
vadd.f s1, s5, s7
|
||||
vsub.f s2, s5, s7
|
||||
vsub.f s3, s6, s4
|
||||
vsub.f s8, s16, s8 @ vector op
|
||||
vstr d6, [a1, #1 * 2*4]
|
||||
vstr d7, [a1, #5 * 2*4]
|
||||
vldr d15, [a1, #6 * 2*4]
|
||||
vsub.f s4, s20, s0
|
||||
vsub.f s5, s21, s1
|
||||
vsub.f s6, s22, s2
|
||||
vsub.f s7, s23, s3
|
||||
vadd.f s20, s0, s20 @ vector op
|
||||
vstr d4, [a1, #9 * 2*4]
|
||||
@ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
|
||||
vldr d6, [a1, #8 * 2*4]
|
||||
vstr d5, [a1, #13 * 2*4]
|
||||
vldr d7, [a1, #12 * 2*4]
|
||||
vstr d2, [a1, #11 * 2*4]
|
||||
vldr d8, [a1, #0 * 2*4]
|
||||
vstr d3, [a1, #15 * 2*4]
|
||||
vldr d9, [a1, #4 * 2*4]
|
||||
vadd.f s0, s26, s24
|
||||
vadd.f s1, s25, s27
|
||||
vsub.f s2, s25, s27
|
||||
vsub.f s3, s26, s24
|
||||
vadd.f s4, s14, s12
|
||||
vadd.f s5, s13, s15
|
||||
vsub.f s6, s13, s15
|
||||
vsub.f s7, s14, s12
|
||||
vadd.f s8, s0, s28 @ vector op
|
||||
vstr d0, [a1, #3 * 2*4]
|
||||
vstr d1, [a1, #7 * 2*4]
|
||||
vldr d6, [a1, #3 * 2*4]
|
||||
vldr d7, [a1, #7 * 2*4]
|
||||
vsub.f s0, s16, s4
|
||||
vsub.f s1, s17, s5
|
||||
vsub.f s2, s18, s6
|
||||
vsub.f s3, s19, s7
|
||||
vsub.f s12, s28, s12 @ vector op
|
||||
vadd.f s16, s4, s16 @ vector op
|
||||
vstr d10, [a1, #3 * 2*4]
|
||||
vstr d11, [a1, #7 * 2*4]
|
||||
vstr d4, [a1, #2 * 2*4]
|
||||
vstr d5, [a1, #6 * 2*4]
|
||||
vstr d0, [a1, #8 * 2*4]
|
||||
vstr d1, [a1, #12 * 2*4]
|
||||
vstr d6, [a1, #10 * 2*4]
|
||||
vstr d7, [a1, #14 * 2*4]
|
||||
vstr d8, [a1, #0 * 2*4]
|
||||
vstr d9, [a1, #4 * 2*4]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_fft16_vfp, export=1
|
||||
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
|
||||
fmrx a2, FPSCR
|
||||
fmxr FPSCR, a3
|
||||
vpush {s16-s31}
|
||||
mov ip, lr
|
||||
bl .Lfft16_internal_vfp
|
||||
vpop {s16-s31}
|
||||
fmxr FPSCR, a2
|
||||
bx ip
|
||||
endfunc
|
||||
|
||||
.macro pass n, z0, z1, z2, z3
|
||||
add v6, v5, #4*2*\n
|
||||
@ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
|
||||
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
|
||||
@ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
|
||||
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
|
||||
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
|
||||
vldmdb v6!, {s2}
|
||||
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
|
||||
vldmia v5!, {s0,s1} @ s0 is unused
|
||||
vldr s7, [\z2, #8*o2] @ t1
|
||||
vmul.f s20, s16, s2 @ vector * scalar
|
||||
vldr s0, [\z3, #8*o3] @ t5
|
||||
vldr s6, [\z2, #8*o2+4] @ t2
|
||||
vldr s3, [\z3, #8*o3+4] @ t6
|
||||
vmul.f s16, s16, s1 @ vector * scalar
|
||||
ldr a4, =\n-1
|
||||
1: add \z0, \z0, #8*2
|
||||
.if \n*4*2 >= 512
|
||||
add \z1, \z1, #8*2
|
||||
.endif
|
||||
.if \n*4*2 >= 256
|
||||
add \z2, \z2, #8*2
|
||||
.endif
|
||||
.if \n*4*2 >= 512
|
||||
add \z3, \z3, #8*2
|
||||
.endif
|
||||
@ up to 2 stalls (VFP vector issuing / waiting for s0)
|
||||
@ depending upon whether this is the first iteration and
|
||||
@ how many add instructions are inserted above
|
||||
vadd.f s4, s0, s7 @ t5
|
||||
vadd.f s5, s6, s3 @ t6
|
||||
vsub.f s6, s6, s3 @ t4
|
||||
vsub.f s7, s0, s7 @ t3
|
||||
vldr d6, [\z0, #8*0-8*2] @ s12,s13
|
||||
vadd.f s0, s16, s21 @ t1
|
||||
vldr d7, [\z1, #8*o1-8*2] @ s14,s15
|
||||
vsub.f s1, s18, s23 @ t5
|
||||
vadd.f s8, s4, s12 @ vector + vector
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vsub.f s2, s17, s20 @ t2
|
||||
vadd.f s3, s19, s22 @ t6
|
||||
vstr d4, [\z0, #8*0-8*2] @ s8,s9
|
||||
vstr d5, [\z1, #8*o1-8*2] @ s10,s11
|
||||
@ stall (waiting for s5)
|
||||
vstr d2, [\z2, #8*o2-8*2] @ s4,s5
|
||||
vadd.f s4, s1, s0 @ t5
|
||||
vstr d3, [\z3, #8*o3-8*2] @ s6,s7
|
||||
vsub.f s7, s1, s0 @ t3
|
||||
vadd.f s5, s2, s3 @ t6
|
||||
vsub.f s6, s2, s3 @ t4
|
||||
vldr d6, [\z0, #8*1-8*2] @ s12,s13
|
||||
vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
|
||||
vldr d4, [\z2, #8*o2] @ s8,s9
|
||||
vldmdb v6!, {s2,s3}
|
||||
vldr d5, [\z3, #8*o3] @ s10,s11
|
||||
vadd.f s20, s4, s12 @ vector + vector
|
||||
vldmia v5!, {s0,s1}
|
||||
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
|
||||
@ stall (VFP vector issuing)
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vmul.f s12, s8, s3 @ vector * scalar
|
||||
vstr d10, [\z0, #8*1-8*2] @ s20,s21
|
||||
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
|
||||
vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
|
||||
vmul.f s8, s8, s0 @ vector * scalar
|
||||
vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
|
||||
@ stall (waiting for s7)
|
||||
vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
|
||||
vmul.f s20, s16, s2 @ vector * scalar
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
vadd.f s7, s8, s13 @ t1
|
||||
vsub.f s6, s9, s12 @ t2
|
||||
vsub.f s0, s10, s15 @ t5
|
||||
vadd.f s3, s11, s14 @ t6
|
||||
vmul.f s16, s16, s1 @ vector * scalar
|
||||
subs a4, a4, #1
|
||||
bne 1b
|
||||
@ What remains is identical to the first two indentations of
|
||||
@ the above, but without the increment of z
|
||||
vadd.f s4, s0, s7 @ t5
|
||||
vadd.f s5, s6, s3 @ t6
|
||||
vsub.f s6, s6, s3 @ t4
|
||||
vsub.f s7, s0, s7 @ t3
|
||||
vldr d6, [\z0, #8*0] @ s12,s13
|
||||
vadd.f s0, s16, s21 @ t1
|
||||
vldr d7, [\z1, #8*o1] @ s14,s15
|
||||
vsub.f s1, s18, s23 @ t5
|
||||
vadd.f s8, s4, s12 @ vector + vector
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vsub.f s2, s17, s20 @ t2
|
||||
vadd.f s3, s19, s22 @ t6
|
||||
vstr d4, [\z0, #8*0] @ s8,s9
|
||||
vstr d5, [\z1, #8*o1] @ s10,s11
|
||||
vstr d2, [\z2, #8*o2] @ s4,s5
|
||||
vadd.f s4, s1, s0 @ t5
|
||||
vstr d3, [\z3, #8*o3] @ s6,s7
|
||||
vsub.f s7, s1, s0 @ t3
|
||||
vadd.f s5, s2, s3 @ t6
|
||||
vsub.f s6, s2, s3 @ t4
|
||||
vldr d6, [\z0, #8*1] @ s12,s13
|
||||
vldr d7, [\z1, #8*(o1+1)] @ s14,s15
|
||||
vadd.f s20, s4, s12 @ vector + vector
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vstr d10, [\z0, #8*1] @ s20,s21
|
||||
vstr d11, [\z1, #8*(o1+1)] @ s22,s23
|
||||
vstr d2, [\z2, #8*(o2+1)] @ s4,s5
|
||||
vstr d3, [\z3, #8*(o3+1)] @ s6,s7
|
||||
.endm
|
||||
|
||||
.macro def_fft n, n2, n4
|
||||
function .Lfft\n\()_internal_vfp
|
||||
.if \n >= 512
|
||||
push {v1-v6,lr}
|
||||
.elseif \n >= 256
|
||||
push {v1-v2,v5-v6,lr}
|
||||
.else
|
||||
push {v1,v5-v6,lr}
|
||||
.endif
|
||||
mov v1, a1
|
||||
bl .Lfft\n2\()_internal_vfp
|
||||
add a1, v1, #8*(\n/4)*2
|
||||
bl .Lfft\n4\()_internal_vfp
|
||||
movrelx v5, X(ff_cos_\n), a1
|
||||
add a1, v1, #8*(\n/4)*3
|
||||
bl .Lfft\n4\()_internal_vfp
|
||||
.if \n >= 512
|
||||
.set o1, 0*(\n/4/2)
|
||||
.set o2, 0*(\n/4/2)
|
||||
.set o3, 0*(\n/4/2)
|
||||
add v2, v1, #8*2*(\n/4/2)
|
||||
add v3, v1, #8*4*(\n/4/2)
|
||||
add v4, v1, #8*6*(\n/4/2)
|
||||
pass (\n/4/2), v1, v2, v3, v4
|
||||
pop {v1-v6,pc}
|
||||
.elseif \n >= 256
|
||||
.set o1, 2*(\n/4/2)
|
||||
.set o2, 0*(\n/4/2)
|
||||
.set o3, 2*(\n/4/2)
|
||||
add v2, v1, #8*4*(\n/4/2)
|
||||
pass (\n/4/2), v1, v1, v2, v2
|
||||
pop {v1-v2,v5-v6,pc}
|
||||
.else
|
||||
.set o1, 2*(\n/4/2)
|
||||
.set o2, 4*(\n/4/2)
|
||||
.set o3, 6*(\n/4/2)
|
||||
pass (\n/4/2), v1, v1, v1, v1
|
||||
pop {v1,v5-v6,pc}
|
||||
.endif
|
||||
endfunc
|
||||
|
||||
function fft\n\()_vfp
|
||||
ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
|
||||
fmrx a2, FPSCR
|
||||
fmxr FPSCR, a3
|
||||
vpush {s16-s31}
|
||||
mov ip, lr
|
||||
bl .Lfft\n\()_internal_vfp
|
||||
vpop {s16-s31}
|
||||
fmxr FPSCR, a2
|
||||
bx ip
|
||||
endfunc
|
||||
|
||||
.ltorg
|
||||
.endm
|
||||
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
||||
146
externals/ffmpeg/libavcodec/arm/flacdsp_arm.S
vendored
Executable file
146
externals/ffmpeg/libavcodec/arm/flacdsp_arm.S
vendored
Executable file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function flac_lpc_16_1_arm
|
||||
ldr r12, [sp]
|
||||
push {r4, lr}
|
||||
ldr r1, [r1]
|
||||
subs r12, r12, #2
|
||||
ldr lr, [r0], #4
|
||||
beq 2f
|
||||
it lt
|
||||
poplt {r4, pc}
|
||||
1:
|
||||
mul r4, lr, r1
|
||||
ldm r0, {r2, lr}
|
||||
add_sh r2, r2, r4, asr r3
|
||||
mul r4, r2, r1
|
||||
subs r12, r12, #2
|
||||
add_sh lr, lr, r4, asr r3
|
||||
stm r0!, {r2, lr}
|
||||
bgt 1b
|
||||
it lt
|
||||
poplt {r4, pc}
|
||||
2:
|
||||
mul r4, lr, r1
|
||||
ldr r2, [r0]
|
||||
add_sh r2, r2, r4, asr r3
|
||||
str r2, [r0]
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function flac_lpc_16_2_arm
|
||||
ldr r12, [sp]
|
||||
subs r12, r12, r2
|
||||
it le
|
||||
bxle lr
|
||||
|
||||
push {r4-r9, lr}
|
||||
ldm r0!, {r6, r7}
|
||||
ldm r1, {r8, r9}
|
||||
subs r12, r12, #1
|
||||
beq 2f
|
||||
1:
|
||||
mul r4, r6, r8
|
||||
mul r5, r7, r8
|
||||
mla r4, r7, r9, r4
|
||||
ldm r0, {r6, r7}
|
||||
add_sh r6, r6, r4, asr r3
|
||||
mla r5, r6, r9, r5
|
||||
add_sh r7, r7, r5, asr r3
|
||||
stm r0!, {r6, r7}
|
||||
subs r12, r12, #2
|
||||
bgt 1b
|
||||
it lt
|
||||
poplt {r4-r9, pc}
|
||||
2:
|
||||
mul r4, r6, r8
|
||||
mla r4, r7, r9, r4
|
||||
ldr r5, [r0]
|
||||
add_sh r5, r5, r4, asr r3
|
||||
str r5, [r0]
|
||||
pop {r4-r9, pc}
|
||||
endfunc
|
||||
|
||||
function ff_flac_lpc_16_arm, export=1
|
||||
cmp r2, #2
|
||||
blt flac_lpc_16_1_arm
|
||||
beq flac_lpc_16_2_arm
|
||||
|
||||
ldr r12, [sp]
|
||||
subs r12, r12, r2
|
||||
it le
|
||||
bxle lr
|
||||
|
||||
push {r4-r9, lr}
|
||||
|
||||
subs r12, r12, #1
|
||||
beq 3f
|
||||
1:
|
||||
sub lr, r2, #2
|
||||
mov r4, #0
|
||||
mov r5, #0
|
||||
|
||||
ldr r7, [r0], #4
|
||||
ldr r9, [r1], #4
|
||||
2:
|
||||
mla r4, r7, r9, r4
|
||||
ldm r0!, {r6, r7}
|
||||
mla r5, r6, r9, r5
|
||||
ldm r1!, {r8, r9}
|
||||
mla r4, r6, r8, r4
|
||||
subs lr, lr, #2
|
||||
mla r5, r7, r8, r5
|
||||
bgt 2b
|
||||
blt 6f
|
||||
|
||||
mla r4, r7, r9, r4
|
||||
ldr r7, [r0], #4
|
||||
mla r5, r7, r9, r5
|
||||
ldr r9, [r1], #4
|
||||
6:
|
||||
mla r4, r7, r9, r4
|
||||
ldm r0, {r6, r7}
|
||||
add_sh r6, r6, r4, asr r3
|
||||
mla r5, r6, r9, r5
|
||||
add_sh r7, r7, r5, asr r3
|
||||
stm r0!, {r6, r7}
|
||||
sub r0, r0, r2, lsl #2
|
||||
sub r1, r1, r2, lsl #2
|
||||
|
||||
subs r12, r12, #2
|
||||
bgt 1b
|
||||
it lt
|
||||
poplt {r4-r9, pc}
|
||||
3:
|
||||
mov r4, #0
|
||||
4:
|
||||
ldr r5, [r1], #4
|
||||
ldr r6, [r0], #4
|
||||
mla r4, r5, r6, r4
|
||||
subs r2, r2, #1
|
||||
bgt 4b
|
||||
ldr r5, [r0]
|
||||
add_sh r5, r5, r4, asr r3
|
||||
str r5, [r0]
|
||||
pop {r4-r9, pc}
|
||||
endfunc
|
||||
32
externals/ffmpeg/libavcodec/arm/flacdsp_init_arm.c
vendored
Executable file
32
externals/ffmpeg/libavcodec/arm/flacdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavcodec/flacdsp.h"
|
||||
#include "config.h"
|
||||
|
||||
void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
|
||||
int qlevel, int len);
|
||||
|
||||
av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
|
||||
int bps)
|
||||
{
|
||||
if (CONFIG_FLAC_DECODER)
|
||||
c->lpc16 = ff_flac_lpc_16_arm;
|
||||
}
|
||||
53
externals/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
vendored
Executable file
53
externals/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
vendored
Executable file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* ARM optimized Format Conversion Utils
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/fmtconvert.h"
|
||||
|
||||
void ff_int32_to_float_fmul_array8_neon(FmtConvertContext *c, float *dst,
|
||||
const int32_t *src, const float *mul,
|
||||
int len);
|
||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
|
||||
float mul, int len);
|
||||
|
||||
void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
|
||||
float mul, int len);
|
||||
void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
|
||||
const int32_t *src, const float *mul,
|
||||
int len);
|
||||
|
||||
av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_vfp_vm(cpu_flags)) {
|
||||
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
|
||||
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_neon;
|
||||
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
||||
}
|
||||
}
|
||||
88
externals/ffmpeg/libavcodec/arm/fmtconvert_neon.S
vendored
Executable file
88
externals/ffmpeg/libavcodec/arm/fmtconvert_neon.S
vendored
Executable file
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* ARM NEON optimised Format Conversion Utils
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>b
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_int32_to_float_fmul_scalar_neon, export=1
|
||||
VFP vdup.32 q0, d0[0]
|
||||
VFP len .req r2
|
||||
NOVFP vdup.32 q0, r2
|
||||
NOVFP len .req r3
|
||||
|
||||
vld1.32 {q1},[r1,:128]!
|
||||
vcvt.f32.s32 q3, q1
|
||||
vld1.32 {q2},[r1,:128]!
|
||||
vcvt.f32.s32 q8, q2
|
||||
1: subs len, len, #8
|
||||
pld [r1, #16]
|
||||
vmul.f32 q9, q3, q0
|
||||
vmul.f32 q10, q8, q0
|
||||
beq 2f
|
||||
vld1.32 {q1},[r1,:128]!
|
||||
vcvt.f32.s32 q3, q1
|
||||
vld1.32 {q2},[r1,:128]!
|
||||
vcvt.f32.s32 q8, q2
|
||||
vst1.32 {q9}, [r0,:128]!
|
||||
vst1.32 {q10},[r0,:128]!
|
||||
b 1b
|
||||
2: vst1.32 {q9}, [r0,:128]!
|
||||
vst1.32 {q10},[r0,:128]!
|
||||
bx lr
|
||||
.unreq len
|
||||
endfunc
|
||||
|
||||
function ff_int32_to_float_fmul_array8_neon, export=1
|
||||
ldr r0, [sp]
|
||||
lsr r0, r0, #3
|
||||
subs r0, r0, #1
|
||||
beq 1f
|
||||
2:
|
||||
vld1.32 {q0-q1}, [r2,:128]!
|
||||
vld1.32 {q2-q3}, [r2,:128]!
|
||||
vld1.32 {d20}, [r3]!
|
||||
subs r0, r0, #2
|
||||
vcvt.f32.s32 q0, q0
|
||||
vcvt.f32.s32 q1, q1
|
||||
vdup.32 q8, d20[0]
|
||||
vcvt.f32.s32 q2, q2
|
||||
vcvt.f32.s32 q3, q3
|
||||
vmul.f32 q0, q0, q8
|
||||
vdup.32 q9, d20[1]
|
||||
vmul.f32 q1, q1, q8
|
||||
vmul.f32 q2, q2, q9
|
||||
vmul.f32 q3, q3, q9
|
||||
vst1.32 {q0-q1}, [r1,:128]!
|
||||
vst1.32 {q2-q3}, [r1,:128]!
|
||||
bgt 2b
|
||||
it lt
|
||||
bxlt lr
|
||||
1:
|
||||
vld1.32 {q0-q1}, [r2,:128]
|
||||
vld1.32 {d16[],d17[]}, [r3]
|
||||
vcvt.f32.s32 q0, q0
|
||||
vcvt.f32.s32 q1, q1
|
||||
vmul.f32 q0, q0, q8
|
||||
vmul.f32 q1, q1, q8
|
||||
vst1.32 {q0-q1}, [r1,:128]
|
||||
bx lr
|
||||
endfunc
|
||||
221
externals/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
vendored
Executable file
221
externals/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
vendored
Executable file
@@ -0,0 +1,221 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
/**
|
||||
* ARM VFP optimised int32 to float conversion.
|
||||
* Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
|
||||
* (16 bytes alignment is best for BCM2835), little-endian.
|
||||
*/
|
||||
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
|
||||
function ff_int32_to_float_fmul_array8_vfp, export=1
|
||||
push {lr}
|
||||
ldr a1, [sp, #4]
|
||||
subs lr, a1, #3*8
|
||||
bcc 50f @ too short to pipeline
|
||||
@ Now need to find (len / 8) % 3. The approximation
|
||||
@ x / 24 = (x * 0xAB) >> 12
|
||||
@ is good for x < 4096, which is true for both AC3 and DCA.
|
||||
mov a1, #0xAB
|
||||
ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
|
||||
mul a1, lr, a1
|
||||
vpush {s16-s31}
|
||||
mov a1, a1, lsr #12
|
||||
add a1, a1, a1, lsl #1
|
||||
rsb a1, a1, lr, lsr #3
|
||||
cmp a1, #1
|
||||
fmrx a1, FPSCR
|
||||
fmxr FPSCR, ip
|
||||
beq 11f
|
||||
blo 10f
|
||||
@ Array is (2 + multiple of 3) x 8 floats long
|
||||
@ drop through...
|
||||
vldmia a3!, {s16-s23}
|
||||
vldmia a4!, {s2,s3}
|
||||
vldmia a3!, {s24-s31}
|
||||
vcvt.f32.s32 s16, s16
|
||||
vcvt.f32.s32 s17, s17
|
||||
vcvt.f32.s32 s18, s18
|
||||
vcvt.f32.s32 s19, s19
|
||||
vcvt.f32.s32 s20, s20
|
||||
vcvt.f32.s32 s21, s21
|
||||
vcvt.f32.s32 s22, s22
|
||||
vcvt.f32.s32 s23, s23
|
||||
vmul.f32 s16, s16, s2
|
||||
@ drop through...
|
||||
3:
|
||||
vldmia a3!, {s8-s15}
|
||||
vldmia a4!, {s1}
|
||||
vcvt.f32.s32 s24, s24
|
||||
vcvt.f32.s32 s25, s25
|
||||
vcvt.f32.s32 s26, s26
|
||||
vcvt.f32.s32 s27, s27
|
||||
vcvt.f32.s32 s28, s28
|
||||
vcvt.f32.s32 s29, s29
|
||||
vcvt.f32.s32 s30, s30
|
||||
vcvt.f32.s32 s31, s31
|
||||
vmul.f32 s24, s24, s3
|
||||
vstmia a2!, {s16-s19}
|
||||
vstmia a2!, {s20-s23}
|
||||
2:
|
||||
vldmia a3!, {s16-s23}
|
||||
vldmia a4!, {s2}
|
||||
vcvt.f32.s32 s8, s8
|
||||
vcvt.f32.s32 s9, s9
|
||||
vcvt.f32.s32 s10, s10
|
||||
vcvt.f32.s32 s11, s11
|
||||
vcvt.f32.s32 s12, s12
|
||||
vcvt.f32.s32 s13, s13
|
||||
vcvt.f32.s32 s14, s14
|
||||
vcvt.f32.s32 s15, s15
|
||||
vmul.f32 s8, s8, s1
|
||||
vstmia a2!, {s24-s27}
|
||||
vstmia a2!, {s28-s31}
|
||||
1:
|
||||
vldmia a3!, {s24-s31}
|
||||
vldmia a4!, {s3}
|
||||
vcvt.f32.s32 s16, s16
|
||||
vcvt.f32.s32 s17, s17
|
||||
vcvt.f32.s32 s18, s18
|
||||
vcvt.f32.s32 s19, s19
|
||||
vcvt.f32.s32 s20, s20
|
||||
vcvt.f32.s32 s21, s21
|
||||
vcvt.f32.s32 s22, s22
|
||||
vcvt.f32.s32 s23, s23
|
||||
vmul.f32 s16, s16, s2
|
||||
vstmia a2!, {s8-s11}
|
||||
vstmia a2!, {s12-s15}
|
||||
|
||||
subs lr, lr, #8*3
|
||||
bpl 3b
|
||||
|
||||
vcvt.f32.s32 s24, s24
|
||||
vcvt.f32.s32 s25, s25
|
||||
vcvt.f32.s32 s26, s26
|
||||
vcvt.f32.s32 s27, s27
|
||||
vcvt.f32.s32 s28, s28
|
||||
vcvt.f32.s32 s29, s29
|
||||
vcvt.f32.s32 s30, s30
|
||||
vcvt.f32.s32 s31, s31
|
||||
vmul.f32 s24, s24, s3
|
||||
vstmia a2!, {s16-s19}
|
||||
vstmia a2!, {s20-s23}
|
||||
vstmia a2!, {s24-s27}
|
||||
vstmia a2!, {s28-s31}
|
||||
|
||||
fmxr FPSCR, a1
|
||||
vpop {s16-s31}
|
||||
pop {pc}
|
||||
|
||||
10: @ Array is (multiple of 3) x 8 floats long
|
||||
vldmia a3!, {s8-s15}
|
||||
vldmia a4!, {s1,s2}
|
||||
vldmia a3!, {s16-s23}
|
||||
vcvt.f32.s32 s8, s8
|
||||
vcvt.f32.s32 s9, s9
|
||||
vcvt.f32.s32 s10, s10
|
||||
vcvt.f32.s32 s11, s11
|
||||
vcvt.f32.s32 s12, s12
|
||||
vcvt.f32.s32 s13, s13
|
||||
vcvt.f32.s32 s14, s14
|
||||
vcvt.f32.s32 s15, s15
|
||||
vmul.f32 s8, s8, s1
|
||||
b 1b
|
||||
|
||||
11: @ Array is (1 + multiple of 3) x 8 floats long
|
||||
vldmia a3!, {s24-s31}
|
||||
vldmia a4!, {s3}
|
||||
vldmia a3!, {s8-s15}
|
||||
vldmia a4!, {s1}
|
||||
vcvt.f32.s32 s24, s24
|
||||
vcvt.f32.s32 s25, s25
|
||||
vcvt.f32.s32 s26, s26
|
||||
vcvt.f32.s32 s27, s27
|
||||
vcvt.f32.s32 s28, s28
|
||||
vcvt.f32.s32 s29, s29
|
||||
vcvt.f32.s32 s30, s30
|
||||
vcvt.f32.s32 s31, s31
|
||||
vmul.f32 s24, s24, s3
|
||||
b 2b
|
||||
|
||||
50:
|
||||
ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
|
||||
fmrx ip, FPSCR
|
||||
fmxr FPSCR, lr
|
||||
51:
|
||||
vldmia a3!, {s8-s15}
|
||||
vldmia a4!, {s0}
|
||||
vcvt.f32.s32 s8, s8
|
||||
vcvt.f32.s32 s9, s9
|
||||
vcvt.f32.s32 s10, s10
|
||||
vcvt.f32.s32 s11, s11
|
||||
vcvt.f32.s32 s12, s12
|
||||
vcvt.f32.s32 s13, s13
|
||||
vcvt.f32.s32 s14, s14
|
||||
vcvt.f32.s32 s15, s15
|
||||
vmul.f32 s8, s8, s0
|
||||
subs a1, a1, #8
|
||||
vstmia a2!, {s8-s11}
|
||||
vstmia a2!, {s12-s15}
|
||||
bne 51b
|
||||
|
||||
fmxr FPSCR, ip
|
||||
pop {pc}
|
||||
endfunc
|
||||
|
||||
/**
|
||||
* ARM VFP optimised int32 to float conversion.
|
||||
* Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
|
||||
* (16 bytes alignment is best for BCM2835), little-endian.
|
||||
* TODO: could be further optimised by unrolling and interleaving, as above
|
||||
*/
|
||||
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
|
||||
function ff_int32_to_float_fmul_scalar_vfp, export=1
|
||||
VFP tmp .req a4
|
||||
VFP len .req a3
|
||||
NOVFP tmp .req a3
|
||||
NOVFP len .req a4
|
||||
NOVFP vmov s0, a3
|
||||
ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
|
||||
fmrx ip, FPSCR
|
||||
fmxr FPSCR, tmp
|
||||
1:
|
||||
vldmia a2!, {s8-s15}
|
||||
vcvt.f32.s32 s8, s8
|
||||
vcvt.f32.s32 s9, s9
|
||||
vcvt.f32.s32 s10, s10
|
||||
vcvt.f32.s32 s11, s11
|
||||
vcvt.f32.s32 s12, s12
|
||||
vcvt.f32.s32 s13, s13
|
||||
vcvt.f32.s32 s14, s14
|
||||
vcvt.f32.s32 s15, s15
|
||||
vmul.f32 s8, s8, s0
|
||||
subs len, len, #8
|
||||
vstmia a1!, {s8-s11}
|
||||
vstmia a1!, {s12-s15}
|
||||
bne 1b
|
||||
|
||||
fmxr FPSCR, ip
|
||||
bx lr
|
||||
endfunc
|
||||
.unreq tmp
|
||||
.unreq len
|
||||
35
externals/ffmpeg/libavcodec/arm/g722dsp_init_arm.c
vendored
Executable file
35
externals/ffmpeg/libavcodec/arm/g722dsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/g722dsp.h"
|
||||
|
||||
extern void ff_g722_apply_qmf_neon(const int16_t *prev_samples, int xout[2]);
|
||||
|
||||
av_cold void ff_g722dsp_init_arm(G722DSPContext *dsp)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags))
|
||||
dsp->apply_qmf = ff_g722_apply_qmf_neon;
|
||||
}
|
||||
69
externals/ffmpeg/libavcodec/arm/g722dsp_neon.S
vendored
Executable file
69
externals/ffmpeg/libavcodec/arm/g722dsp_neon.S
vendored
Executable file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* ARM NEON optimised DSP functions for G722 coding
|
||||
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_g722_apply_qmf_neon, export=1, align=4
|
||||
movrel r3, qmf_coeffs
|
||||
vld1.s16 {d2,d3,d4}, [r0]! /* load prev_samples */
|
||||
vld1.s16 {d16,d17,d18}, [r3,:64]! /* load qmf_coeffs */
|
||||
vmull.s16 q0, d2, d16
|
||||
vmlal.s16 q0, d3, d17
|
||||
vmlal.s16 q0, d4, d18
|
||||
|
||||
vld1.s16 {d5,d6,d7}, [r0]! /* load prev_samples */
|
||||
vld1.s16 {d19,d20,d21}, [r3,:64]! /* load qmf_coeffs */
|
||||
vmlal.s16 q0, d5, d19
|
||||
vmlal.s16 q0, d6, d20
|
||||
vmlal.s16 q0, d7, d21
|
||||
|
||||
vadd.s32 d0, d1, d0
|
||||
vrev64.32 d0, d0
|
||||
vst1.s32 {d0}, [r1]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
const qmf_coeffs, align=4
|
||||
.hword 3
|
||||
.hword -11
|
||||
.hword -11
|
||||
.hword 53
|
||||
.hword 12
|
||||
.hword -156
|
||||
.hword 32
|
||||
.hword 362
|
||||
.hword -210
|
||||
.hword -805
|
||||
.hword 951
|
||||
.hword 3876
|
||||
.hword 3876
|
||||
.hword 951
|
||||
.hword -805
|
||||
.hword -210
|
||||
.hword 362
|
||||
.hword 32
|
||||
.hword -156
|
||||
.hword 12
|
||||
.hword 53
|
||||
.hword -11
|
||||
.hword -11
|
||||
.hword 3
|
||||
endconst
|
||||
57
externals/ffmpeg/libavcodec/arm/h264chroma_init_arm.c
vendored
Executable file
57
externals/ffmpeg/libavcodec/arm/h264chroma_init_arm.c
vendored
Executable file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* ARM NEON optimised H.264 chroma functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/h264chroma.h"
|
||||
|
||||
void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y);
|
||||
void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y);
|
||||
void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y);
|
||||
|
||||
void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y);
|
||||
void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y);
|
||||
void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y);
|
||||
|
||||
av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth)
|
||||
{
|
||||
const int high_bit_depth = bit_depth > 8;
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags) && !high_bit_depth) {
|
||||
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
|
||||
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
|
||||
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
|
||||
|
||||
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
|
||||
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
|
||||
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
|
||||
}
|
||||
}
|
||||
463
externals/ffmpeg/libavcodec/arm/h264cmc_neon.S
vendored
Executable file
463
externals/ffmpeg/libavcodec/arm/h264cmc_neon.S
vendored
Executable file
@@ -0,0 +1,463 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
|
||||
.macro h264_chroma_mc8 type, codec=h264
|
||||
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
|
||||
push {r4-r7, lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
.ifc \type,avg
|
||||
mov lr, r0
|
||||
.endif
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
|
||||
.ifc \codec,rv40
|
||||
movrel r6, rv40bias
|
||||
lsr r7, r5, #1
|
||||
add r6, r6, r7, lsl #3
|
||||
lsr r7, r4, #1
|
||||
add r6, r6, r7, lsl #1
|
||||
vld1.16 {d22[],d23[]}, [r6,:16]
|
||||
.endif
|
||||
.ifc \codec,vc1
|
||||
vmov.u16 q11, #28
|
||||
.endif
|
||||
|
||||
A muls r7, r4, r5
|
||||
T mul r7, r4, r5
|
||||
T cmp r7, #0
|
||||
rsb r6, r7, r5, lsl #3
|
||||
rsb r12, r7, r4, lsl #3
|
||||
sub r4, r7, r4, lsl #3
|
||||
sub r4, r4, r5, lsl #3
|
||||
add r4, r4, #64
|
||||
|
||||
beq 2f
|
||||
|
||||
vdup.8 d0, r4
|
||||
vdup.8 d1, r12
|
||||
vld1.8 {d4, d5}, [r1], r2
|
||||
vdup.8 d2, r6
|
||||
vdup.8 d3, r7
|
||||
vext.8 d5, d4, d5, #1
|
||||
|
||||
1: vld1.8 {d6, d7}, [r1], r2
|
||||
vmull.u8 q8, d4, d0
|
||||
vmlal.u8 q8, d5, d1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vld1.8 {d4, d5}, [r1], r2
|
||||
vmlal.u8 q8, d6, d2
|
||||
pld [r1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vmlal.u8 q8, d7, d3
|
||||
vmull.u8 q9, d6, d0
|
||||
subs r3, r3, #2
|
||||
vmlal.u8 q9, d7, d1
|
||||
vmlal.u8 q9, d4, d2
|
||||
vmlal.u8 q9, d5, d3
|
||||
pld [r1, r2]
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
vrshrn.u16 d17, q9, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vadd.u16 q9, q9, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
vshrn.u16 d17, q9, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
vld1.8 {d20}, [lr,:64], r2
|
||||
vld1.8 {d21}, [lr,:64], r2
|
||||
vrhadd.u8 q8, q8, q10
|
||||
.endif
|
||||
vst1.8 {d16}, [r0,:64], r2
|
||||
vst1.8 {d17}, [r0,:64], r2
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
|
||||
2: adds r12, r12, r6
|
||||
vdup.8 d0, r4
|
||||
beq 5f
|
||||
tst r6, r6
|
||||
vdup.8 d1, r12
|
||||
|
||||
beq 4f
|
||||
|
||||
vld1.8 {d4}, [r1], r2
|
||||
|
||||
3: vld1.8 {d6}, [r1], r2
|
||||
vmull.u8 q8, d4, d0
|
||||
vmlal.u8 q8, d6, d1
|
||||
vld1.8 {d4}, [r1], r2
|
||||
vmull.u8 q9, d6, d0
|
||||
vmlal.u8 q9, d4, d1
|
||||
pld [r1]
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
vrshrn.u16 d17, q9, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vadd.u16 q9, q9, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
vshrn.u16 d17, q9, #6
|
||||
.endif
|
||||
pld [r1, r2]
|
||||
.ifc \type,avg
|
||||
vld1.8 {d20}, [lr,:64], r2
|
||||
vld1.8 {d21}, [lr,:64], r2
|
||||
vrhadd.u8 q8, q8, q10
|
||||
.endif
|
||||
subs r3, r3, #2
|
||||
vst1.8 {d16}, [r0,:64], r2
|
||||
vst1.8 {d17}, [r0,:64], r2
|
||||
bgt 3b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
|
||||
4: vld1.8 {d4, d5}, [r1], r2
|
||||
vld1.8 {d6, d7}, [r1], r2
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
pld [r1]
|
||||
subs r3, r3, #2
|
||||
vmull.u8 q8, d4, d0
|
||||
vmlal.u8 q8, d5, d1
|
||||
vmull.u8 q9, d6, d0
|
||||
vmlal.u8 q9, d7, d1
|
||||
pld [r1, r2]
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
vrshrn.u16 d17, q9, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vadd.u16 q9, q9, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
vshrn.u16 d17, q9, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
vld1.8 {d20}, [lr,:64], r2
|
||||
vld1.8 {d21}, [lr,:64], r2
|
||||
vrhadd.u8 q8, q8, q10
|
||||
.endif
|
||||
vst1.8 {d16}, [r0,:64], r2
|
||||
vst1.8 {d17}, [r0,:64], r2
|
||||
bgt 4b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
|
||||
5: vld1.8 {d4}, [r1], r2
|
||||
vld1.8 {d5}, [r1], r2
|
||||
pld [r1]
|
||||
subs r3, r3, #2
|
||||
vmull.u8 q8, d4, d0
|
||||
vmull.u8 q9, d5, d0
|
||||
pld [r1, r2]
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
vrshrn.u16 d17, q9, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vadd.u16 q9, q9, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
vshrn.u16 d17, q9, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
vld1.8 {d20}, [lr,:64], r2
|
||||
vld1.8 {d21}, [lr,:64], r2
|
||||
vrhadd.u8 q8, q8, q10
|
||||
.endif
|
||||
vst1.8 {d16}, [r0,:64], r2
|
||||
vst1.8 {d17}, [r0,:64], r2
|
||||
bgt 5b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
|
||||
.macro h264_chroma_mc4 type, codec=h264
|
||||
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
|
||||
push {r4-r7, lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
.ifc \type,avg
|
||||
mov lr, r0
|
||||
.endif
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
|
||||
.ifc \codec,rv40
|
||||
movrel r6, rv40bias
|
||||
lsr r7, r5, #1
|
||||
add r6, r6, r7, lsl #3
|
||||
lsr r7, r4, #1
|
||||
add r6, r6, r7, lsl #1
|
||||
vld1.16 {d22[],d23[]}, [r6,:16]
|
||||
.endif
|
||||
.ifc \codec,vc1
|
||||
vmov.u16 q11, #28
|
||||
.endif
|
||||
|
||||
A muls r7, r4, r5
|
||||
T mul r7, r4, r5
|
||||
T cmp r7, #0
|
||||
rsb r6, r7, r5, lsl #3
|
||||
rsb r12, r7, r4, lsl #3
|
||||
sub r4, r7, r4, lsl #3
|
||||
sub r4, r4, r5, lsl #3
|
||||
add r4, r4, #64
|
||||
|
||||
beq 2f
|
||||
|
||||
vdup.8 d0, r4
|
||||
vdup.8 d1, r12
|
||||
vld1.8 {d4}, [r1], r2
|
||||
vdup.8 d2, r6
|
||||
vdup.8 d3, r7
|
||||
|
||||
vext.8 d5, d4, d5, #1
|
||||
vtrn.32 d4, d5
|
||||
|
||||
vtrn.32 d0, d1
|
||||
vtrn.32 d2, d3
|
||||
|
||||
1: vld1.8 {d6}, [r1], r2
|
||||
vext.8 d7, d6, d7, #1
|
||||
vtrn.32 d6, d7
|
||||
vmull.u8 q8, d4, d0
|
||||
vmlal.u8 q8, d6, d2
|
||||
vld1.8 {d4}, [r1], r2
|
||||
vext.8 d5, d4, d5, #1
|
||||
vtrn.32 d4, d5
|
||||
pld [r1]
|
||||
vmull.u8 q9, d6, d0
|
||||
vmlal.u8 q9, d4, d2
|
||||
vadd.i16 d16, d16, d17
|
||||
vadd.i16 d17, d18, d19
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
.endif
|
||||
subs r3, r3, #2
|
||||
pld [r1, r2]
|
||||
.ifc \type,avg
|
||||
vld1.32 {d20[0]}, [lr,:32], r2
|
||||
vld1.32 {d20[1]}, [lr,:32], r2
|
||||
vrhadd.u8 d16, d16, d20
|
||||
.endif
|
||||
vst1.32 {d16[0]}, [r0,:32], r2
|
||||
vst1.32 {d16[1]}, [r0,:32], r2
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
|
||||
2: adds r12, r12, r6
|
||||
vdup.8 d0, r4
|
||||
beq 5f
|
||||
tst r6, r6
|
||||
vdup.8 d1, r12
|
||||
vtrn.32 d0, d1
|
||||
|
||||
beq 4f
|
||||
|
||||
vext.32 d1, d0, d1, #1
|
||||
vld1.32 {d4[0]}, [r1], r2
|
||||
|
||||
3: vld1.32 {d4[1]}, [r1], r2
|
||||
vmull.u8 q8, d4, d0
|
||||
vld1.32 {d4[0]}, [r1], r2
|
||||
vmull.u8 q9, d4, d1
|
||||
vadd.i16 d16, d16, d17
|
||||
vadd.i16 d17, d18, d19
|
||||
pld [r1]
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
vld1.32 {d20[0]}, [lr,:32], r2
|
||||
vld1.32 {d20[1]}, [lr,:32], r2
|
||||
vrhadd.u8 d16, d16, d20
|
||||
.endif
|
||||
subs r3, r3, #2
|
||||
pld [r1, r2]
|
||||
vst1.32 {d16[0]}, [r0,:32], r2
|
||||
vst1.32 {d16[1]}, [r0,:32], r2
|
||||
bgt 3b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
|
||||
4: vld1.8 {d4}, [r1], r2
|
||||
vld1.8 {d6}, [r1], r2
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vtrn.32 d4, d5
|
||||
vtrn.32 d6, d7
|
||||
vmull.u8 q8, d4, d0
|
||||
vmull.u8 q9, d6, d0
|
||||
subs r3, r3, #2
|
||||
vadd.i16 d16, d16, d17
|
||||
vadd.i16 d17, d18, d19
|
||||
pld [r1]
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
vld1.32 {d20[0]}, [lr,:32], r2
|
||||
vld1.32 {d20[1]}, [lr,:32], r2
|
||||
vrhadd.u8 d16, d16, d20
|
||||
.endif
|
||||
pld [r1]
|
||||
vst1.32 {d16[0]}, [r0,:32], r2
|
||||
vst1.32 {d16[1]}, [r0,:32], r2
|
||||
bgt 4b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
|
||||
5: vld1.32 {d4[0]}, [r1], r2
|
||||
vld1.32 {d4[1]}, [r1], r2
|
||||
vmull.u8 q8, d4, d0
|
||||
subs r3, r3, #2
|
||||
pld [r1]
|
||||
.ifc \codec,h264
|
||||
vrshrn.u16 d16, q8, #6
|
||||
.else
|
||||
vadd.u16 q8, q8, q11
|
||||
vshrn.u16 d16, q8, #6
|
||||
.endif
|
||||
.ifc \type,avg
|
||||
vld1.32 {d20[0]}, [lr,:32], r2
|
||||
vld1.32 {d20[1]}, [lr,:32], r2
|
||||
vrhadd.u8 d16, d16, d20
|
||||
.endif
|
||||
pld [r1]
|
||||
vst1.32 {d16[0]}, [r0,:32], r2
|
||||
vst1.32 {d16[1]}, [r0,:32], r2
|
||||
bgt 5b
|
||||
|
||||
pop {r4-r7, pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.macro h264_chroma_mc2 type
|
||||
function ff_\type\()_h264_chroma_mc2_neon, export=1
|
||||
push {r4-r6, lr}
|
||||
ldr r4, [sp, #16]
|
||||
ldr lr, [sp, #20]
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
orrs r5, r4, lr
|
||||
beq 2f
|
||||
|
||||
mul r5, r4, lr
|
||||
rsb r6, r5, lr, lsl #3
|
||||
rsb r12, r5, r4, lsl #3
|
||||
sub r4, r5, r4, lsl #3
|
||||
sub r4, r4, lr, lsl #3
|
||||
add r4, r4, #64
|
||||
vdup.8 d0, r4
|
||||
vdup.8 d2, r12
|
||||
vdup.8 d1, r6
|
||||
vdup.8 d3, r5
|
||||
vtrn.16 q0, q1
|
||||
1:
|
||||
vld1.32 {d4[0]}, [r1], r2
|
||||
vld1.32 {d4[1]}, [r1], r2
|
||||
vrev64.32 d5, d4
|
||||
vld1.32 {d5[1]}, [r1]
|
||||
vext.8 q3, q2, q2, #1
|
||||
vtrn.16 q2, q3
|
||||
vmull.u8 q8, d4, d0
|
||||
vmlal.u8 q8, d5, d1
|
||||
.ifc \type,avg
|
||||
vld1.16 {d18[0]}, [r0,:16], r2
|
||||
vld1.16 {d18[1]}, [r0,:16]
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vtrn.32 d16, d17
|
||||
vadd.i16 d16, d16, d17
|
||||
vrshrn.u16 d16, q8, #6
|
||||
.ifc \type,avg
|
||||
vrhadd.u8 d16, d16, d18
|
||||
.endif
|
||||
vst1.16 {d16[0]}, [r0,:16], r2
|
||||
vst1.16 {d16[1]}, [r0,:16], r2
|
||||
subs r3, r3, #2
|
||||
bgt 1b
|
||||
pop {r4-r6, pc}
|
||||
2:
|
||||
.ifc \type,put
|
||||
ldrh_post r5, r1, r2
|
||||
strh_post r5, r0, r2
|
||||
ldrh_post r6, r1, r2
|
||||
strh_post r6, r0, r2
|
||||
.else
|
||||
vld1.16 {d16[0]}, [r1], r2
|
||||
vld1.16 {d16[1]}, [r1], r2
|
||||
vld1.16 {d18[0]}, [r0,:16], r2
|
||||
vld1.16 {d18[1]}, [r0,:16]
|
||||
sub r0, r0, r2
|
||||
vrhadd.u8 d16, d16, d18
|
||||
vst1.16 {d16[0]}, [r0,:16], r2
|
||||
vst1.16 {d16[1]}, [r0,:16], r2
|
||||
.endif
|
||||
subs r3, r3, #2
|
||||
bgt 2b
|
||||
pop {r4-r6, pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_chroma_mc8 put
|
||||
h264_chroma_mc8 avg
|
||||
h264_chroma_mc4 put
|
||||
h264_chroma_mc4 avg
|
||||
h264_chroma_mc2 put
|
||||
h264_chroma_mc2 avg
|
||||
|
||||
#if CONFIG_RV40_DECODER
|
||||
const rv40bias
|
||||
.short 0, 16, 32, 16
|
||||
.short 32, 28, 32, 28
|
||||
.short 0, 32, 16, 32
|
||||
.short 32, 28, 32, 28
|
||||
endconst
|
||||
|
||||
h264_chroma_mc8 put, rv40
|
||||
h264_chroma_mc8 avg, rv40
|
||||
h264_chroma_mc4 put, rv40
|
||||
h264_chroma_mc4 avg, rv40
|
||||
#endif
|
||||
|
||||
#if CONFIG_VC1DSP
|
||||
h264_chroma_mc8 put, vc1
|
||||
h264_chroma_mc8 avg, vc1
|
||||
h264_chroma_mc4 put, vc1
|
||||
h264_chroma_mc4 avg, vc1
|
||||
#endif
|
||||
120
externals/ffmpeg/libavcodec/arm/h264dsp_init_arm.c
vendored
Executable file
120
externals/ffmpeg/libavcodec/arm/h264dsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/h264dsp.h"
|
||||
#include "libavcodec/arm/startcode.h"
|
||||
|
||||
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
|
||||
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
|
||||
int log2_den, int weight, int offset);
|
||||
void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
|
||||
int log2_den, int weight, int offset);
|
||||
void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
|
||||
int log2_den, int weight, int offset);
|
||||
|
||||
void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int height, int log2_den, int weightd,
|
||||
int weights, int offset);
|
||||
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int height, int log2_den, int weightd,
|
||||
int weights, int offset);
|
||||
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int height, int log2_den, int weightd,
|
||||
int weights, int offset);
|
||||
|
||||
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
|
||||
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
|
||||
static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc)
|
||||
{
|
||||
#if HAVE_NEON
|
||||
if (bit_depth == 8) {
|
||||
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
|
||||
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
|
||||
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
|
||||
|
||||
if (chroma_format_idc <= 1)
|
||||
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
|
||||
else
|
||||
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon;
|
||||
|
||||
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
|
||||
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
|
||||
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
|
||||
|
||||
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
|
||||
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
|
||||
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
|
||||
|
||||
c->h264_idct_add = ff_h264_idct_add_neon;
|
||||
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
|
||||
c->h264_idct_add16 = ff_h264_idct_add16_neon;
|
||||
c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
|
||||
if (chroma_format_idc <= 1)
|
||||
c->h264_idct_add8 = ff_h264_idct_add8_neon;
|
||||
c->h264_idct8_add = ff_h264_idct8_add_neon;
|
||||
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
|
||||
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
|
||||
}
|
||||
#endif // HAVE_NEON
|
||||
}
|
||||
|
||||
av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
#if HAVE_ARMV6
|
||||
if (have_setend(cpu_flags))
|
||||
c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
|
||||
#endif
|
||||
if (have_neon(cpu_flags))
|
||||
h264dsp_init_neon(c, bit_depth, chroma_format_idc);
|
||||
}
|
||||
560
externals/ffmpeg/libavcodec/arm/h264dsp_neon.S
vendored
Executable file
560
externals/ffmpeg/libavcodec/arm/h264dsp_neon.S
vendored
Executable file
@@ -0,0 +1,560 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
/* H.264 loop filter */
|
||||
|
||||
.macro h264_loop_filter_start
|
||||
ldr r12, [sp]
|
||||
tst r2, r2
|
||||
ldr r12, [r12]
|
||||
it ne
|
||||
tstne r3, r3
|
||||
vmov.32 d24[0], r12
|
||||
and r12, r12, r12, lsl #16
|
||||
it eq
|
||||
bxeq lr
|
||||
ands r12, r12, r12, lsl #8
|
||||
it lt
|
||||
bxlt lr
|
||||
.endm
|
||||
|
||||
.macro h264_loop_filter_luma
|
||||
vdup.8 q11, r2 @ alpha
|
||||
vmovl.u8 q12, d24
|
||||
vabd.u8 q6, q8, q0 @ abs(p0 - q0)
|
||||
vmovl.u16 q12, d24
|
||||
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
|
||||
vsli.16 q12, q12, #8
|
||||
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
|
||||
vsli.32 q12, q12, #16
|
||||
vclt.u8 q6, q6, q11 @ < alpha
|
||||
vdup.8 q11, r3 @ beta
|
||||
vclt.s8 q7, q12, #0
|
||||
vclt.u8 q14, q14, q11 @ < beta
|
||||
vclt.u8 q15, q15, q11 @ < beta
|
||||
vbic q6, q6, q7
|
||||
vabd.u8 q4, q10, q8 @ abs(p2 - p0)
|
||||
vand q6, q6, q14
|
||||
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
|
||||
vclt.u8 q4, q4, q11 @ < beta
|
||||
vand q6, q6, q15
|
||||
vclt.u8 q5, q5, q11 @ < beta
|
||||
vand q4, q4, q6
|
||||
vand q5, q5, q6
|
||||
vand q12, q12, q6
|
||||
vrhadd.u8 q14, q8, q0
|
||||
vsub.i8 q6, q12, q4
|
||||
vqadd.u8 q7, q9, q12
|
||||
vhadd.u8 q10, q10, q14
|
||||
vsub.i8 q6, q6, q5
|
||||
vhadd.u8 q14, q2, q14
|
||||
vmin.u8 q7, q7, q10
|
||||
vqsub.u8 q11, q9, q12
|
||||
vqadd.u8 q2, q1, q12
|
||||
vmax.u8 q7, q7, q11
|
||||
vqsub.u8 q11, q1, q12
|
||||
vmin.u8 q14, q2, q14
|
||||
vmovl.u8 q2, d0
|
||||
vmax.u8 q14, q14, q11
|
||||
vmovl.u8 q10, d1
|
||||
vsubw.u8 q2, q2, d16
|
||||
vsubw.u8 q10, q10, d17
|
||||
vshl.i16 q2, q2, #2
|
||||
vshl.i16 q10, q10, #2
|
||||
vaddw.u8 q2, q2, d18
|
||||
vaddw.u8 q10, q10, d19
|
||||
vsubw.u8 q2, q2, d2
|
||||
vsubw.u8 q10, q10, d3
|
||||
vrshrn.i16 d4, q2, #3
|
||||
vrshrn.i16 d5, q10, #3
|
||||
vbsl q4, q7, q9
|
||||
vbsl q5, q14, q1
|
||||
vneg.s8 q7, q6
|
||||
vmovl.u8 q14, d16
|
||||
vmin.s8 q2, q2, q6
|
||||
vmovl.u8 q6, d17
|
||||
vmax.s8 q2, q2, q7
|
||||
vmovl.u8 q11, d0
|
||||
vmovl.u8 q12, d1
|
||||
vaddw.s8 q14, q14, d4
|
||||
vaddw.s8 q6, q6, d5
|
||||
vsubw.s8 q11, q11, d4
|
||||
vsubw.s8 q12, q12, d5
|
||||
vqmovun.s16 d16, q14
|
||||
vqmovun.s16 d17, q6
|
||||
vqmovun.s16 d0, q11
|
||||
vqmovun.s16 d1, q12
|
||||
.endm
|
||||
|
||||
function ff_h264_v_loop_filter_luma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
|
||||
vld1.8 {d0, d1}, [r0,:128], r1
|
||||
vld1.8 {d2, d3}, [r0,:128], r1
|
||||
vld1.8 {d4, d5}, [r0,:128], r1
|
||||
sub r0, r0, r1, lsl #2
|
||||
sub r0, r0, r1, lsl #1
|
||||
vld1.8 {d20,d21}, [r0,:128], r1
|
||||
vld1.8 {d18,d19}, [r0,:128], r1
|
||||
vld1.8 {d16,d17}, [r0,:128], r1
|
||||
|
||||
vpush {d8-d15}
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
sub r0, r0, r1, lsl #1
|
||||
vst1.8 {d8, d9}, [r0,:128], r1
|
||||
vst1.8 {d16,d17}, [r0,:128], r1
|
||||
vst1.8 {d0, d1}, [r0,:128], r1
|
||||
vst1.8 {d10,d11}, [r0,:128]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_h264_h_loop_filter_luma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
|
||||
sub r0, r0, #4
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d20}, [r0], r1
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d26}, [r0], r1
|
||||
vld1.8 {d7}, [r0], r1
|
||||
vld1.8 {d21}, [r0], r1
|
||||
vld1.8 {d19}, [r0], r1
|
||||
vld1.8 {d17}, [r0], r1
|
||||
vld1.8 {d1}, [r0], r1
|
||||
vld1.8 {d3}, [r0], r1
|
||||
vld1.8 {d5}, [r0], r1
|
||||
vld1.8 {d27}, [r0], r1
|
||||
|
||||
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
|
||||
|
||||
vpush {d8-d15}
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
transpose_4x4 q4, q8, q0, q5
|
||||
|
||||
sub r0, r0, r1, lsl #4
|
||||
add r0, r0, #2
|
||||
vst1.32 {d8[0]}, [r0], r1
|
||||
vst1.32 {d16[0]}, [r0], r1
|
||||
vst1.32 {d0[0]}, [r0], r1
|
||||
vst1.32 {d10[0]}, [r0], r1
|
||||
vst1.32 {d8[1]}, [r0], r1
|
||||
vst1.32 {d16[1]}, [r0], r1
|
||||
vst1.32 {d0[1]}, [r0], r1
|
||||
vst1.32 {d10[1]}, [r0], r1
|
||||
vst1.32 {d9[0]}, [r0], r1
|
||||
vst1.32 {d17[0]}, [r0], r1
|
||||
vst1.32 {d1[0]}, [r0], r1
|
||||
vst1.32 {d11[0]}, [r0], r1
|
||||
vst1.32 {d9[1]}, [r0], r1
|
||||
vst1.32 {d17[1]}, [r0], r1
|
||||
vst1.32 {d1[1]}, [r0], r1
|
||||
vst1.32 {d11[1]}, [r0], r1
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
.macro h264_loop_filter_chroma
|
||||
vdup.8 d22, r2 @ alpha
|
||||
vmovl.u8 q12, d24
|
||||
vabd.u8 d26, d16, d0 @ abs(p0 - q0)
|
||||
vmovl.u8 q2, d0
|
||||
vabd.u8 d28, d18, d16 @ abs(p1 - p0)
|
||||
vsubw.u8 q2, q2, d16
|
||||
vsli.16 d24, d24, #8
|
||||
vshl.i16 q2, q2, #2
|
||||
vabd.u8 d30, d2, d0 @ abs(q1 - q0)
|
||||
vaddw.u8 q2, q2, d18
|
||||
vclt.u8 d26, d26, d22 @ < alpha
|
||||
vsubw.u8 q2, q2, d2
|
||||
vdup.8 d22, r3 @ beta
|
||||
vrshrn.i16 d4, q2, #3
|
||||
vclt.u8 d28, d28, d22 @ < beta
|
||||
vclt.u8 d30, d30, d22 @ < beta
|
||||
vmin.s8 d4, d4, d24
|
||||
vneg.s8 d25, d24
|
||||
vand d26, d26, d28
|
||||
vmax.s8 d4, d4, d25
|
||||
vand d26, d26, d30
|
||||
vmovl.u8 q11, d0
|
||||
vand d4, d4, d26
|
||||
vmovl.u8 q14, d16
|
||||
vaddw.s8 q14, q14, d4
|
||||
vsubw.s8 q11, q11, d4
|
||||
vqmovun.s16 d16, q14
|
||||
vqmovun.s16 d0, q11
|
||||
.endm
|
||||
|
||||
function ff_h264_v_loop_filter_chroma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
|
||||
sub r0, r0, r1, lsl #1
|
||||
vld1.8 {d18}, [r0,:64], r1
|
||||
vld1.8 {d16}, [r0,:64], r1
|
||||
vld1.8 {d0}, [r0,:64], r1
|
||||
vld1.8 {d2}, [r0,:64]
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
sub r0, r0, r1, lsl #1
|
||||
vst1.8 {d16}, [r0,:64], r1
|
||||
vst1.8 {d0}, [r0,:64], r1
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_h264_h_loop_filter_chroma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
|
||||
sub r0, r0, #2
|
||||
h_loop_filter_chroma420:
|
||||
vld1.32 {d18[0]}, [r0], r1
|
||||
vld1.32 {d16[0]}, [r0], r1
|
||||
vld1.32 {d0[0]}, [r0], r1
|
||||
vld1.32 {d2[0]}, [r0], r1
|
||||
vld1.32 {d18[1]}, [r0], r1
|
||||
vld1.32 {d16[1]}, [r0], r1
|
||||
vld1.32 {d0[1]}, [r0], r1
|
||||
vld1.32 {d2[1]}, [r0], r1
|
||||
|
||||
vtrn.16 d18, d0
|
||||
vtrn.16 d16, d2
|
||||
vtrn.8 d18, d16
|
||||
vtrn.8 d0, d2
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
vtrn.16 d18, d0
|
||||
vtrn.16 d16, d2
|
||||
vtrn.8 d18, d16
|
||||
vtrn.8 d0, d2
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
vst1.32 {d18[0]}, [r0], r1
|
||||
vst1.32 {d16[0]}, [r0], r1
|
||||
vst1.32 {d0[0]}, [r0], r1
|
||||
vst1.32 {d2[0]}, [r0], r1
|
||||
vst1.32 {d18[1]}, [r0], r1
|
||||
vst1.32 {d16[1]}, [r0], r1
|
||||
vst1.32 {d0[1]}, [r0], r1
|
||||
vst1.32 {d2[1]}, [r0], r1
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_h264_h_loop_filter_chroma422_neon, export=1
|
||||
h264_loop_filter_start
|
||||
push {r4, lr}
|
||||
add r4, r0, r1
|
||||
add r1, r1, r1
|
||||
sub r0, r0, #2
|
||||
|
||||
bl h_loop_filter_chroma420
|
||||
|
||||
ldr r12, [sp, #8]
|
||||
ldr r12, [r12]
|
||||
vmov.32 d24[0], r12
|
||||
sub r0, r4, #2
|
||||
|
||||
bl h_loop_filter_chroma420
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
@ Biweighted prediction
|
||||
|
||||
.macro biweight_16 macs, macd
|
||||
vdup.8 d0, r4
|
||||
vdup.8 d1, r5
|
||||
vmov q2, q8
|
||||
vmov q3, q8
|
||||
1: subs r3, r3, #2
|
||||
vld1.8 {d20-d21},[r0,:128], r2
|
||||
\macd q2, d0, d20
|
||||
pld [r0]
|
||||
\macd q3, d0, d21
|
||||
vld1.8 {d22-d23},[r1,:128], r2
|
||||
\macs q2, d1, d22
|
||||
pld [r1]
|
||||
\macs q3, d1, d23
|
||||
vmov q12, q8
|
||||
vld1.8 {d28-d29},[r0,:128], r2
|
||||
vmov q13, q8
|
||||
\macd q12, d0, d28
|
||||
pld [r0]
|
||||
\macd q13, d0, d29
|
||||
vld1.8 {d30-d31},[r1,:128], r2
|
||||
\macs q12, d1, d30
|
||||
pld [r1]
|
||||
\macs q13, d1, d31
|
||||
vshl.s16 q2, q2, q9
|
||||
vshl.s16 q3, q3, q9
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d5, q3
|
||||
vshl.s16 q12, q12, q9
|
||||
vshl.s16 q13, q13, q9
|
||||
vqmovun.s16 d24, q12
|
||||
vqmovun.s16 d25, q13
|
||||
vmov q3, q8
|
||||
vst1.8 {d4- d5}, [r6,:128], r2
|
||||
vmov q2, q8
|
||||
vst1.8 {d24-d25},[r6,:128], r2
|
||||
bne 1b
|
||||
pop {r4-r6, pc}
|
||||
.endm
|
||||
|
||||
.macro biweight_8 macs, macd
|
||||
vdup.8 d0, r4
|
||||
vdup.8 d1, r5
|
||||
vmov q1, q8
|
||||
vmov q10, q8
|
||||
1: subs r3, r3, #2
|
||||
vld1.8 {d4},[r0,:64], r2
|
||||
\macd q1, d0, d4
|
||||
pld [r0]
|
||||
vld1.8 {d5},[r1,:64], r2
|
||||
\macs q1, d1, d5
|
||||
pld [r1]
|
||||
vld1.8 {d6},[r0,:64], r2
|
||||
\macd q10, d0, d6
|
||||
pld [r0]
|
||||
vld1.8 {d7},[r1,:64], r2
|
||||
\macs q10, d1, d7
|
||||
pld [r1]
|
||||
vshl.s16 q1, q1, q9
|
||||
vqmovun.s16 d2, q1
|
||||
vshl.s16 q10, q10, q9
|
||||
vqmovun.s16 d4, q10
|
||||
vmov q10, q8
|
||||
vst1.8 {d2},[r6,:64], r2
|
||||
vmov q1, q8
|
||||
vst1.8 {d4},[r6,:64], r2
|
||||
bne 1b
|
||||
pop {r4-r6, pc}
|
||||
.endm
|
||||
|
||||
.macro biweight_4 macs, macd
|
||||
vdup.8 d0, r4
|
||||
vdup.8 d1, r5
|
||||
vmov q1, q8
|
||||
vmov q10, q8
|
||||
1: subs r3, r3, #4
|
||||
vld1.32 {d4[0]},[r0,:32], r2
|
||||
vld1.32 {d4[1]},[r0,:32], r2
|
||||
\macd q1, d0, d4
|
||||
pld [r0]
|
||||
vld1.32 {d5[0]},[r1,:32], r2
|
||||
vld1.32 {d5[1]},[r1,:32], r2
|
||||
\macs q1, d1, d5
|
||||
pld [r1]
|
||||
blt 2f
|
||||
vld1.32 {d6[0]},[r0,:32], r2
|
||||
vld1.32 {d6[1]},[r0,:32], r2
|
||||
\macd q10, d0, d6
|
||||
pld [r0]
|
||||
vld1.32 {d7[0]},[r1,:32], r2
|
||||
vld1.32 {d7[1]},[r1,:32], r2
|
||||
\macs q10, d1, d7
|
||||
pld [r1]
|
||||
vshl.s16 q1, q1, q9
|
||||
vqmovun.s16 d2, q1
|
||||
vshl.s16 q10, q10, q9
|
||||
vqmovun.s16 d4, q10
|
||||
vmov q10, q8
|
||||
vst1.32 {d2[0]},[r6,:32], r2
|
||||
vst1.32 {d2[1]},[r6,:32], r2
|
||||
vmov q1, q8
|
||||
vst1.32 {d4[0]},[r6,:32], r2
|
||||
vst1.32 {d4[1]},[r6,:32], r2
|
||||
bne 1b
|
||||
pop {r4-r6, pc}
|
||||
2: vshl.s16 q1, q1, q9
|
||||
vqmovun.s16 d2, q1
|
||||
vst1.32 {d2[0]},[r6,:32], r2
|
||||
vst1.32 {d2[1]},[r6,:32], r2
|
||||
pop {r4-r6, pc}
|
||||
.endm
|
||||
|
||||
.macro biweight_func w
|
||||
function ff_biweight_h264_pixels_\w\()_neon, export=1
|
||||
push {r4-r6, lr}
|
||||
ldr r12, [sp, #16]
|
||||
add r4, sp, #20
|
||||
ldm r4, {r4-r6}
|
||||
lsr lr, r4, #31
|
||||
add r6, r6, #1
|
||||
eors lr, lr, r5, lsr #30
|
||||
orr r6, r6, #1
|
||||
vdup.16 q9, r12
|
||||
lsl r6, r6, r12
|
||||
vmvn q9, q9
|
||||
vdup.16 q8, r6
|
||||
mov r6, r0
|
||||
beq 10f
|
||||
subs lr, lr, #1
|
||||
beq 20f
|
||||
subs lr, lr, #1
|
||||
beq 30f
|
||||
b 40f
|
||||
10: biweight_\w vmlal.u8, vmlal.u8
|
||||
20: rsb r4, r4, #0
|
||||
biweight_\w vmlal.u8, vmlsl.u8
|
||||
30: rsb r4, r4, #0
|
||||
rsb r5, r5, #0
|
||||
biweight_\w vmlsl.u8, vmlsl.u8
|
||||
40: rsb r5, r5, #0
|
||||
biweight_\w vmlsl.u8, vmlal.u8
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
biweight_func 16
|
||||
biweight_func 8
|
||||
biweight_func 4
|
||||
|
||||
@ Weighted prediction
|
||||
|
||||
.macro weight_16 add
|
||||
vdup.8 d0, r12
|
||||
1: subs r2, r2, #2
|
||||
vld1.8 {d20-d21},[r0,:128], r1
|
||||
vmull.u8 q2, d0, d20
|
||||
pld [r0]
|
||||
vmull.u8 q3, d0, d21
|
||||
vld1.8 {d28-d29},[r0,:128], r1
|
||||
vmull.u8 q12, d0, d28
|
||||
pld [r0]
|
||||
vmull.u8 q13, d0, d29
|
||||
\add q2, q8, q2
|
||||
vrshl.s16 q2, q2, q9
|
||||
\add q3, q8, q3
|
||||
vrshl.s16 q3, q3, q9
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d5, q3
|
||||
\add q12, q8, q12
|
||||
vrshl.s16 q12, q12, q9
|
||||
\add q13, q8, q13
|
||||
vrshl.s16 q13, q13, q9
|
||||
vqmovun.s16 d24, q12
|
||||
vqmovun.s16 d25, q13
|
||||
vst1.8 {d4- d5}, [r4,:128], r1
|
||||
vst1.8 {d24-d25},[r4,:128], r1
|
||||
bne 1b
|
||||
pop {r4, pc}
|
||||
.endm
|
||||
|
||||
.macro weight_8 add
|
||||
vdup.8 d0, r12
|
||||
1: subs r2, r2, #2
|
||||
vld1.8 {d4},[r0,:64], r1
|
||||
vmull.u8 q1, d0, d4
|
||||
pld [r0]
|
||||
vld1.8 {d6},[r0,:64], r1
|
||||
vmull.u8 q10, d0, d6
|
||||
\add q1, q8, q1
|
||||
pld [r0]
|
||||
vrshl.s16 q1, q1, q9
|
||||
vqmovun.s16 d2, q1
|
||||
\add q10, q8, q10
|
||||
vrshl.s16 q10, q10, q9
|
||||
vqmovun.s16 d4, q10
|
||||
vst1.8 {d2},[r4,:64], r1
|
||||
vst1.8 {d4},[r4,:64], r1
|
||||
bne 1b
|
||||
pop {r4, pc}
|
||||
.endm
|
||||
|
||||
.macro weight_4 add
|
||||
vdup.8 d0, r12
|
||||
vmov q1, q8
|
||||
vmov q10, q8
|
||||
1: subs r2, r2, #4
|
||||
vld1.32 {d4[0]},[r0,:32], r1
|
||||
vld1.32 {d4[1]},[r0,:32], r1
|
||||
vmull.u8 q1, d0, d4
|
||||
pld [r0]
|
||||
blt 2f
|
||||
vld1.32 {d6[0]},[r0,:32], r1
|
||||
vld1.32 {d6[1]},[r0,:32], r1
|
||||
vmull.u8 q10, d0, d6
|
||||
pld [r0]
|
||||
\add q1, q8, q1
|
||||
vrshl.s16 q1, q1, q9
|
||||
vqmovun.s16 d2, q1
|
||||
\add q10, q8, q10
|
||||
vrshl.s16 q10, q10, q9
|
||||
vqmovun.s16 d4, q10
|
||||
vmov q10, q8
|
||||
vst1.32 {d2[0]},[r4,:32], r1
|
||||
vst1.32 {d2[1]},[r4,:32], r1
|
||||
vmov q1, q8
|
||||
vst1.32 {d4[0]},[r4,:32], r1
|
||||
vst1.32 {d4[1]},[r4,:32], r1
|
||||
bne 1b
|
||||
pop {r4, pc}
|
||||
2: \add q1, q8, q1
|
||||
vrshl.s16 q1, q1, q9
|
||||
vqmovun.s16 d2, q1
|
||||
vst1.32 {d2[0]},[r4,:32], r1
|
||||
vst1.32 {d2[1]},[r4,:32], r1
|
||||
pop {r4, pc}
|
||||
.endm
|
||||
|
||||
.macro weight_func w
|
||||
function ff_weight_h264_pixels_\w\()_neon, export=1
|
||||
push {r4, lr}
|
||||
ldr r12, [sp, #8]
|
||||
ldr r4, [sp, #12]
|
||||
cmp r3, #1
|
||||
lsl r4, r4, r3
|
||||
vdup.16 q8, r4
|
||||
mov r4, r0
|
||||
ble 20f
|
||||
rsb lr, r3, #1
|
||||
vdup.16 q9, lr
|
||||
cmp r12, #0
|
||||
blt 10f
|
||||
weight_\w vhadd.s16
|
||||
10: rsb r12, r12, #0
|
||||
weight_\w vhsub.s16
|
||||
20: rsb lr, r3, #0
|
||||
vdup.16 q9, lr
|
||||
cmp r12, #0
|
||||
blt 10f
|
||||
weight_\w vadd.s16
|
||||
10: rsb r12, r12, #0
|
||||
weight_\w vsub.s16
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
weight_func 16
|
||||
weight_func 8
|
||||
weight_func 4
|
||||
417
externals/ffmpeg/libavcodec/arm/h264idct_neon.S
vendored
Executable file
417
externals/ffmpeg/libavcodec/arm/h264idct_neon.S
vendored
Executable file
@@ -0,0 +1,417 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_h264_idct_add_neon, export=1
|
||||
h264_idct_add_neon_nothumb:
|
||||
vld1.64 {d0-d3}, [r1,:128]
|
||||
vmov.i16 q15, #0
|
||||
|
||||
vswp d1, d2
|
||||
vst1.16 {q15}, [r1,:128]!
|
||||
vadd.i16 d4, d0, d1
|
||||
vst1.16 {q15}, [r1,:128]!
|
||||
vshr.s16 q8, q1, #1
|
||||
vsub.i16 d5, d0, d1
|
||||
vadd.i16 d6, d2, d17
|
||||
vsub.i16 d7, d16, d3
|
||||
vadd.i16 q0, q2, q3
|
||||
vsub.i16 q1, q2, q3
|
||||
|
||||
vtrn.16 d0, d1
|
||||
vtrn.16 d3, d2
|
||||
vtrn.32 d0, d3
|
||||
vtrn.32 d1, d2
|
||||
|
||||
vadd.i16 d4, d0, d3
|
||||
vld1.32 {d18[0]}, [r0,:32], r2
|
||||
vswp d1, d3
|
||||
vshr.s16 q8, q1, #1
|
||||
vld1.32 {d19[1]}, [r0,:32], r2
|
||||
vsub.i16 d5, d0, d1
|
||||
vld1.32 {d18[1]}, [r0,:32], r2
|
||||
vadd.i16 d6, d16, d3
|
||||
vld1.32 {d19[0]}, [r0,:32], r2
|
||||
vsub.i16 d7, d2, d17
|
||||
sub r0, r0, r2, lsl #2
|
||||
vadd.i16 q0, q2, q3
|
||||
vsub.i16 q1, q2, q3
|
||||
|
||||
vrshr.s16 q0, q0, #6
|
||||
vrshr.s16 q1, q1, #6
|
||||
|
||||
vaddw.u8 q0, q0, d18
|
||||
vaddw.u8 q1, q1, d19
|
||||
|
||||
vqmovun.s16 d0, q0
|
||||
vqmovun.s16 d1, q1
|
||||
|
||||
vst1.32 {d0[0]}, [r0,:32], r2
|
||||
vst1.32 {d1[1]}, [r0,:32], r2
|
||||
vst1.32 {d0[1]}, [r0,:32], r2
|
||||
vst1.32 {d1[0]}, [r0,:32], r2
|
||||
|
||||
sub r1, r1, #32
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_dc_add_neon, export=1
|
||||
h264_idct_dc_add_neon_nothumb:
|
||||
mov r3, #0
|
||||
vld1.16 {d2[],d3[]}, [r1,:16]
|
||||
strh r3, [r1]
|
||||
vrshr.s16 q1, q1, #6
|
||||
vld1.32 {d0[0]}, [r0,:32], r2
|
||||
vld1.32 {d0[1]}, [r0,:32], r2
|
||||
vaddw.u8 q2, q1, d0
|
||||
vld1.32 {d1[0]}, [r0,:32], r2
|
||||
vld1.32 {d1[1]}, [r0,:32], r2
|
||||
vaddw.u8 q1, q1, d1
|
||||
vqmovun.s16 d0, q2
|
||||
vqmovun.s16 d1, q1
|
||||
sub r0, r0, r2, lsl #2
|
||||
vst1.32 {d0[0]}, [r0,:32], r2
|
||||
vst1.32 {d0[1]}, [r0,:32], r2
|
||||
vst1.32 {d1[0]}, [r0,:32], r2
|
||||
vst1.32 {d1[1]}, [r0,:32], r2
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_add16_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
mov r4, r0
|
||||
mov r5, r1
|
||||
mov r1, r2
|
||||
mov r2, r3
|
||||
ldr r6, [sp, #24]
|
||||
movrel r7, scan8
|
||||
mov ip, #16
|
||||
1: ldrb r8, [r7], #1
|
||||
ldr r0, [r5], #4
|
||||
ldrb r8, [r6, r8]
|
||||
subs r8, r8, #1
|
||||
blt 2f
|
||||
ldrsh lr, [r1]
|
||||
add r0, r0, r4
|
||||
it ne
|
||||
movne lr, #0
|
||||
cmp lr, #0
|
||||
ite ne
|
||||
adrne lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB
|
||||
adreq lr, h264_idct_add_neon_nothumb + CONFIG_THUMB
|
||||
blx lr
|
||||
2: subs ip, ip, #1
|
||||
add r1, r1, #32
|
||||
bne 1b
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_add16intra_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
mov r4, r0
|
||||
mov r5, r1
|
||||
mov r1, r2
|
||||
mov r2, r3
|
||||
ldr r6, [sp, #24]
|
||||
movrel r7, scan8
|
||||
mov ip, #16
|
||||
1: ldrb r8, [r7], #1
|
||||
ldr r0, [r5], #4
|
||||
ldrb r8, [r6, r8]
|
||||
add r0, r0, r4
|
||||
cmp r8, #0
|
||||
ldrsh r8, [r1]
|
||||
iteet ne
|
||||
adrne lr, h264_idct_add_neon_nothumb + CONFIG_THUMB
|
||||
adreq lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB
|
||||
cmpeq r8, #0
|
||||
blxne lr
|
||||
subs ip, ip, #1
|
||||
add r1, r1, #32
|
||||
bne 1b
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
|
||||
@ idct_add for the chroma 4x4 blocks (indices 16..19 and 20..23), i.e.
@ both chroma planes.  r0 points at a two-entry dest array {dst[0],dst[1]};
@ r1 = block_offset[], r2 = coefficients, r3 = stride, [sp,#32] = nnz cache.
@ Roles inferred from usage - confirm against the C prototype.
@ Dispatch per block matches add16intra: nnz != 0 -> full idct,
@ nnz == 0 && DC != 0 -> DC kernel, else skip.
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            @ r4 = dest[0], r9 = dest[1]
        add             r5,  r1,  #16*4         @ block_offset entries for chroma
        add             r1,  r2,  #16*32        @ coefficients start at block 16
        mov             r2,  r3
        mov             r10, r1                 @ keep coefficient base
        ldr             r6,  [sp, #32]
        movrel          r7,  scan8+16           @ chroma part of scan8
        mov             r12, #0
1:      ldrb            r8,  [r7, r12]
        ldr             r0,  [r5, r12, lsl #2]
        ldrb            r8,  [r6, r8]           @ nnz
        add             r0,  r0,  r4
        add             r1,  r10, r12, lsl #5   @ coefficient block for index r12
        cmp             r8,  #0
        ldrsh           r8,  [r1]               @ DC coeff
        iteet           ne
        adrne           lr,  h264_idct_add_neon_nothumb + CONFIG_THUMB
        adreq           lr,  h264_idct_dc_add_neon_nothumb + CONFIG_THUMB
        cmpeq           r8,  #0
        blxne           lr
        add             r12, r12, #1
        cmp             r12, #4
        itt             eq                      @ after 4 blocks switch to 2nd plane
        moveq           r12, #16
        moveq           r4,  r9
        cmp             r12, #20
        blt             1b
        pop             {r4-r10,pc}
endfunc
|
||||
|
||||
@ One pass (columns, then rows) of the 8x8 H.264 inverse transform over the
@ coefficients held in q8-q15.  pass == 0 runs on freshly loaded columns and
@ interleaves the load/clear of the second coefficient half; pass == 1 first
@ transposes the 8x8 to operate on rows.  qa/qb alias different physical
@ registers per pass so the same butterfly code body can be shared.
@ NOTE(review): arithmetic follows the 8x8 transform of H.264 (8.5.10) -
@ shifts by 1 and 2 are the a/b half-sample weightings; not re-derived here.
.macro  idct8x8_cols pass
  .if \pass == 0
        qa      .req    q2
        qb      .req    q14
        vshr.s16        q2,  q10, #1
        vadd.i16        q0,  q8,  q12
        vld1.16         {q14-q15},[r1,:128]     @ load 2nd half of coefficients
        vst1.16         {q3},  [r1,:128]!       @ and clear it behind us (q3 == 0)
        vst1.16         {q3},  [r1,:128]!
        vsub.i16        q1,  q8,  q12
        vshr.s16        q3,  q14, #1
        vsub.i16        q2,  q2,  q14
        vadd.i16        q3,  q3,  q10
  .else
        qa      .req    q14
        qb      .req    q2
        @ 8x8 transpose of q8-q15 (16-bit elements) before the row pass
        vtrn.32         q8,  q10
        vtrn.16         q12, q13
        vtrn.32         q9,  q11
        vtrn.32         q12, q2
        vtrn.32         q13, q15
        vswp            d21, d4
        vshr.s16        q14, q10, #1
        vswp            d17, d24
        vshr.s16        q3,  q2,  #1
        vswp            d19, d26
        vadd.i16        q0,  q8,  q12
        vswp            d23, d30
        vsub.i16        q1,  q8,  q12
        vsub.i16        q14, q14, q2
        vadd.i16        q3,  q3,  q10
  .endif
        @ even-part butterflies
        vadd.i16        q10, q1,  qa
        vsub.i16        q12, q1,  qa
        vadd.i16        q8,  q0,  q3
        vsub.i16        qb,  q0,  q3
        @ odd-part combinations of q9/q11/q13/q15
        vsub.i16        q0,  q13, q11
        vadd.i16        q1,  q15, q9
        vsub.i16        qa,  q15, q9
        vadd.i16        q3,  q13, q11
        vsub.i16        q0,  q0,  q15
        vsub.i16        q1,  q1,  q11
        vadd.i16        qa,  qa,  q13
        vadd.i16        q3,  q3,  q9
        vshr.s16        q9,  q9,  #1
        vshr.s16        q11, q11, #1
        vshr.s16        q13, q13, #1
        vshr.s16        q15, q15, #1
        vsub.i16        q0,  q0,  q15
        vsub.i16        q1,  q1,  q11
        vadd.i16        qa,  qa,  q13
        vadd.i16        q3,  q3,  q9
        vshr.s16        q9,  q0,  #2
        vshr.s16        q11, q1,  #2
        vshr.s16        q13, qa,  #2
        vshr.s16        q15, q3,  #2
        vsub.i16        q3,  q3,  q9
        vsub.i16        qa,  q11, qa
        vadd.i16        q1,  q1,  q13
        vadd.i16        q0,  q0,  q15
  .if \pass == 0
        @ final combine; vtrn.16 starts the transpose needed by pass 1
        vsub.i16        q15, q8,  q3
        vadd.i16        q8,  q8,  q3
        vadd.i16        q9,  q10, q2
        vsub.i16        q2,  q10, q2
        vtrn.16         q8,  q9
        vadd.i16        q10, q12, q1
        vtrn.16         q2,  q15
        vadd.i16        q11, q14, q0
        vsub.i16        q13, q12, q1
        vtrn.16         q10, q11
        vsub.i16        q12, q14, q0
  .else
        vsub.i16        q15, q8,  q3
        vadd.i16        q8,  q8,  q3
        vadd.i16        q9,  q10, q14
        vsub.i16        q14, q10, q14
        vadd.i16        q10, q12, q1
        vsub.i16        q13, q12, q1
        vadd.i16        q11, q2,  q0
        vsub.i16        q12, q2,  q0
  .endif
        .unreq          qa
        .unreq          qb
.endm
|
||||
|
||||
@ Full 8x8 IDCT plus add-to-destination.
@ r0 = dst, r1 = 64 int16 coefficients (cleared on the way), r2 = stride.
@ Roles inferred from usage - confirm against the C prototype.
function ff_h264_idct8_add_neon, export=1
h264_idct8_add_neon_nothumb:
        vmov.i16        q3,  #0                 @ q3 = 0, used to wipe coefficients
        vld1.16         {q8-q9},  [r1,:128]
        vst1.16         {q3},  [r1,:128]!
        vst1.16         {q3},  [r1,:128]!
        vld1.16         {q10-q11},[r1,:128]
        vst1.16         {q3},  [r1,:128]!
        vst1.16         {q3},  [r1,:128]!
        vld1.16         {q12-q13},[r1,:128]
        vst1.16         {q3},  [r1,:128]!
        vst1.16         {q3},  [r1,:128]!

        idct8x8_cols    0                       @ column pass (loads q14-q15 itself)
        idct8x8_cols    1                       @ row pass (transposes first)

        @ Round (>>6), add to the 8 destination rows and saturate back to u8.
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #6
        vld1.8          {d0},  [r0,:64], r2
        vrshr.s16       q9,  q9,  #6
        vld1.8          {d1},  [r0,:64], r2
        vrshr.s16       q10, q10, #6
        vld1.8          {d2},  [r0,:64], r2
        vrshr.s16       q11, q11, #6
        vld1.8          {d3},  [r0,:64], r2
        vrshr.s16       q12, q12, #6
        vld1.8          {d4},  [r0,:64], r2
        vrshr.s16       q13, q13, #6
        vld1.8          {d5},  [r0,:64], r2
        vrshr.s16       q14, q14, #6
        vld1.8          {d6},  [r0,:64], r2
        vrshr.s16       q15, q15, #6
        vld1.8          {d7},  [r0,:64], r2
        vaddw.u8        q8,  q8,  d0
        vaddw.u8        q9,  q9,  d1
        vaddw.u8        q10, q10, d2
        vqmovun.s16     d0,  q8
        vaddw.u8        q11, q11, d3
        vqmovun.s16     d1,  q9
        vaddw.u8        q12, q12, d4
        vqmovun.s16     d2,  q10
        vst1.8          {d0},  [r3,:64], r2     @ stores interleaved with math
        vaddw.u8        q13, q13, d5
        vqmovun.s16     d3,  q11
        vst1.8          {d1},  [r3,:64], r2
        vaddw.u8        q14, q14, d6
        vqmovun.s16     d4,  q12
        vst1.8          {d2},  [r3,:64], r2
        vaddw.u8        q15, q15, d7
        vqmovun.s16     d5,  q13
        vst1.8          {d3},  [r3,:64], r2
        vqmovun.s16     d6,  q14
        vqmovun.s16     d7,  q15
        vst1.8          {d4},  [r3,:64], r2
        vst1.8          {d5},  [r3,:64], r2
        vst1.8          {d6},  [r3,:64], r2
        vst1.8          {d7},  [r3,:64], r2

        sub             r1,  r1,  #128          @ rewind coefficient pointer
        bx              lr
endfunc
|
||||
|
||||
@ DC-only 8x8 IDCT add: splat rounded block[0] and add it to 8 rows of 8
@ pixels.  r0 = dst, r1 = coefficients (block[0] cleared), r2 = stride.
@ Roles inferred from usage - confirm against the C prototype.
function ff_h264_idct8_dc_add_neon, export=1
h264_idct8_dc_add_neon_nothumb:
        mov             r3,  #0
        vld1.16         {d30[],d31[]},[r1,:16]  @ splat DC into q15
        strh            r3,  [r1]               @ clear the coefficient
        vld1.32         {d0},  [r0,:64], r2
        vrshr.s16       q15, q15, #6            @ rounded >>6
        vld1.32         {d1},  [r0,:64], r2
        vld1.32         {d2},  [r0,:64], r2
        vaddw.u8        q8,  q15, d0
        vld1.32         {d3},  [r0,:64], r2
        vaddw.u8        q9,  q15, d1
        vld1.32         {d4},  [r0,:64], r2
        vaddw.u8        q10, q15, d2
        vld1.32         {d5},  [r0,:64], r2
        vaddw.u8        q11, q15, d3
        vld1.32         {d6},  [r0,:64], r2
        vaddw.u8        q12, q15, d4
        vld1.32         {d7},  [r0,:64], r2
        vaddw.u8        q13, q15, d5
        vaddw.u8        q14, q15, d6
        vaddw.u8        q15, q15, d7
        vqmovun.s16     d0,  q8                 @ saturate all rows back to u8
        vqmovun.s16     d1,  q9
        vqmovun.s16     d2,  q10
        vqmovun.s16     d3,  q11
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst by 8 rows
        vst1.32         {d0},  [r0,:64], r2
        vqmovun.s16     d4,  q12
        vst1.32         {d1},  [r0,:64], r2
        vqmovun.s16     d5,  q13
        vst1.32         {d2},  [r0,:64], r2
        vqmovun.s16     d6,  q14
        vst1.32         {d3},  [r0,:64], r2
        vqmovun.s16     d7,  q15
        vst1.32         {d4},  [r0,:64], r2
        vst1.32         {d5},  [r0,:64], r2
        vst1.32         {d6},  [r0,:64], r2
        vst1.32         {d7},  [r0,:64], r2
        bx              lr
endfunc
|
||||
|
||||
@ Apply the 8x8 idct_add to the four 8x8 luma blocks of a macroblock.
@ Same dispatch as add16 but stepping scan8/block_offset by 4 and the
@ coefficient pointer by 128 bytes (64 x int16) per block.
@ r0 = dst base, r1 = block_offset[], r2 = coefficients, r3 = stride,
@ [sp,#24] = nnz cache.  Roles inferred from usage - confirm.
function ff_h264_idct8_add4_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0
        mov             r5,  r1
        mov             r1,  r2
        mov             r2,  r3
        ldr             r6,  [sp, #24]
        movrel          r7,  scan8
        mov             r12, #16
1:      ldrb            r8,  [r7], #4           @ every 4th scan8 entry
        ldr             r0,  [r5], #16
        ldrb            r8,  [r6, r8]           @ nnz
        subs            r8,  r8,  #1
        blt             2f                      @ nnz == 0: skip
        ldrsh           lr,  [r1]               @ DC coefficient
        add             r0,  r0,  r4
        it              ne
        movne           lr,  #0                 @ nnz > 1 forces the full idct
        cmp             lr,  #0
        ite             ne
        adrne           lr,  h264_idct8_dc_add_neon_nothumb + CONFIG_THUMB
        adreq           lr,  h264_idct8_add_neon_nothumb  + CONFIG_THUMB
        blx             lr
2:      subs            r12, r12, #4
        add             r1,  r1,  #128
        bne             1b
        pop             {r4-r8,pc}
endfunc
|
||||
|
||||
@ Lookup table mapping sub-block index -> offset into the non-zero-count
@ cache, used by the idct_add16/add16intra/add8/add4 loops above.
@ NOTE(review): layout presumably mirrors libavcodec's scan8[] cache
@ geometry (8 entries per cache row) - confirm against h264dec.h.
const   scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst
|
||||
95
externals/ffmpeg/libavcodec/arm/h264pred_init_arm.c
vendored
Executable file
95
externals/ffmpeg/libavcodec/arm/h264pred_init_arm.c
vendored
Executable file
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/h264pred.h"
|
||||
|
||||
void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
/**
 * Install the NEON intra-prediction functions into @p h.
 *
 * Only the 8-bit code paths exist in NEON, so nothing is done for
 * bit_depth > 8.  Several codecs that reuse the H.264 predictors
 * (RV40, VP7, VP8) define a few modes differently, so those slots are
 * left on the C implementations for them.
 */
static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                        const int bit_depth,
                                        const int chroma_format_idc)
{
#if HAVE_NEON
    const int is_vpx = codec_id == AV_CODEC_ID_VP7 ||
                       codec_id == AV_CODEC_ID_VP8;

    if (bit_depth > 8)
        return;                 /* NEON predictors are 8-bit only */

    if (chroma_format_idc <= 1) {
        /* 8x8 chroma predictors (4:2:0 / 4:0:0 only) */
        h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
        h->pred8x8[HOR_PRED8x8  ] = ff_pred8x8_hor_neon;
        if (!is_vpx)
            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
        h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
        if (codec_id != AV_CODEC_ID_RV40 && !is_vpx) {
            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
        }
    }

    /* 16x16 luma predictors */
    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
        !is_vpx)
        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
#endif // HAVE_NEON
}
|
||||
|
||||
/**
 * Runtime entry point: probe CPU features and, when NEON is present,
 * delegate to h264_pred_init_neon() to install the optimised predictors.
 */
av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
                                   int bit_depth, const int chroma_format_idc)
{
    if (!have_neon(av_get_cpu_flags()))
        return;

    h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
}
|
||||
359
externals/ffmpeg/libavcodec/arm/h264pred_neon.S
vendored
Executable file
359
externals/ffmpeg/libavcodec/arm/h264pred_neon.S
vendored
Executable file
@@ -0,0 +1,359 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ Load a column of \n bytes (stride \rt) from [\rs] into successive lanes
@ of \rd.  With n == 4, hi selects whether the low (hi=0) or high (hi=1)
@ half of the d register is filled; \rs is advanced past the loaded rows.
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
.endm
|
||||
|
||||
@ Sum 16 bytes: widen-add \rl + \rh into \dq (= \dl:\dh), then fold with
@ pairwise adds so every u16 lane of \dl ends up holding the total.
.macro add16x8  dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh
        vadd.u16        \dl, \dl, \dh
        vpadd.u16       \dl, \dl, \dl
        vpadd.u16       \dl, \dl, \dl
.endm
|
||||
|
||||
@ void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC prediction with no neighbours available: fill 16x16 with 128.
@ Jumps into the shared store loop of ff_pred16x16_dc_neon.
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred16x16_dc_end
endfunc
|
||||
|
||||
@ void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC prediction from the 16 pixels above: average them (rounded >>4),
@ splat, and fall into the shared store loop.
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1            @ row above the block
        vld1.8          {q0},     [r2,:128]
        add16x8         q0,  d0,  d1,  d0,  d1  @ sum of the 16 bytes
        vrshrn.u16      d0,  q0,  #4            @ rounded average
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc
|
||||
|
||||
@ void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC prediction from the 16 pixels to the left (column at src-1).
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1            @ column left of the block
        ldcol.8         d0,  r2,  r1            @ first 8 left pixels
        ldcol.8         d1,  r2,  r1            @ next 8 (r2 was advanced)
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4            @ rounded average of 16
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc
|
||||
|
||||
@ void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Full DC prediction: average the 16 pixels above and the 16 to the left
@ (rounded >>5) and fill the 16x16 block.  The store loop at
@ .L_pred16x16_dc_end is shared by the 128/top/left DC variants above.
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1            @ row above
        vld1.8          {q0},     [r2,:128]
        sub             r2,  r0,  #1            @ column to the left
        ldcol.8         d2,  r2,  r1
        ldcol.8         d3,  r2,  r1
        vaddl.u8        q0,  d0,  d1            @ sum top
        vaddl.u8        q1,  d2,  d3            @ sum left
        vadd.u16        q0,  q0,  q1
        vadd.u16        d0,  d0,  d1            @ horizontal fold
        vpadd.u16       d0,  d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #5            @ rounded average of 32
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:                            @ shared tail: store q0 16 times
        mov             r3,  #8
6:      vst1.8          {q0},  [r0,:128], r1
        vst1.8          {q0},  [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc
|
||||
|
||||
@ void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride)
@ Horizontal prediction: each of the 16 rows is filled with the pixel
@ immediately to its left.
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #16
1:      vld1.8          {d0[],d1[]},[r2], r1    @ splat left pixel across q0
        vst1.8          {q0},  [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
@ void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride)
@ Vertical prediction: copy the row above the block into all 16 rows.
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1            @ point at the row above
        vld1.8          {q0},  [r0,:128], r1
        mov             r3,  #8
1:      vst1.8          {q0},  [r0,:128], r1
        vst1.8          {q0},  [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
@ void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride)
@ 16x16 plane (gradient) prediction.  Computes the H/V gradient terms from
@ the border pixels using the weights in p16weight, derives the per-row /
@ per-column increments, then emits each row as a clipped linear ramp.
@ NOTE(review): follows the H.264 Intra_16x16 plane formula (8.3.3.4);
@ the exact register scheduling is not re-derived here.
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1            @ row above
        add             r2,  r3,  #8
        sub             r3,  r3,  #1            @ top-left corner
        vld1.8          {d0},  [r3]             @ left half of top row (incl corner)
        vld1.8          {d2},  [r2,:64], r1     @ right half of top row
        ldcol.8         d1,  r3,  r1            @ upper half of left column
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1            @ lower half of left column
        vrev64.8        q0,  q0                 @ reverse for the +/- weighting
        vaddl.u8        q8,  d2,  d3            @ corner terms for the DC part
        vsubl.u8        q2,  d2,  d0            @ weighted H differences
        vsubl.u8        q3,  d3,  d1            @ weighted V differences
        movrel          r3,  p16weight
        vld1.8          {q0},  [r3,:128]        @ weights 1..8
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5            @ fold H sum
        vadd.i16        d5,  d6,  d7            @ fold V sum
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4
        vshll.s16       q3,  d4,  #2            @ b = (5*H + 32) >> 6 style scaling
        vaddw.s16       q2,  q3,  d4
        vrshrn.s32      d4,  q2,  #6
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #3
        vrev64.16       d16, d17
        vsub.i16        d3,  d3,  d2            @ 7*(b+c)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4            @ a = 16*(top-right + bottom-left + 1)
        vsub.i16        d2,  d2,  d3            @ starting value of the ramp
        vshl.i16        d3,  d4,  #4
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3
        vmul.i16        q0,  q0,  d4[0]         @ horizontal ramp 0..7 * b
        vdup.16         q1,  d2[0]              @ row base
        vdup.16         q2,  d4[0]              @ b
        vdup.16         q3,  d6[0]              @ per-row increment
        vshl.i16        q2,  q2,  #3            @ 8*b for the right half
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #16
1:
        vqshrun.s16     d0,  q1,  #5            @ clip1((x + 16) >> 5)
        vadd.i16        q1,  q1,  q2            @ advance to right half
        vqshrun.s16     d1,  q1,  #5
        vadd.i16        q1,  q1,  q3            @ advance to next row
        vst1.8          {q0},  [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
@ Weights 1..8 used by the plane predictors to compute the H/V gradients.
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst
|
||||
|
||||
@ void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride)
@ Horizontal prediction for an 8x8 block: replicate each left pixel
@ across its row.
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #8
1:      vld1.8          {d0[]},  [r2], r1       @ splat left pixel
        vst1.8          {d0},  [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
@ void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride)
@ Vertical prediction for an 8x8 block: copy the row above into all 8 rows.
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {d0},  [r0,:64], r1
        mov             r3,  #4
1:      vst1.8          {d0},  [r0,:64], r1
        vst1.8          {d0},  [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
@ void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride)
@ 8x8 plane (gradient) prediction; structure mirrors the 16x16 version
@ above, with weights 1..4 and >>4 scaling.
@ NOTE(review): follows the H.264 chroma plane formula (8.3.4.4);
@ register-level derivation not re-checked here.
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1            @ row above
        add             r2,  r3,  #4
        sub             r3,  r3,  #1            @ top-left corner
        vld1.32         {d0[0]}, [r3]           @ left half of top row
        vld1.32         {d2[0]}, [r2,:32], r1   @ right half of top row
        ldcol.8         d0,  r3,  r1,  4,  hi=1 @ upper half of left column
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1,  4        @ lower half of left column
        vaddl.u8        q8,  d2,  d3
        vrev32.8        d0,  d0
        vtrn.32         d2,  d3
        vsubl.u8        q2,  d2,  d0            @ weighted differences
        movrel          r3,  p16weight
        vld1.16         {q0},  [r3,:128]        @ weights (only 1..4 used)
        vmul.s16        d4,  d4,  d0
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4
        vshl.i32        d5,  d4,  #4
        vadd.s32        d4,  d4,  d5            @ 17 * gradient sum
        vrshrn.s32      d4,  q2,  #5
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #2
        vrev64.16       d16, d16
        vsub.i16        d3,  d3,  d2            @ 3*(b+c)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4            @ base term
        vsub.i16        d2,  d2,  d3
        vshl.i16        d3,  d4,  #3
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3
        vmul.i16        q0,  q0,  d4[0]         @ horizontal ramp
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #8
1:
        vqshrun.s16     d0,  q1,  #5            @ clip1((x + 16) >> 5)
        vadd.i16        q1,  q1,  q3            @ next row
        vst1.8          {d0},  [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
@ void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride)
@ No neighbours: fill the 8x8 block with 128.  q0 = {top-half rows d0,
@ bottom-half rows d1} for the shared store loop.
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred8x8_dc_end
endfunc
|
||||
|
||||
@ void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC from the row above only: independent averages of the left 4 and
@ right 4 top pixels, replicated over the corresponding 4x8 halves.
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},  [r2,:64]
        vpaddl.u8       d0,  d0                 @ pairwise sums of the 8 pixels
        vpadd.u16       d0,  d0,  d0            @ -> {left-4 sum, right-4 sum}
        vrshrn.u16      d0,  q0,  #2            @ rounded averages
        vdup.8          d1,  d0[1]              @ right-half DC
        vdup.8          d0,  d0[0]              @ left-half DC
        vtrn.32         d0,  d1                 @ interleave into row patterns
        b               .L_pred8x8_dc_end
endfunc
|
||||
|
||||
@ void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC from the left column only: separate averages of the top 4 and
@ bottom 4 left pixels; d0 fills the top 4 rows, d1 the bottom 4.
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0            @ {top-4 sum, bottom-4 sum}
        vrshrn.u16      d0,  q0,  #2            @ rounded averages
        vdup.8          d1,  d0[1]              @ bottom-half DC
        vdup.8          d0,  d0[0]              @ top-half DC
        b               .L_pred8x8_dc_end
endfunc
|
||||
|
||||
@ void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Full 8x8 DC prediction: the block is split into four 4x4 quadrants;
@ the top-left and bottom-right use (top+left)/8, the other two use only
@ their adjacent border /4, per the H.264 chroma DC rule.
@ The store loop at .L_pred8x8_dc_end is shared by all pred8x8 DC variants:
@ d0 supplies rows 0-3 and d1 rows 4-7.
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},  [r2,:64]         @ top row
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1            @ left column
        vtrn.32         d0,  d1                 @ group 4-pixel border segments
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3            @ /8 averages (corner quadrants)
        vrshrn.u16      d3,  q0,  #2            @ /4 averages (edge quadrants)
        vdup.8          d0,  d2[4]              @ top-left quadrant DC
        vdup.8          d1,  d3[3]              @ bottom-right quadrant DC
        vdup.8          d4,  d3[2]              @ top-right quadrant DC
        vdup.8          d5,  d2[5]              @ bottom-left quadrant DC
        vtrn.32         q0,  q2                 @ assemble per-row fill patterns
.L_pred8x8_dc_end:                              @ shared tail for all DC modes
        mov             r3,  #4
        add             r2,  r0,  r1,  lsl #2   @ r2 = start of rows 4-7
6:      vst1.8          {d0},  [r0,:64], r1
        vst1.8          {d1},  [r2,:64], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc
|
||||
|
||||
@ void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC prediction with the top row and only the upper-left 4 left pixels
@ available (the "L0T" MBAFF case): the bottom half falls back to
@ top-row-only averages.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},  [r2,:64]         @ top row
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1,  4        @ upper 4 left pixels only
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3            @ /8 averages
        vrshrn.u16      d3,  q0,  #2            @ /4 averages
        vdup.8          d0,  d2[4]              @ top-left: (top4 + left4) / 8
        vdup.8          d1,  d3[0]              @ bottom-left: top-left top4 / 4
        vdup.8          q2,  d3[2]              @ both right quadrants: top4 / 4
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc
|
||||
|
||||
@ void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Only the upper 4 left pixels available: top half uses their average,
@ bottom half is filled with 128.
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  4
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2            @ rounded average of 4
        vmov.i8         d1,  #128               @ bottom rows: no neighbours
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
endfunc
|
||||
|
||||
@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC prediction with the top row and only the lower-left 4 left pixels
@ available (the "0LT" MBAFF case).
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},  [r2,:64]         @ top row
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4,  hi=1 @ lower 4 left pixels
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d3,  q0,  #2            @ /4 averages
        vrshrn.u16      d2,  q0,  #3            @ /8 averages
        vdup.8          d0,  d3[0]              @ top-left: top4 / 4
        vdup.8          d1,  d3[3]              @ bottom-left: (top4 + left4) / 8 slot
        vdup.8          d4,  d3[2]              @ top-right: top4 / 4
        vdup.8          d5,  d2[5]              @ bottom-right
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc
|
||||
|
||||
@ void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Only the lower 4 left pixels available: bottom half uses their average,
@ top half is filled with 128.
function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4
        vpaddl.u8       d2,  d1
        vpadd.u16       d2,  d2,  d2
        vrshrn.u16      d1,  q1,  #2            @ rounded average of 4
        vmov.i8         d0,  #128               @ top rows: no neighbours
        vdup.8          d1,  d1[0]
        b               .L_pred8x8_dc_end
endfunc
|
||||
171
externals/ffmpeg/libavcodec/arm/h264qpel_init_arm.c
vendored
Executable file
171
externals/ffmpeg/libavcodec/arm/h264qpel_init_arm.c
vendored
Executable file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* ARM NEON optimised DSP functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/h264qpel.h"
|
||||
|
||||
void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
/**
 * Install the NEON quarter-pel motion-compensation functions.
 *
 * Table index [i] encodes the fractional position: i = x + 4*y, matching
 * the mcXY suffix of each assembly routine (x, y in 0..3).  Row [0] is
 * 16x16, row [1] is 8x8.  Only the 8-bit path has NEON code, so nothing
 * is installed when bit_depth > 8.
 */
av_cold void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth)
{
    const int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && !high_bit_depth) {
        /* put, 16x16 */
        c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
        c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
        c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
        c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
        c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
        c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
        c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
        c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
        c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
        c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;

        /* put, 8x8 */
        c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
        c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
        c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
        c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
        c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
        c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
        c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
        c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
        c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
        c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;

        /* avg, 16x16 */
        c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
        c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
        c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
        c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
        c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
        c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
        c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
        c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
        c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
        c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;

        /* avg, 8x8 */
        c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
        c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
        c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
        c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
        c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
        c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
        c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
        c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
        c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
        c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
    }
}
|
||||
955
externals/ffmpeg/libavcodec/arm/h264qpel_neon.S
vendored
Executable file
955
externals/ffmpeg/libavcodec/arm/h264qpel_neon.S
vendored
Executable file
@@ -0,0 +1,955 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
/* H.264 qpel MC */
|
||||
|
||||
.macro lowpass_const r
|
||||
movw \r, #5
|
||||
movt \r, #20
|
||||
vmov.32 d6[0], \r
|
||||
.endm
|
||||
|
||||
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
|
||||
.if \narrow
|
||||
t0 .req q0
|
||||
t1 .req q8
|
||||
.else
|
||||
t0 .req \d0
|
||||
t1 .req \d1
|
||||
.endif
|
||||
vext.8 d2, \r0, \r1, #2
|
||||
vext.8 d3, \r0, \r1, #3
|
||||
vaddl.u8 q1, d2, d3
|
||||
vext.8 d4, \r0, \r1, #1
|
||||
vext.8 d5, \r0, \r1, #4
|
||||
vaddl.u8 q2, d4, d5
|
||||
vext.8 d30, \r0, \r1, #5
|
||||
vaddl.u8 t0, \r0, d30
|
||||
vext.8 d18, \r2, \r3, #2
|
||||
vmla.i16 t0, q1, d6[1]
|
||||
vext.8 d19, \r2, \r3, #3
|
||||
vaddl.u8 q9, d18, d19
|
||||
vext.8 d20, \r2, \r3, #1
|
||||
vmls.i16 t0, q2, d6[0]
|
||||
vext.8 d21, \r2, \r3, #4
|
||||
vaddl.u8 q10, d20, d21
|
||||
vext.8 d31, \r2, \r3, #5
|
||||
vaddl.u8 t1, \r2, d31
|
||||
vmla.i16 t1, q9, d6[1]
|
||||
vmls.i16 t1, q10, d6[0]
|
||||
.if \narrow
|
||||
vqrshrun.s16 \d0, t0, #5
|
||||
vqrshrun.s16 \d1, t1, #5
|
||||
.endif
|
||||
.unreq t0
|
||||
.unreq t1
|
||||
.endm
|
||||
|
||||
.macro lowpass_8_1 r0, r1, d0, narrow=1
|
||||
.if \narrow
|
||||
t0 .req q0
|
||||
.else
|
||||
t0 .req \d0
|
||||
.endif
|
||||
vext.8 d2, \r0, \r1, #2
|
||||
vext.8 d3, \r0, \r1, #3
|
||||
vaddl.u8 q1, d2, d3
|
||||
vext.8 d4, \r0, \r1, #1
|
||||
vext.8 d5, \r0, \r1, #4
|
||||
vaddl.u8 q2, d4, d5
|
||||
vext.8 d30, \r0, \r1, #5
|
||||
vaddl.u8 t0, \r0, d30
|
||||
vmla.i16 t0, q1, d6[1]
|
||||
vmls.i16 t0, q2, d6[0]
|
||||
.if \narrow
|
||||
vqrshrun.s16 \d0, t0, #5
|
||||
.endif
|
||||
.unreq t0
|
||||
.endm
|
||||
|
||||
.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
|
||||
vext.16 q1, \r0, \r1, #2
|
||||
vext.16 q0, \r0, \r1, #3
|
||||
vaddl.s16 q9, d2, d0
|
||||
vext.16 q2, \r0, \r1, #1
|
||||
vaddl.s16 q1, d3, d1
|
||||
vext.16 q3, \r0, \r1, #4
|
||||
vaddl.s16 q10, d4, d6
|
||||
vext.16 \r1, \r0, \r1, #5
|
||||
vaddl.s16 q2, d5, d7
|
||||
vaddl.s16 q0, \h0, \h1
|
||||
vaddl.s16 q8, \l0, \l1
|
||||
|
||||
vshl.i32 q3, q9, #4
|
||||
vshl.i32 q9, q9, #2
|
||||
vshl.i32 q15, q10, #2
|
||||
vadd.i32 q9, q9, q3
|
||||
vadd.i32 q10, q10, q15
|
||||
|
||||
vshl.i32 q3, q1, #4
|
||||
vshl.i32 q1, q1, #2
|
||||
vshl.i32 q15, q2, #2
|
||||
vadd.i32 q1, q1, q3
|
||||
vadd.i32 q2, q2, q15
|
||||
|
||||
vadd.i32 q9, q9, q8
|
||||
vsub.i32 q9, q9, q10
|
||||
|
||||
vadd.i32 q1, q1, q0
|
||||
vsub.i32 q1, q1, q2
|
||||
|
||||
vrshrn.s32 d18, q9, #10
|
||||
vrshrn.s32 d19, q1, #10
|
||||
|
||||
vqmovun.s16 \d, q9
|
||||
.endm
|
||||
|
||||
function put_h264_qpel16_h_lowpass_neon_packed
|
||||
mov r4, lr
|
||||
mov r12, #16
|
||||
mov r3, #8
|
||||
bl put_h264_qpel8_h_lowpass_neon
|
||||
sub r1, r1, r2, lsl #4
|
||||
add r1, r1, #8
|
||||
mov r12, #16
|
||||
mov lr, r4
|
||||
b put_h264_qpel8_h_lowpass_neon
|
||||
endfunc
|
||||
|
||||
.macro h264_qpel_h_lowpass type
|
||||
function \type\()_h264_qpel16_h_lowpass_neon
|
||||
push {lr}
|
||||
mov r12, #16
|
||||
bl \type\()_h264_qpel8_h_lowpass_neon
|
||||
sub r0, r0, r3, lsl #4
|
||||
sub r1, r1, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
add r1, r1, #8
|
||||
mov r12, #16
|
||||
pop {lr}
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_h_lowpass_neon
|
||||
1: vld1.8 {d0, d1}, [r1], r2
|
||||
vld1.8 {d16,d17}, [r1], r2
|
||||
subs r12, r12, #2
|
||||
lowpass_8 d0, d1, d16, d17, d0, d16
|
||||
.ifc \type,avg
|
||||
vld1.8 {d2}, [r0,:64], r3
|
||||
vrhadd.u8 d0, d0, d2
|
||||
vld1.8 {d3}, [r0,:64]
|
||||
vrhadd.u8 d16, d16, d3
|
||||
sub r0, r0, r3
|
||||
.endif
|
||||
vst1.8 {d0}, [r0,:64], r3
|
||||
vst1.8 {d16}, [r0,:64], r3
|
||||
bne 1b
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel_h_lowpass put
|
||||
h264_qpel_h_lowpass avg
|
||||
|
||||
.macro h264_qpel_h_lowpass_l2 type
|
||||
function \type\()_h264_qpel16_h_lowpass_l2_neon
|
||||
push {lr}
|
||||
mov r12, #16
|
||||
bl \type\()_h264_qpel8_h_lowpass_l2_neon
|
||||
sub r0, r0, r2, lsl #4
|
||||
sub r1, r1, r2, lsl #4
|
||||
sub r3, r3, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
add r1, r1, #8
|
||||
add r3, r3, #8
|
||||
mov r12, #16
|
||||
pop {lr}
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_h_lowpass_l2_neon
|
||||
1: vld1.8 {d0, d1}, [r1], r2
|
||||
vld1.8 {d16,d17}, [r1], r2
|
||||
vld1.8 {d28}, [r3], r2
|
||||
vld1.8 {d29}, [r3], r2
|
||||
subs r12, r12, #2
|
||||
lowpass_8 d0, d1, d16, d17, d0, d1
|
||||
vrhadd.u8 q0, q0, q14
|
||||
.ifc \type,avg
|
||||
vld1.8 {d2}, [r0,:64], r2
|
||||
vrhadd.u8 d0, d0, d2
|
||||
vld1.8 {d3}, [r0,:64]
|
||||
vrhadd.u8 d1, d1, d3
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vst1.8 {d0}, [r0,:64], r2
|
||||
vst1.8 {d1}, [r0,:64], r2
|
||||
bne 1b
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel_h_lowpass_l2 put
|
||||
h264_qpel_h_lowpass_l2 avg
|
||||
|
||||
function put_h264_qpel16_v_lowpass_neon_packed
|
||||
mov r4, lr
|
||||
mov r2, #8
|
||||
bl put_h264_qpel8_v_lowpass_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
bl put_h264_qpel8_v_lowpass_neon
|
||||
sub r1, r1, r3, lsl #4
|
||||
sub r1, r1, r3, lsl #2
|
||||
add r1, r1, #8
|
||||
bl put_h264_qpel8_v_lowpass_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
mov lr, r4
|
||||
b put_h264_qpel8_v_lowpass_neon
|
||||
endfunc
|
||||
|
||||
.macro h264_qpel_v_lowpass type
|
||||
function \type\()_h264_qpel16_v_lowpass_neon
|
||||
mov r4, lr
|
||||
bl \type\()_h264_qpel8_v_lowpass_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
bl \type\()_h264_qpel8_v_lowpass_neon
|
||||
sub r0, r0, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
sub r1, r1, r3, lsl #4
|
||||
sub r1, r1, r3, lsl #2
|
||||
add r1, r1, #8
|
||||
bl \type\()_h264_qpel8_v_lowpass_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
mov lr, r4
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_v_lowpass_neon
|
||||
vld1.8 {d8}, [r1], r3
|
||||
vld1.8 {d10}, [r1], r3
|
||||
vld1.8 {d12}, [r1], r3
|
||||
vld1.8 {d14}, [r1], r3
|
||||
vld1.8 {d22}, [r1], r3
|
||||
vld1.8 {d24}, [r1], r3
|
||||
vld1.8 {d26}, [r1], r3
|
||||
vld1.8 {d28}, [r1], r3
|
||||
vld1.8 {d9}, [r1], r3
|
||||
vld1.8 {d11}, [r1], r3
|
||||
vld1.8 {d13}, [r1], r3
|
||||
vld1.8 {d15}, [r1], r3
|
||||
vld1.8 {d23}, [r1]
|
||||
|
||||
transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
|
||||
lowpass_8 d8, d9, d10, d11, d8, d10
|
||||
lowpass_8 d12, d13, d14, d15, d12, d14
|
||||
lowpass_8 d22, d23, d24, d25, d22, d24
|
||||
lowpass_8 d26, d27, d28, d29, d26, d28
|
||||
transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
|
||||
|
||||
.ifc \type,avg
|
||||
vld1.8 {d9}, [r0,:64], r2
|
||||
vrhadd.u8 d8, d8, d9
|
||||
vld1.8 {d11}, [r0,:64], r2
|
||||
vrhadd.u8 d10, d10, d11
|
||||
vld1.8 {d13}, [r0,:64], r2
|
||||
vrhadd.u8 d12, d12, d13
|
||||
vld1.8 {d15}, [r0,:64], r2
|
||||
vrhadd.u8 d14, d14, d15
|
||||
vld1.8 {d23}, [r0,:64], r2
|
||||
vrhadd.u8 d22, d22, d23
|
||||
vld1.8 {d25}, [r0,:64], r2
|
||||
vrhadd.u8 d24, d24, d25
|
||||
vld1.8 {d27}, [r0,:64], r2
|
||||
vrhadd.u8 d26, d26, d27
|
||||
vld1.8 {d29}, [r0,:64], r2
|
||||
vrhadd.u8 d28, d28, d29
|
||||
sub r0, r0, r2, lsl #3
|
||||
.endif
|
||||
|
||||
vst1.8 {d8}, [r0,:64], r2
|
||||
vst1.8 {d10}, [r0,:64], r2
|
||||
vst1.8 {d12}, [r0,:64], r2
|
||||
vst1.8 {d14}, [r0,:64], r2
|
||||
vst1.8 {d22}, [r0,:64], r2
|
||||
vst1.8 {d24}, [r0,:64], r2
|
||||
vst1.8 {d26}, [r0,:64], r2
|
||||
vst1.8 {d28}, [r0,:64], r2
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel_v_lowpass put
|
||||
h264_qpel_v_lowpass avg
|
||||
|
||||
.macro h264_qpel_v_lowpass_l2 type
|
||||
function \type\()_h264_qpel16_v_lowpass_l2_neon
|
||||
mov r4, lr
|
||||
bl \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
bl \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
sub r0, r0, r3, lsl #4
|
||||
sub r12, r12, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
add r12, r12, #8
|
||||
sub r1, r1, r3, lsl #4
|
||||
sub r1, r1, r3, lsl #2
|
||||
add r1, r1, #8
|
||||
bl \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
mov lr, r4
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
vld1.8 {d8}, [r1], r3
|
||||
vld1.8 {d10}, [r1], r3
|
||||
vld1.8 {d12}, [r1], r3
|
||||
vld1.8 {d14}, [r1], r3
|
||||
vld1.8 {d22}, [r1], r3
|
||||
vld1.8 {d24}, [r1], r3
|
||||
vld1.8 {d26}, [r1], r3
|
||||
vld1.8 {d28}, [r1], r3
|
||||
vld1.8 {d9}, [r1], r3
|
||||
vld1.8 {d11}, [r1], r3
|
||||
vld1.8 {d13}, [r1], r3
|
||||
vld1.8 {d15}, [r1], r3
|
||||
vld1.8 {d23}, [r1]
|
||||
|
||||
transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
|
||||
lowpass_8 d8, d9, d10, d11, d8, d9
|
||||
lowpass_8 d12, d13, d14, d15, d12, d13
|
||||
lowpass_8 d22, d23, d24, d25, d22, d23
|
||||
lowpass_8 d26, d27, d28, d29, d26, d27
|
||||
transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
|
||||
|
||||
vld1.8 {d0}, [r12], r2
|
||||
vld1.8 {d1}, [r12], r2
|
||||
vld1.8 {d2}, [r12], r2
|
||||
vld1.8 {d3}, [r12], r2
|
||||
vld1.8 {d4}, [r12], r2
|
||||
vrhadd.u8 q0, q0, q4
|
||||
vld1.8 {d5}, [r12], r2
|
||||
vrhadd.u8 q1, q1, q6
|
||||
vld1.8 {d10}, [r12], r2
|
||||
vrhadd.u8 q2, q2, q11
|
||||
vld1.8 {d11}, [r12], r2
|
||||
vrhadd.u8 q5, q5, q13
|
||||
|
||||
.ifc \type,avg
|
||||
vld1.8 {d16}, [r0,:64], r3
|
||||
vrhadd.u8 d0, d0, d16
|
||||
vld1.8 {d17}, [r0,:64], r3
|
||||
vrhadd.u8 d1, d1, d17
|
||||
vld1.8 {d16}, [r0,:64], r3
|
||||
vrhadd.u8 d2, d2, d16
|
||||
vld1.8 {d17}, [r0,:64], r3
|
||||
vrhadd.u8 d3, d3, d17
|
||||
vld1.8 {d16}, [r0,:64], r3
|
||||
vrhadd.u8 d4, d4, d16
|
||||
vld1.8 {d17}, [r0,:64], r3
|
||||
vrhadd.u8 d5, d5, d17
|
||||
vld1.8 {d16}, [r0,:64], r3
|
||||
vrhadd.u8 d10, d10, d16
|
||||
vld1.8 {d17}, [r0,:64], r3
|
||||
vrhadd.u8 d11, d11, d17
|
||||
sub r0, r0, r3, lsl #3
|
||||
.endif
|
||||
|
||||
vst1.8 {d0}, [r0,:64], r3
|
||||
vst1.8 {d1}, [r0,:64], r3
|
||||
vst1.8 {d2}, [r0,:64], r3
|
||||
vst1.8 {d3}, [r0,:64], r3
|
||||
vst1.8 {d4}, [r0,:64], r3
|
||||
vst1.8 {d5}, [r0,:64], r3
|
||||
vst1.8 {d10}, [r0,:64], r3
|
||||
vst1.8 {d11}, [r0,:64], r3
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel_v_lowpass_l2 put
|
||||
h264_qpel_v_lowpass_l2 avg
|
||||
|
||||
function put_h264_qpel8_hv_lowpass_neon_top
|
||||
lowpass_const r12
|
||||
mov r12, #12
|
||||
1: vld1.8 {d0, d1}, [r1], r3
|
||||
vld1.8 {d16,d17}, [r1], r3
|
||||
subs r12, r12, #2
|
||||
lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
|
||||
vst1.8 {d22-d25}, [r4,:128]!
|
||||
bne 1b
|
||||
|
||||
vld1.8 {d0, d1}, [r1]
|
||||
lowpass_8_1 d0, d1, q12, narrow=0
|
||||
|
||||
mov r12, #-16
|
||||
add r4, r4, r12
|
||||
vld1.8 {d30,d31}, [r4,:128], r12
|
||||
vld1.8 {d20,d21}, [r4,:128], r12
|
||||
vld1.8 {d18,d19}, [r4,:128], r12
|
||||
vld1.8 {d16,d17}, [r4,:128], r12
|
||||
vld1.8 {d14,d15}, [r4,:128], r12
|
||||
vld1.8 {d12,d13}, [r4,:128], r12
|
||||
vld1.8 {d10,d11}, [r4,:128], r12
|
||||
vld1.8 {d8, d9}, [r4,:128], r12
|
||||
vld1.8 {d6, d7}, [r4,:128], r12
|
||||
vld1.8 {d4, d5}, [r4,:128], r12
|
||||
vld1.8 {d2, d3}, [r4,:128], r12
|
||||
vld1.8 {d0, d1}, [r4,:128]
|
||||
|
||||
swap4 d1, d3, d5, d7, d8, d10, d12, d14
|
||||
transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
|
||||
|
||||
swap4 d17, d19, d21, d31, d24, d26, d28, d22
|
||||
transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
|
||||
|
||||
vst1.8 {d30,d31}, [r4,:128]!
|
||||
vst1.8 {d6, d7}, [r4,:128]!
|
||||
vst1.8 {d20,d21}, [r4,:128]!
|
||||
vst1.8 {d4, d5}, [r4,:128]!
|
||||
vst1.8 {d18,d19}, [r4,:128]!
|
||||
vst1.8 {d2, d3}, [r4,:128]!
|
||||
vst1.8 {d16,d17}, [r4,:128]!
|
||||
vst1.8 {d0, d1}, [r4,:128]
|
||||
|
||||
lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
|
||||
lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
|
||||
lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
|
||||
lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
|
||||
|
||||
vld1.8 {d16,d17}, [r4,:128], r12
|
||||
vld1.8 {d30,d31}, [r4,:128], r12
|
||||
lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
|
||||
vld1.8 {d16,d17}, [r4,:128], r12
|
||||
vld1.8 {d30,d31}, [r4,:128], r12
|
||||
lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
|
||||
vld1.8 {d16,d17}, [r4,:128], r12
|
||||
vld1.8 {d30,d31}, [r4,:128], r12
|
||||
lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
|
||||
vld1.8 {d16,d17}, [r4,:128], r12
|
||||
vld1.8 {d30,d31}, [r4,:128]
|
||||
lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
|
||||
|
||||
transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
.macro h264_qpel8_hv_lowpass type
|
||||
function \type\()_h264_qpel8_hv_lowpass_neon
|
||||
mov r10, lr
|
||||
bl put_h264_qpel8_hv_lowpass_neon_top
|
||||
.ifc \type,avg
|
||||
vld1.8 {d0}, [r0,:64], r2
|
||||
vrhadd.u8 d12, d12, d0
|
||||
vld1.8 {d1}, [r0,:64], r2
|
||||
vrhadd.u8 d13, d13, d1
|
||||
vld1.8 {d2}, [r0,:64], r2
|
||||
vrhadd.u8 d14, d14, d2
|
||||
vld1.8 {d3}, [r0,:64], r2
|
||||
vrhadd.u8 d15, d15, d3
|
||||
vld1.8 {d4}, [r0,:64], r2
|
||||
vrhadd.u8 d8, d8, d4
|
||||
vld1.8 {d5}, [r0,:64], r2
|
||||
vrhadd.u8 d9, d9, d5
|
||||
vld1.8 {d6}, [r0,:64], r2
|
||||
vrhadd.u8 d10, d10, d6
|
||||
vld1.8 {d7}, [r0,:64], r2
|
||||
vrhadd.u8 d11, d11, d7
|
||||
sub r0, r0, r2, lsl #3
|
||||
.endif
|
||||
|
||||
vst1.8 {d12}, [r0,:64], r2
|
||||
vst1.8 {d13}, [r0,:64], r2
|
||||
vst1.8 {d14}, [r0,:64], r2
|
||||
vst1.8 {d15}, [r0,:64], r2
|
||||
vst1.8 {d8}, [r0,:64], r2
|
||||
vst1.8 {d9}, [r0,:64], r2
|
||||
vst1.8 {d10}, [r0,:64], r2
|
||||
vst1.8 {d11}, [r0,:64], r2
|
||||
|
||||
mov lr, r10
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel8_hv_lowpass put
|
||||
h264_qpel8_hv_lowpass avg
|
||||
|
||||
.macro h264_qpel8_hv_lowpass_l2 type
|
||||
function \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
mov r10, lr
|
||||
bl put_h264_qpel8_hv_lowpass_neon_top
|
||||
|
||||
vld1.8 {d0, d1}, [r2,:128]!
|
||||
vld1.8 {d2, d3}, [r2,:128]!
|
||||
vrhadd.u8 q0, q0, q6
|
||||
vld1.8 {d4, d5}, [r2,:128]!
|
||||
vrhadd.u8 q1, q1, q7
|
||||
vld1.8 {d6, d7}, [r2,:128]!
|
||||
vrhadd.u8 q2, q2, q4
|
||||
vrhadd.u8 q3, q3, q5
|
||||
.ifc \type,avg
|
||||
vld1.8 {d16}, [r0,:64], r3
|
||||
vrhadd.u8 d0, d0, d16
|
||||
vld1.8 {d17}, [r0,:64], r3
|
||||
vrhadd.u8 d1, d1, d17
|
||||
vld1.8 {d18}, [r0,:64], r3
|
||||
vrhadd.u8 d2, d2, d18
|
||||
vld1.8 {d19}, [r0,:64], r3
|
||||
vrhadd.u8 d3, d3, d19
|
||||
vld1.8 {d20}, [r0,:64], r3
|
||||
vrhadd.u8 d4, d4, d20
|
||||
vld1.8 {d21}, [r0,:64], r3
|
||||
vrhadd.u8 d5, d5, d21
|
||||
vld1.8 {d22}, [r0,:64], r3
|
||||
vrhadd.u8 d6, d6, d22
|
||||
vld1.8 {d23}, [r0,:64], r3
|
||||
vrhadd.u8 d7, d7, d23
|
||||
sub r0, r0, r3, lsl #3
|
||||
.endif
|
||||
vst1.8 {d0}, [r0,:64], r3
|
||||
vst1.8 {d1}, [r0,:64], r3
|
||||
vst1.8 {d2}, [r0,:64], r3
|
||||
vst1.8 {d3}, [r0,:64], r3
|
||||
vst1.8 {d4}, [r0,:64], r3
|
||||
vst1.8 {d5}, [r0,:64], r3
|
||||
vst1.8 {d6}, [r0,:64], r3
|
||||
vst1.8 {d7}, [r0,:64], r3
|
||||
|
||||
mov lr, r10
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel8_hv_lowpass_l2 put
|
||||
h264_qpel8_hv_lowpass_l2 avg
|
||||
|
||||
.macro h264_qpel16_hv type
|
||||
function \type\()_h264_qpel16_hv_lowpass_neon
|
||||
mov r9, lr
|
||||
bl \type\()_h264_qpel8_hv_lowpass_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
bl \type\()_h264_qpel8_hv_lowpass_neon
|
||||
sub r1, r1, r3, lsl #4
|
||||
sub r1, r1, r3, lsl #2
|
||||
add r1, r1, #8
|
||||
sub r0, r0, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
bl \type\()_h264_qpel8_hv_lowpass_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
mov lr, r9
|
||||
b \type\()_h264_qpel8_hv_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function \type\()_h264_qpel16_hv_lowpass_l2_neon
|
||||
mov r9, lr
|
||||
sub r2, r4, #256
|
||||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
sub r1, r1, r3, lsl #4
|
||||
sub r1, r1, r3, lsl #2
|
||||
add r1, r1, #8
|
||||
sub r0, r0, r3, lsl #4
|
||||
add r0, r0, #8
|
||||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
sub r1, r1, r3, lsl #2
|
||||
mov lr, r9
|
||||
b \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel16_hv put
|
||||
h264_qpel16_hv avg
|
||||
|
||||
.macro h264_qpel8 type
|
||||
function ff_\type\()_h264_qpel8_mc10_neon, export=1
|
||||
lowpass_const r3
|
||||
mov r3, r1
|
||||
sub r1, r1, #2
|
||||
mov r12, #8
|
||||
b \type\()_h264_qpel8_h_lowpass_l2_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc20_neon, export=1
|
||||
lowpass_const r3
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
mov r12, #8
|
||||
b \type\()_h264_qpel8_h_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc30_neon, export=1
|
||||
lowpass_const r3
|
||||
add r3, r1, #1
|
||||
sub r1, r1, #2
|
||||
mov r12, #8
|
||||
b \type\()_h264_qpel8_h_lowpass_l2_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc01_neon, export=1
|
||||
push {lr}
|
||||
mov r12, r1
|
||||
\type\()_h264_qpel8_mc01:
|
||||
lowpass_const r3
|
||||
mov r3, r2
|
||||
sub r1, r1, r2, lsl #1
|
||||
vpush {d8-d15}
|
||||
bl \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc11_neon, export=1
|
||||
push {r0, r1, r11, lr}
|
||||
\type\()_h264_qpel8_mc11:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #64
|
||||
mov r0, sp
|
||||
sub r1, r1, #2
|
||||
mov r3, #8
|
||||
mov r12, #8
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel8_h_lowpass_neon
|
||||
ldrd r0, r1, [r11], #8
|
||||
mov r3, r2
|
||||
add r12, sp, #64
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r2, #8
|
||||
bl \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc21_neon, export=1
|
||||
push {r0, r1, r4, r10, r11, lr}
|
||||
\type\()_h264_qpel8_mc21:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(8*8+16*12)
|
||||
sub r1, r1, #2
|
||||
mov r3, #8
|
||||
mov r0, sp
|
||||
mov r12, #8
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel8_h_lowpass_neon
|
||||
mov r4, r0
|
||||
ldrd r0, r1, [r11], #8
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
sub r2, r4, #64
|
||||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r4, r10, r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc31_neon, export=1
|
||||
add r1, r1, #1
|
||||
push {r0, r1, r11, lr}
|
||||
sub r1, r1, #1
|
||||
b \type\()_h264_qpel8_mc11
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc02_neon, export=1
|
||||
push {lr}
|
||||
lowpass_const r3
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r3, r2
|
||||
vpush {d8-d15}
|
||||
bl \type\()_h264_qpel8_v_lowpass_neon
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc12_neon, export=1
|
||||
push {r0, r1, r4, r10, r11, lr}
|
||||
\type\()_h264_qpel8_mc12:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(8*8+16*12)
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r3, r2
|
||||
mov r2, #8
|
||||
mov r0, sp
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel8_v_lowpass_neon
|
||||
mov r4, r0
|
||||
ldrd r0, r1, [r11], #8
|
||||
sub r1, r1, r3, lsl #1
|
||||
sub r1, r1, #2
|
||||
sub r2, r4, #64
|
||||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r4, r10, r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc22_neon, export=1
|
||||
push {r4, r10, r11, lr}
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r4, r11, #15
|
||||
T mov sp, r4
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
sub sp, sp, #(16*12)
|
||||
mov r4, sp
|
||||
vpush {d8-d15}
|
||||
bl \type\()_h264_qpel8_hv_lowpass_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r4, r10, r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc32_neon, export=1
|
||||
push {r0, r1, r4, r10, r11, lr}
|
||||
add r1, r1, #1
|
||||
b \type\()_h264_qpel8_mc12
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc03_neon, export=1
|
||||
push {lr}
|
||||
add r12, r1, r2
|
||||
b \type\()_h264_qpel8_mc01
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc13_neon, export=1
|
||||
push {r0, r1, r11, lr}
|
||||
add r1, r1, r2
|
||||
b \type\()_h264_qpel8_mc11
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc23_neon, export=1
|
||||
push {r0, r1, r4, r10, r11, lr}
|
||||
add r1, r1, r2
|
||||
b \type\()_h264_qpel8_mc21
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc33_neon, export=1
|
||||
add r1, r1, #1
|
||||
push {r0, r1, r11, lr}
|
||||
add r1, r1, r2
|
||||
sub r1, r1, #1
|
||||
b \type\()_h264_qpel8_mc11
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel8 put
|
||||
h264_qpel8 avg
|
||||
|
||||
.macro h264_qpel16 type
|
||||
function ff_\type\()_h264_qpel16_mc10_neon, export=1
|
||||
lowpass_const r3
|
||||
mov r3, r1
|
||||
sub r1, r1, #2
|
||||
b \type\()_h264_qpel16_h_lowpass_l2_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc20_neon, export=1
|
||||
lowpass_const r3
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
b \type\()_h264_qpel16_h_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc30_neon, export=1
|
||||
lowpass_const r3
|
||||
add r3, r1, #1
|
||||
sub r1, r1, #2
|
||||
b \type\()_h264_qpel16_h_lowpass_l2_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc01_neon, export=1
|
||||
push {r4, lr}
|
||||
mov r12, r1
|
||||
\type\()_h264_qpel16_mc01:
|
||||
lowpass_const r3
|
||||
mov r3, r2
|
||||
sub r1, r1, r2, lsl #1
|
||||
vpush {d8-d15}
|
||||
bl \type\()_h264_qpel16_v_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc11_neon, export=1
|
||||
push {r0, r1, r4, r11, lr}
|
||||
\type\()_h264_qpel16_mc11:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #256
|
||||
mov r0, sp
|
||||
sub r1, r1, #2
|
||||
mov r3, #16
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel16_h_lowpass_neon
|
||||
ldrd r0, r1, [r11], #8
|
||||
mov r3, r2
|
||||
add r12, sp, #64
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r2, #16
|
||||
bl \type\()_h264_qpel16_v_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r4, r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc21_neon, export=1
|
||||
push {r0, r1, r4-r5, r9-r11, lr}
|
||||
\type\()_h264_qpel16_mc21:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(16*16+16*12)
|
||||
sub r1, r1, #2
|
||||
mov r0, sp
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel16_h_lowpass_neon_packed
|
||||
mov r4, r0
|
||||
ldrd r0, r1, [r11], #8
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r4-r5, r9-r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc31_neon, export=1
|
||||
add r1, r1, #1
|
||||
push {r0, r1, r4, r11, lr}
|
||||
sub r1, r1, #1
|
||||
b \type\()_h264_qpel16_mc11
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc02_neon, export=1
|
||||
push {r4, lr}
|
||||
lowpass_const r3
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r3, r2
|
||||
vpush {d8-d15}
|
||||
bl \type\()_h264_qpel16_v_lowpass_neon
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc12_neon, export=1
|
||||
push {r0, r1, r4-r5, r9-r11, lr}
|
||||
\type\()_h264_qpel16_mc12:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(16*16+16*12)
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r0, sp
|
||||
mov r3, r2
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel16_v_lowpass_neon_packed
|
||||
mov r4, r0
|
||||
ldrd r0, r1, [r11], #8
|
||||
sub r1, r1, r3, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r2, r3
|
||||
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r4-r5, r9-r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc22_neon, export=1
|
||||
push {r4, r9-r11, lr}
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
A bic sp, sp, #15
|
||||
T bic r4, r11, #15
|
||||
T mov sp, r4
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
sub sp, sp, #(16*12)
|
||||
mov r4, sp
|
||||
vpush {d8-d15}
|
||||
bl \type\()_h264_qpel16_hv_lowpass_neon
|
||||
vpop {d8-d15}
|
||||
mov sp, r11
|
||||
pop {r4, r9-r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc32_neon, export=1
|
||||
push {r0, r1, r4-r5, r9-r11, lr}
|
||||
add r1, r1, #1
|
||||
b \type\()_h264_qpel16_mc12
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc03_neon, export=1
|
||||
push {r4, lr}
|
||||
add r12, r1, r2
|
||||
b \type\()_h264_qpel16_mc01
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc13_neon, export=1
|
||||
push {r0, r1, r4, r11, lr}
|
||||
add r1, r1, r2
|
||||
b \type\()_h264_qpel16_mc11
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc23_neon, export=1
|
||||
push {r0, r1, r4-r5, r9-r11, lr}
|
||||
add r1, r1, r2
|
||||
b \type\()_h264_qpel16_mc21
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel16_mc33_neon, export=1
|
||||
add r1, r1, #1
|
||||
push {r0, r1, r4, r11, lr}
|
||||
add r1, r1, r2
|
||||
sub r1, r1, #1
|
||||
b \type\()_h264_qpel16_mc11
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
h264_qpel16 put
|
||||
h264_qpel16 avg
|
||||
26
externals/ffmpeg/libavcodec/arm/hevcdsp_arm.h
vendored
Executable file
26
externals/ffmpeg/libavcodec/arm/hevcdsp_arm.h
vendored
Executable file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
|
||||
#define AVCODEC_ARM_HEVCDSP_ARM_H
|
||||
|
||||
#include "libavcodec/hevcdsp.h"
|
||||
|
||||
void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth);
|
||||
|
||||
#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
|
||||
385
externals/ffmpeg/libavcodec/arm/hevcdsp_deblock_neon.S
vendored
Executable file
385
externals/ffmpeg/libavcodec/arm/hevcdsp_deblock_neon.S
vendored
Executable file
@@ -0,0 +1,385 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
/* Load the two chroma tc values and bail out of the calling function
 * (bxeq lr) when their sum is zero, i.e. nothing needs filtering.
 * In:  r2 = pointer to int tc[2]
 * Out: r12 = tc[0], r3 = tc[1] (consumed by hevc_loop_filter_chroma_body)
 * Clobbers r2. */
.macro hevc_loop_filter_chroma_start
        ldr      r12, [r2]          @ r12 = tc[0]
        ldr      r3, [r2, #4]       @ r3  = tc[1]
        add      r2, r3, r12        @ tc[0] + tc[1]
        cmp      r2, #0
        it       eq
        bxeq     lr                 @ both zero -> return from caller
.endm
|
||||
|
||||
/* Chroma deblocking core for 8 pixels:
 *   delta = clip(-tc, tc, ((q0 - p0) * 4 + p1 - q1 + 4) >> 3)
 *   p0 += delta; q0 -= delta
 * In:  d2 = p0, d4 = q0, d18 = p1, d19 = q1 (8-bit), r12 = tc[0], r3 = tc[1]
 * Out: filtered p0 in d2, q0 in d4 (saturating narrow).
 * Clobbers q0-q3, q11, q12. */
.macro hevc_loop_filter_chroma_body
        vsubl.u8    q3, d4, d2          @ q0 - p0, widened
        vsubl.u8    q11, d18, d19       @ p1 - q1, widened
        vshl.i16    q3, #2              @ (q0 - p0) * 4
        vadd.i16    q11, q3             @ + p1 - q1
        vdup.16     d0, r12             @ q0reg = tc per 4-pixel half
        vdup.16     d1, r3
        vrshr.s16   q11, q11, #3        @ rounding shift: (... + 4) >> 3
        vneg.s16    q12, q0             @ -tc
        vmovl.u8    q2, d4              @ widen q0 for the subtract below
        vmin.s16    q11, q11, q0        @ clip delta to +tc
        vmax.s16    q11, q11, q12       @ ... and -tc
        vaddw.u8    q1, q11, d2         @ p0 + delta
        vsub.i16    q2, q11             @ q0 - delta
        vqmovun.s16 d2, q1              @ narrow with unsigned saturation
        vqmovun.s16 d4, q2
.endm
|
||||
|
||||
/* Load the two luma tc values and bail out of the calling function when
 * both are zero (packed check via lsl/orr so a single cmp covers both).
 * In:  r3 = pointer to int tc[2]
 * Out: r12 = tc[0], r3 = tc[1] (restored by the final lsr). */
.macro hevc_loop_filter_luma_start
        ldr      r12, [r3]          @ r12 = tc[0]
        ldr      r3, [r3, #4]       @ r3  = tc[1]
        lsl      r3, #16
        orr      r3, r12            @ r3 = tc[1] << 16 | tc[0]
        cmp      r3, #0
        it       eq
        bxeq     lr                 @ both zero -> return from caller
        lsr      r3, #16            @ restore r3 = tc[1]
.endm
|
||||
|
||||
/* Shared luma deblocking core (8-bit), two 4-pixel blocks at once.
 * In:  d16/d18/d20/d22/d24/d26/d28/d30 = rows p3 p2 p1 p0 q0 q1 q2 q3,
 *      r2 = beta, r12 = tc[0], r3 = tc[1]
 *      (as left by hevc_loop_filter_luma_start).
 * Out: filtered rows narrowed back into the same d registers at label 2.
 * Branches to the external label bypasswrite (defined in
 * ff_hevc_h_loop_filter_luma_neon) when no block passes the d < beta test.
 * Clobbers q0-q7 and r5-r9; callers push r5-r11 / d8-d15 beforehand. */
.macro hevc_loop_filter_luma_body
        vmovl.u8  q8, d16               @ widen p3..q3 to 16 bit
        vmovl.u8  q9, d18
        vmovl.u8  q10, d20
        vmovl.u8  q11, d22
        vmovl.u8  q12, d24
        vmovl.u8  q13, d26
        vmovl.u8  q14, d28
        vmovl.u8  q15, d30

        @ per-column activity: dp = |p2 - 2*p1 + p0|, dq = |q2 - 2*q1 + q0|
        vadd.i16  q7, q9, q11
        vadd.i16  q6, q14, q12
        vsub.i16  q7, q10
        vsub.i16  q6, q13
        vabd.s16  q7, q7, q10
        vabd.s16  q6, q6, q13

        vdup.16   q0, r2                @ q0 = beta in every lane
        vmov      q4, q7
        vmov      q5, q6
        vdup.16   d4, r12               @ d4 = tc[0]
        @ the trn/shift/orr dance below pairs up the lanes of each 4-pixel
        @ block so that q6/q7 hold the dp/dq sums of its outer columns
        vtrn.16   q7, q4
        vtrn.16   q6, q5

        vshl.u64  q7, #32
        vshr.u64  q4, #32
        vshl.u64  q6, #32
        vshr.u64  q5, #32
        vshr.u64  q7, #32
        vshr.u64  q6, #32
        vshl.u64  q5, #32
        vshl.u64  q4, #32
        vorr      q6, q5
        vorr      q7, q4
        vdup.16   d5, r3                @ d5 = tc[1]
        vadd.i16  q5, q7, q6            @ d = dp + dq

        vmov      q4, q5
        vmov      q3, q5
        vtrn.32   q3, q4

        vadd.i16  q4, q3                @ per-block sum of the two d values

        vshl.s16  q5, q5, #1            @ 2 * d (for the strong-filter test)
        vcgt.s16  q3, q0, q4            @ d < beta -> block gets filtered

        vmovn.i16 d6, q3
        vshr.s16  q1, q0, #2            @ q1 = beta >> 2
        vmovn.i16 d6, q3                @ NOTE(review): duplicate of the vmovn
                                        @ two lines up; appears redundant
        vcgt.s16  q5, q1, q5            @ 2*d < beta >> 2 ?
        vmov      r7, s12               @ r7 = packed filter-enable mask
        cmp       r7, #0
        beq       bypasswrite           @ neither block needs filtering

        vpadd.i32 d0, d14, d12          @ pack per-block dp/dq sums for later
        vpadd.i32 d1, d15, d13
        vmov      q4, q2                @ q4 = tc
        vshl.s16  q2, #2                @ 4 * tc
        vshr.s16  q1, q1, #1            @ beta >> 3
        vrhadd.s16 q2, q4               @ (5 * tc + 1) >> 1

        @ strong-filter tests: |p3 - p0| + |q3 - q0| < beta >> 3
        @ and |p0 - q0| < (5 * tc + 1) >> 1
        vabd.s16  q7, q8, q11
        vaba.s16  q7, q15, q12

        vmovn.i32 d0, q0
        vmov      r5, r6, s0, s1        @ r5/r6 = packed dp/dq block sums
        vcgt.s16  q6, q1, q7
        vand      q5, q5, q6
        vabd.s16  q7, q11, q12
        vcgt.s16  q6, q2, q7
        vand      q5, q5, q6

        @ replicate the strong decision across all lanes of each block
        vmov      q2, q5
        vtrn.s16  q5, q2
        vshr.u64  q2, #32
        vshl.u64  q5, #32
        vshl.u64  q2, #32
        vshr.u64  q5, #32
        vorr      q5, q2

        vmov      q2, q5
        vshl.i16  q7, q4, #1            @ q7 = 2*tc (strong-filter clip bound)
        vtrn.32   q2, q5
        vand      q5, q2                @ q5 = strong-filter lane mask
        vneg.s16  q6, q7                @ q6 = -2*tc
        vmovn.i16 d4, q5
        vmovn.i16 d4, q2                @ NOTE(review): also looks duplicated
        vmov      r8, s8                @ r8 = packed strong mask

        and       r9, r8, r7
        cmp       r9, #0
        beq       1f                    @ no block takes the strong filter

        @ ---- strong filter: p2..q2 rebuilt from rounded tap sums, each
        @ delta clipped to +/-2*tc (q6/q7) and merged under mask q5 ----
        vadd.i16  q2, q11, q12          @ p0 + q0
        vadd.i16  q4, q9, q8            @ p2 + p3
        vadd.i16  q1, q2, q10           @ p0 + q0 + p1
        vdup.16   d10, r9
        vadd.i16  q0, q1, q9            @ ... + p2
        vshl.i16  q4, #1
        lsr       r9, #16
        vadd.i16  q1, q0
        vrshr.s16 q3, q0, #2            @ p1 tap sum, rounded >> 2
        vadd.i16  q1, q13
        vadd.i16  q4, q0
        vsub.i16  q3, q10
        vrshr.s16 q1, #3                @ p0 tap sum, rounded >> 3
        vrshr.s16 q4, #3                @ p2 tap sum, rounded >> 3
        vmax.s16  q3, q6
        vsub.i16  q1, q11
        vsub.i16  q4, q9
        vmin.s16  q3, q7
        vmax.s16  q4, q6
        vmax.s16  q1, q6
        vadd.i16  q3, q10
        vmin.s16  q4, q7
        vmin.s16  q1, q7
        vdup.16   d11, r9
        vadd.i16  q4, q9
        vadd.i16  q1, q11
        vbit      q9, q4, q5            @ p2' under mask
        vadd.i16  q4, q2, q13           @ p0 + q0 + q1
        vbit      q11, q1, q5           @ p0' under mask
        vadd.i16  q0, q4, q14           @ ... + q2
        vadd.i16  q2, q15, q14          @ q3 + q2
        vadd.i16  q4, q0

        vshl.i16  q2, #1
        vadd.i16  q4, q10
        vbit      q10, q3, q5           @ p1' under mask
        vrshr.s16 q4, #3                @ q0 tap sum, rounded >> 3
        vadd.i16  q2, q0
        vrshr.s16 q3, q0, #2            @ q1 tap sum, rounded >> 2
        vsub.i16  q4, q12
        vrshr.s16 q2, #3                @ q2 tap sum, rounded >> 3
        vsub.i16  q3, q13
        vmax.s16  q4, q6
        vsub.i16  q2, q14
        vmax.s16  q3, q6
        vmin.s16  q4, q7
        vmax.s16  q2, q6
        vmin.s16  q3, q7
        vadd.i16  q4, q12
        vmin.s16  q2, q7
        vadd.i16  q3, q13
        vbit      q12, q4, q5           @ q0' under mask
        vadd.i16  q2, q14
        vbit      q13, q3, q5           @ q1' under mask
        vbit      q14, q2, q5           @ q2' under mask

1:
        @ ---- weak filter for the blocks that did not take the strong path ----
        mvn       r8, r8
        and       r9, r8, r7            @ enabled AND NOT strong
        cmp       r9, #0
        beq       2f

        vdup.16   q4, r2                @ beta again

        vdup.16   d10, r9               @ q5 = weak-filter mask
        lsr       r9, #16
        vmov      q1, q4
        vdup.16   d11, r9
        vshr.s16  q1, #1
        vsub.i16  q2, q12, q11          @ q0 - p0
        vadd.i16  q4, q1
        vshl.s16  q0, q2, #3            @ 8 * (q0 - p0)
        vshr.s16  q4, #3                @ (beta + (beta >> 1)) >> 3
        vadd.i16  q2, q0                @ 9 * (q0 - p0)
        vsub.i16  q0, q13, q10          @ q1 - p1
        vsub.i16  q2, q0
        vshl.i16  q0, q0, #1
        vsub.i16  q2, q0                @ 9*(q0-p0) - 3*(q1-p1)
        vshl.s16  q1, q7, 2             @ 8*tc (q7 = 2*tc).  NOTE(review): the
                                        @ immediate lacks '#'; gas accepts both
        vrshr.s16 q2, q2, #4            @ delta0
        vadd.i16  q1, q7                @ 10 * tc
        vabs.s16  q3, q2
        vshr.s16  q6, q6, #1            @ clip bounds shrink to +/-tc
        vcgt.s16  q1, q1, q3            @ |delta0| < 10*tc ?
        vand      q5, q1
        vshr.s16  q7, q7, #1
        vmax.s16  q2, q2, q6
        vmin.s16  q2, q2, q7            @ delta0 clipped to +/-tc

        vshr.s16  q7, q7, #1            @ side pixels clip to +/-(tc >> 1)
        vrhadd.s16 q3, q9, q11          @ (p2 + p0 + 1) >> 1
        vneg.s16  q6, q7
        vsub.s16  q3, q10
        vdup.16   d2, r5                @ per-block dp sums
        vhadd.s16 q3, q2                @ deltap1
        vdup.16   d3, r6
        vmax.s16  q3, q3, q6
        vcgt.s16  q1, q4, q1            @ dp below side threshold ?
        vmin.s16  q3, q3, q7
        vand      q1, q5
        vadd.i16  q3, q10
        lsr       r5, #16
        lsr       r6, #16
        vbit      q10, q3, q1           @ p1' under combined mask

        vrhadd.s16 q3, q14, q12         @ (q2 + q0 + 1) >> 1
        vdup.16   d2, r5                @ per-block dq sums
        vsub.s16  q3, q13
        vdup.16   d3, r6
        vhsub.s16 q3, q2                @ deltaq1
        vcgt.s16  q1, q4, q1            @ dq below side threshold ?
        vmax.s16  q3, q3, q6
        vand      q1, q5
        vmin.s16  q3, q3, q7
        vadd.i16  q3, q13
        vbit      q13, q3, q1           @ q1' under combined mask
        vadd.i16  q0, q11, q2
        vsub.i16  q4, q12, q2
        vbit      q11, q0, q5           @ p0' = p0 + delta0
        vbit      q12, q4, q5           @ q0' = q0 - delta0

2:
        vqmovun.s16 d16, q8             @ narrow all rows back to 8 bit
        vqmovun.s16 d18, q9
        vqmovun.s16 d20, q10
        vqmovun.s16 d22, q11
        vqmovun.s16 d24, q12
        vqmovun.s16 d26, q13
        vqmovun.s16 d28, q14
        vqmovun.s16 d30, q15
.endm
|
||||
|
||||
/* Vertical luma deblocking edge: load an 8x8 block straddling the edge
 * (4 columns each side), transpose so columns become rows, run the shared
 * filter body, transpose back and store.
 * r0 = pix (at the edge), r1 = stride, r2 = beta, r3 = tc pointer.
 * The body's early-out branches to bypasswrite, which lives in
 * ff_hevc_h_loop_filter_luma_neon below. */
function ff_hevc_v_loop_filter_luma_neon, export=1
        hevc_loop_filter_luma_start
        push     {r5-r11}
        vpush    {d8-d15}
        sub      r0, #4                  @ step back to column p3
        vld1.8   {d16}, [r0], r1
        vld1.8   {d18}, [r0], r1
        vld1.8   {d20}, [r0], r1
        vld1.8   {d22}, [r0], r1
        vld1.8   {d24}, [r0], r1
        vld1.8   {d26}, [r0], r1
        vld1.8   {d28}, [r0], r1
        vld1.8   {d30}, [r0], r1
        sub      r0, r0, r1, lsl #3      @ rewind 8 rows
        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
        hevc_loop_filter_luma_body
        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
        vst1.8   {d16}, [r0], r1
        vst1.8   {d18}, [r0], r1
        vst1.8   {d20}, [r0], r1
        vst1.8   {d22}, [r0], r1
        vst1.8   {d24}, [r0], r1
        vst1.8   {d26}, [r0], r1
        vst1.8   {d28}, [r0], r1
        vst1.8   {d30}, [r0]
        vpop     {d8-d15}
        pop      {r5-r11}
        bx       lr
endfunc
|
||||
|
||||
/* Horizontal luma deblocking edge: rows are already in filter order, so
 * load p3..q3 directly (no transpose) and store back only p2..q2 — the
 * filter never changes p3/q3.
 * r0 = pix (at the edge), r1 = stride, r2 = beta, r3 = tc pointer.
 * Also hosts the bypasswrite label used by the body's early exit. */
function ff_hevc_h_loop_filter_luma_neon, export=1
        hevc_loop_filter_luma_start
        push     {r5-r11}
        vpush    {d8-d15}
        sub      r0, r0, r1, lsl #2      @ back up 4 rows to p3
        vld1.8   {d16}, [r0], r1
        vld1.8   {d18}, [r0], r1
        vld1.8   {d20}, [r0], r1
        vld1.8   {d22}, [r0], r1
        vld1.8   {d24}, [r0], r1
        vld1.8   {d26}, [r0], r1
        vld1.8   {d28}, [r0], r1
        vld1.8   {d30}, [r0], r1
        sub      r0, r0, r1, lsl #3      @ rewind 8 rows...
        add      r0, r1                  @ ...then down one: start at p2
        hevc_loop_filter_luma_body
        vst1.8   {d18}, [r0], r1         @ p2
        vst1.8   {d20}, [r0], r1         @ p1
        vst1.8   {d22}, [r0], r1         @ p0
        vst1.8   {d24}, [r0], r1         @ q0
        vst1.8   {d26}, [r0], r1         @ q1
        vst1.8   {d28}, [r0]             @ q2
bypasswrite:
        vpop     {d8-d15}
        pop      {r5-r11}
        bx       lr
endfunc
|
||||
|
||||
/* Vertical chroma deblocking edge: load 8x8 around the edge, transpose so
 * the four relevant columns (p1 p0 q0 q1) land in d18/d2/d4/d19 as the
 * chroma body expects, filter, transpose back and store.
 * r0 = pix (at the edge), r1 = stride, r2 = tc pointer. */
function ff_hevc_v_loop_filter_chroma_neon, export=1
        hevc_loop_filter_chroma_start
        sub      r0, #4                  @ step back 4 columns
        vld1.8   {d16}, [r0], r1
        vld1.8   {d17}, [r0], r1
        vld1.8   {d18}, [r0], r1
        vld1.8   {d2},  [r0], r1
        vld1.8   {d4},  [r0], r1
        vld1.8   {d19}, [r0], r1
        vld1.8   {d20}, [r0], r1
        vld1.8   {d21}, [r0], r1
        sub      r0, r0, r1, lsl #3      @ rewind 8 rows
        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
        hevc_loop_filter_chroma_body
        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
        vst1.8   {d16}, [r0], r1
        vst1.8   {d17}, [r0], r1
        vst1.8   {d18}, [r0], r1
        vst1.8   {d2},  [r0], r1
        vst1.8   {d4},  [r0], r1
        vst1.8   {d19}, [r0], r1
        vst1.8   {d20}, [r0], r1
        vst1.8   {d21}, [r0]
        bx       lr
endfunc
|
||||
|
||||
/* Horizontal chroma deblocking edge: load the four rows p1 p0 q0 q1
 * straight into the registers the chroma body expects, filter, and write
 * back only p0/q0 (p1/q1 are read-only for chroma).
 * r0 = pix (at the edge), r1 = stride, r2 = tc pointer. */
function ff_hevc_h_loop_filter_chroma_neon, export=1
        hevc_loop_filter_chroma_start
        sub      r0, r0, r1, lsl #1      @ back up 2 rows to p1
        vld1.8   {d18}, [r0], r1         @ p1
        vld1.8   {d2},  [r0], r1         @ p0
        vld1.8   {d4},  [r0], r1         @ q0
        vld1.8   {d19}, [r0]             @ q1
        sub      r0, r0, r1, lsl #1      @ rewind to the p0 row
        hevc_loop_filter_chroma_body
        vst1.8   {d2},  [r0], r1         @ filtered p0
        vst1.8   {d4},  [r0]             @ filtered q0
        bx       lr
endfunc
|
||||
1043
externals/ffmpeg/libavcodec/arm/hevcdsp_idct_neon.S
vendored
Executable file
1043
externals/ffmpeg/libavcodec/arm/hevcdsp_idct_neon.S
vendored
Executable file
File diff suppressed because it is too large
Load Diff
34
externals/ffmpeg/libavcodec/arm/hevcdsp_init_arm.c
vendored
Executable file
34
externals/ffmpeg/libavcodec/arm/hevcdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
|
||||
#include "libavcodec/hevcdsp.h"
|
||||
#include "hevcdsp_arm.h"
|
||||
|
||||
/**
 * Entry point for ARM HEVC DSP initialisation: probes the CPU at runtime
 * and defers to the NEON initialiser when NEON is available.
 */
av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth)
{
    const int cpu_flags = av_get_cpu_flags();

    if (!have_neon(cpu_flags))
        return;

    ff_hevc_dsp_init_neon(c, bit_depth);
}
|
||||
320
externals/ffmpeg/libavcodec/arm/hevcdsp_init_neon.c
vendored
Executable file
320
externals/ffmpeg/libavcodec/arm/hevcdsp_init_neon.c
vendored
Executable file
@@ -0,0 +1,320 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/hevcdsp.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "hevcdsp_arm.h"
|
||||
|
||||
void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
|
||||
ptrdiff_t stride_dst, ptrdiff_t stride_src,
|
||||
int16_t *sao_offset_val, int sao_left_class,
|
||||
int width, int height);
|
||||
void ff_hevc_sao_edge_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
|
||||
int eo, int width, int height);
|
||||
|
||||
void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
|
||||
ptrdiff_t stride);
|
||||
void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
|
||||
void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_32x32_10_neon(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
|
||||
|
||||
#define PUT_PIXELS(name) \
|
||||
void name(int16_t *dst, uint8_t *src, \
|
||||
ptrdiff_t srcstride, int height, \
|
||||
intptr_t mx, intptr_t my, int width)
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
|
||||
PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
|
||||
#undef PUT_PIXELS
|
||||
|
||||
static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int width);
|
||||
static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int width, int height, int16_t* src2, ptrdiff_t src2stride);
|
||||
void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, intptr_t mx, intptr_t my, int width);
|
||||
void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, intptr_t mx, intptr_t my, int width);
|
||||
void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
|
||||
int16_t *src2,
|
||||
int height, intptr_t mx, intptr_t my, int width);
|
||||
#define QPEL_FUNC(name) \
|
||||
void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
|
||||
int height, int width)
|
||||
|
||||
QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
|
||||
QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
|
||||
#undef QPEL_FUNC
|
||||
|
||||
#define QPEL_FUNC_UW_PIX(name) \
|
||||
void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
|
||||
int height, intptr_t mx, intptr_t my, int width);
|
||||
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
|
||||
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
|
||||
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
|
||||
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
|
||||
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
|
||||
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
|
||||
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
|
||||
#undef QPEL_FUNC_UW_PIX
|
||||
|
||||
#define QPEL_FUNC_UW(name) \
|
||||
void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
|
||||
int width, int height, int16_t* src2, ptrdiff_t src2stride);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
|
||||
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
|
||||
#undef QPEL_FUNC_UW
|
||||
|
||||
void ff_hevc_sao_band_filter_neon_8(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int width, int height, int16_t *offset_table);
|
||||
|
||||
/**
 * SAO band filter wrapper (8 bit): expand the four band offsets
 * sao_offset_val[1..4] into a 32-entry table rotated by sao_left_class
 * (wrapping at 32), then invoke the NEON kernel.
 */
void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
                                            ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                            int16_t *sao_offset_val, int sao_left_class,
                                            int width, int height) {
    int16_t offset_table[32] = {0};
    int band;

    for (band = 0; band < 4; band++) {
        offset_table[(band + sao_left_class) & 31] = sao_offset_val[band + 1];
    }

    ff_hevc_sao_band_filter_neon_8(_dst, _src, stride_dst, stride_src,
                                   width, height, offset_table);
}
|
||||
|
||||
void ff_hevc_sao_edge_filter_neon_8(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int width, int height,
|
||||
int a_stride, int b_stride, int16_t *sao_offset_val, uint8_t *edge_idx);
|
||||
|
||||
void ff_hevc_sao_edge_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
|
||||
int eo, int width, int height) {
|
||||
static uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
|
||||
static const int8_t pos[4][2][2] = {
|
||||
{ { -1, 0 }, { 1, 0 } }, // horizontal
|
||||
{ { 0, -1 }, { 0, 1 } }, // vertical
|
||||
{ { -1, -1 }, { 1, 1 } }, // 45 degree
|
||||
{ { 1, -1 }, { -1, 1 } }, // 135 degree
|
||||
};
|
||||
uint8_t *dst = _dst;
|
||||
uint8_t *src = _src;
|
||||
int a_stride, b_stride;
|
||||
ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
|
||||
|
||||
a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
|
||||
b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
|
||||
|
||||
ff_hevc_sao_edge_filter_neon_8(dst, src, stride_dst, stride_src, width, height, a_stride, b_stride, sao_offset_val, edge_idx);
|
||||
}
|
||||
|
||||
void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, intptr_t mx, intptr_t my, int width) {
|
||||
|
||||
put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
|
||||
}
|
||||
|
||||
void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, intptr_t mx, intptr_t my, int width) {
|
||||
|
||||
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
|
||||
}
|
||||
|
||||
void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
|
||||
int16_t *src2,
|
||||
int height, intptr_t mx, intptr_t my, int width) {
|
||||
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
|
||||
}
|
||||
|
||||
/**
 * Install the NEON HEVC DSP implementations into the context.
 * Full coverage for 8 bit; for 10 bit only add_residual and the IDCTs are
 * provided.  The file-static put_hevc_qpel[_uw]_neon tables are filled
 * here and consumed by the ff_hevc_put_qpel_*_wrapper entry points.
 */
av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
    if (bit_depth == 8) {
        int x;
        /* deblocking filters */
        c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon;
        c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
        c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
        c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
        /* the same SAO wrapper serves every block-size slot */
        c->sao_band_filter[0] = ff_hevc_sao_band_filter_neon_8_wrapper;
        c->sao_band_filter[1] = ff_hevc_sao_band_filter_neon_8_wrapper;
        c->sao_band_filter[2] = ff_hevc_sao_band_filter_neon_8_wrapper;
        c->sao_band_filter[3] = ff_hevc_sao_band_filter_neon_8_wrapper;
        c->sao_band_filter[4] = ff_hevc_sao_band_filter_neon_8_wrapper;
        c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_neon_8_wrapper;
        c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_neon_8_wrapper;
        c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_neon_8_wrapper;
        c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_neon_8_wrapper;
        c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_neon_8_wrapper;
        /* inverse transforms, 4x4..32x32 */
        c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon;
        c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
        c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
        c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
        c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
        c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
        c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
        c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
        c->idct[0] = ff_hevc_idct_4x4_8_neon;
        c->idct[1] = ff_hevc_idct_8x8_8_neon;
        c->idct[2] = ff_hevc_idct_16x16_8_neon;
        c->idct[3] = ff_hevc_idct_32x32_8_neon;
        c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
        /* kernel tables indexed [my][mx]; hN = horizontal phase N,
         * vN = vertical phase N (see the wrappers above) */
        put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
        put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
        put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
        put_hevc_qpel_neon[0][1] = ff_hevc_put_qpel_h1_neon_8;
        put_hevc_qpel_neon[0][2] = ff_hevc_put_qpel_h2_neon_8;
        put_hevc_qpel_neon[0][3] = ff_hevc_put_qpel_h3_neon_8;
        put_hevc_qpel_neon[1][1] = ff_hevc_put_qpel_h1v1_neon_8;
        put_hevc_qpel_neon[1][2] = ff_hevc_put_qpel_h2v1_neon_8;
        put_hevc_qpel_neon[1][3] = ff_hevc_put_qpel_h3v1_neon_8;
        put_hevc_qpel_neon[2][1] = ff_hevc_put_qpel_h1v2_neon_8;
        put_hevc_qpel_neon[2][2] = ff_hevc_put_qpel_h2v2_neon_8;
        put_hevc_qpel_neon[2][3] = ff_hevc_put_qpel_h3v2_neon_8;
        put_hevc_qpel_neon[3][1] = ff_hevc_put_qpel_h1v3_neon_8;
        put_hevc_qpel_neon[3][2] = ff_hevc_put_qpel_h2v3_neon_8;
        put_hevc_qpel_neon[3][3] = ff_hevc_put_qpel_h3v3_neon_8;
        put_hevc_qpel_uw_neon[1][0] = ff_hevc_put_qpel_uw_v1_neon_8;
        put_hevc_qpel_uw_neon[2][0] = ff_hevc_put_qpel_uw_v2_neon_8;
        put_hevc_qpel_uw_neon[3][0] = ff_hevc_put_qpel_uw_v3_neon_8;
        put_hevc_qpel_uw_neon[0][1] = ff_hevc_put_qpel_uw_h1_neon_8;
        put_hevc_qpel_uw_neon[0][2] = ff_hevc_put_qpel_uw_h2_neon_8;
        put_hevc_qpel_uw_neon[0][3] = ff_hevc_put_qpel_uw_h3_neon_8;
        put_hevc_qpel_uw_neon[1][1] = ff_hevc_put_qpel_uw_h1v1_neon_8;
        put_hevc_qpel_uw_neon[1][2] = ff_hevc_put_qpel_uw_h2v1_neon_8;
        put_hevc_qpel_uw_neon[1][3] = ff_hevc_put_qpel_uw_h3v1_neon_8;
        put_hevc_qpel_uw_neon[2][1] = ff_hevc_put_qpel_uw_h1v2_neon_8;
        put_hevc_qpel_uw_neon[2][2] = ff_hevc_put_qpel_uw_h2v2_neon_8;
        put_hevc_qpel_uw_neon[2][3] = ff_hevc_put_qpel_uw_h3v2_neon_8;
        put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8;
        put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8;
        put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8;
        /* route every width slot through the generic wrappers for the
         * vertical-only, horizontal-only and diagonal cases */
        for (x = 0; x < 10; x++) {
            c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
            c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
            c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
            c->put_hevc_qpel_uni[x][1][0] = ff_hevc_put_qpel_uni_neon_wrapper;
            c->put_hevc_qpel_uni[x][0][1] = ff_hevc_put_qpel_uni_neon_wrapper;
            c->put_hevc_qpel_uni[x][1][1] = ff_hevc_put_qpel_uni_neon_wrapper;
            c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
            c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
            c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
        }
        /* whole-pel copies, one kernel per width slot */
        c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
        c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
        c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
        c->put_hevc_qpel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
        c->put_hevc_qpel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
        c->put_hevc_qpel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
        c->put_hevc_qpel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
        c->put_hevc_qpel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
        c->put_hevc_qpel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
        c->put_hevc_qpel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;

        /* NOTE(review): slots 0 (w2), 2 (w6) and 4 (w12) are left on the C
         * fallback here — presumably intentional; confirm upstream */
        c->put_hevc_qpel_uni[1][0][0] = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
        c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
        c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
        c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
        c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
        c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
        c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
    }

    if (bit_depth == 10) {
        c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
        c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
        c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
        c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon;

        c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_neon;
        c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon;
        c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon;
        c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon;

        c->idct[0] = ff_hevc_idct_4x4_10_neon;
        c->idct[1] = ff_hevc_idct_8x8_10_neon;
        c->idct[2] = ff_hevc_idct_16x16_10_neon;
        c->idct[3] = ff_hevc_idct_32x32_10_neon;
    }
}
|
||||
999
externals/ffmpeg/libavcodec/arm/hevcdsp_qpel_neon.S
vendored
Executable file
999
externals/ffmpeg/libavcodec/arm/hevcdsp_qpel_neon.S
vendored
Executable file
@@ -0,0 +1,999 @@
|
||||
/*
|
||||
* Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
#define MAX_PB_SIZE #64
|
||||
|
||||
/* Slide the vertical-filter row window down by one: d16..d22 <- d17..d23,
 * freeing d23 for the next row to be loaded. */
.macro regshuffle_d8
        vmov     d16, d17
        vmov     d17, d18
        vmov     d18, d19
        vmov     d19, d20
        vmov     d20, d21
        vmov     d21, d22
        vmov     d22, d23
.endm
|
||||
|
||||
/* Same window slide as regshuffle_d8, but for 16-bit rows held in full
 * q registers: q0..q6 <- q1..q7, freeing q7 for the next row. */
.macro regshuffle_q8
        vmov     q0, q1
        vmov     q1, q2
        vmov     q2, q3
        vmov     q3, q4
        vmov     q4, q5
        vmov     q5, q6
        vmov     q6, q7
.endm
|
||||
|
||||
/* Horizontal source fetch: prefetch and load 16 bytes from [r2] (advancing
 * r2 by stride r3 into q11 = d22:d23), then build the overlapping byte
 * windows d16..d22 at offsets 1..7 for the 8-tap horizontal filter.
 * d22 (offset 7) is produced last because it is also a source register. */
.macro vextin8
        pld      [r2]
        vld1.8   {q11}, [r2], r3
        vext.8   d16, d22, d23, #1
        vext.8   d17, d22, d23, #2
        vext.8   d18, d22, d23, #3
        vext.8   d19, d22, d23, #4
        vext.8   d20, d22, d23, #5
        vext.8   d21, d22, d23, #6
        vext.8   d22, d22, d23, #7
.endm
|
||||
|
||||
/* Vertical source fetch: load eight consecutive 8-byte rows into d16..d23,
 * prefetching each upcoming line.  r2 = src (advanced), r3 = stride. */
.macro loadin8
        pld      [r2]
        vld1.8   {d16}, [r2], r3
        pld      [r2]
        vld1.8   {d17}, [r2], r3
        pld      [r2]
        vld1.8   {d18}, [r2], r3
        pld      [r2]
        vld1.8   {d19}, [r2], r3
        pld      [r2]
        vld1.8   {d20}, [r2], r3
        pld      [r2]
        vld1.8   {d21}, [r2], r3
        pld      [r2]
        vld1.8   {d22}, [r2], r3
        pld      [r2]
        vld1.8   {d23}, [r2], r3
.endm
|
||||
|
||||
// 8-tap qpel interpolation, phase 1: coefficients
// { -1, 4, -10, 58, 17, -5, 1, 0 } applied to the 16-bit taps a..g held in
// q0-q6 (d0..d13 as low/high halves), accumulated in 32 bit.
// Output: q8 = saturating (sum >> 6).  Clobbers q9-q15.
.macro qpel_filter_1_32b
        vmov.i16   d16, #58
        vmov.i16   d17, #10
        vmull.s16  q9, d6, d16      // 58 * d0
        vmull.s16  q10, d7, d16     // 58 * d1
        vmov.i16   d16, #17
        vmull.s16  q11, d4, d17     // 10 * c0
        vmull.s16  q12, d5, d17     // 10 * c1
        vmov.i16   d17, #5
        vmull.s16  q13, d8, d16     // 17 * e0
        vmull.s16  q14, d9, d16     // 17 * e1
        vmull.s16  q15, d10, d17    // 5 * f0
        vmull.s16  q8, d11, d17     // 5 * f1
        vsub.s32   q9, q11          // 58 * d0 - 10 * c0
        vsub.s32   q10, q12         // 58 * d1 - 10 * c1
        vshll.s16  q11, d2, #2      // 4 * b0
        vshll.s16  q12, d3, #2      // 4 * b1
        vadd.s32   q9, q13          // 58 * d0 - 10 * c0 + 17 * e0
        vadd.s32   q10, q14         // 58 * d1 - 10 * c1 + 17 * e1
        vsubl.s16  q13, d12, d0     // g0 - a0
        vsubl.s16  q14, d13, d1     // g1 - a1
        vadd.s32   q9, q11          // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
        vadd.s32   q10, q12         // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
        vsub.s32   q13, q15         // g0 - a0 - 5 * f0
        vsub.s32   q14, q8          // g1 - a1 - 5 * f1
        vadd.s32   q9, q13          // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
        vadd.s32   q10, q14         // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
        vqshrn.s32 d16, q9, #6      // narrow both halves into q8
        vqshrn.s32 d17, q10, #6
.endm
|
||||
|
||||
// 8-tap qpel interpolation, phase 2 (half-pel): symmetric coefficients
// { -1, 4, -11, 40, 40, -11, 4, -1 } applied to the taps a..h in q0-q7,
// accumulated in 32 bit.
// input q0 - q7
// output q8 = saturating (sum >> 6); clobbers q9-q15.
.macro qpel_filter_2_32b
        vmov.i32   q8, #11
        vaddl.s16  q9, d6, d8       // d0 + e0
        vaddl.s16  q10, d7, d9      // d1 + e1
        vaddl.s16  q11, d4, d10     // c0 + f0
        vaddl.s16  q12, d5, d11     // c1 + f1
        vmul.s32   q11, q8          // 11 * (c0 + f0)
        vmul.s32   q12, q8          // 11 * (c1 + f1)
        vmov.i32   q8, #40
        vaddl.s16  q15, d2, d12     // b0 + g0
        vmul.s32   q9, q8           // 40 * (d0 + e0)
        vmul.s32   q10, q8          // 40 * (d1 + e1)
        vaddl.s16  q8, d3, d13      // b1 + g1
        vaddl.s16  q13, d0, d14     // a0 + h0
        vaddl.s16  q14, d1, d15     // a1 + h1
        vshl.s32   q15, #2          // 4*(b0+g0)
        vshl.s32   q8, #2           // 4*(b1+g1)
        vadd.s32   q11, q13         // 11 * (c0 + f0) + a0 + h0
        vadd.s32   q12, q14         // 11 * (c1 + f1) + a1 + h1
        vadd.s32   q9, q15          // 40 * (d0 + e0) + 4*(b0+g0)
        vadd.s32   q10, q8          // 40 * (d1 + e1) + 4*(b1+g1)
        vsub.s32   q9, q11          // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
        vsub.s32   q10, q12         // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
        vqshrn.s32 d16, q9, #6      // narrow both halves into q8
        vqshrn.s32 d17, q10, #6
.endm
|
||||
|
||||
// HEVC luma qpel phase-3 filter {0, 1, -5, 17, 58, -10, 4, -1} on 16-bit
// taps in q0-q7 — the mirror image of qpel_filter_1_32b, so the same code
// with the tap registers swapped end-for-end.  Result (sat >>6) in q8.
.macro qpel_filter_3_32b
        vmov.i16   d16, #58
        vmov.i16   d17, #10
        vmull.s16  q9, d8, d16   // 58 * d0
        vmull.s16  q10, d9, d16  // 58 * d1
        vmov.i16   d16, #17
        vmull.s16  q11, d10, d17 // 10 * c0
        vmull.s16  q12, d11, d17 // 10 * c1
        vmov.i16   d17, #5
        vmull.s16  q13, d6, d16  // 17 * e0
        vmull.s16  q14, d7, d16  // 17 * e1
        vmull.s16  q15, d4, d17  //  5 * f0
        vmull.s16  q8, d5, d17   //  5 * f1
        vsub.s32   q9, q11       // 58 * d0 - 10 * c0
        vsub.s32   q10, q12      // 58 * d1 - 10 * c1
        vshll.s16  q11, d12, #2  //  4 * b0
        vshll.s16  q12, d13, #2  //  4 * b1
        vadd.s32   q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
        vadd.s32   q10, q14      // 58 * d1 - 10 * c1 + 17 * e1
        vsubl.s16  q13, d2, d14  // g0 - a0
        vsubl.s16  q14, d3, d15  // g1 - a1
        vadd.s32   q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
        vadd.s32   q10, q12      // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
        vsub.s32   q13, q15      // g0 - a0 - 5 * f0
        vsub.s32   q14, q8       // g1 - a1 - 5 * f1
        vadd.s32   q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
        vadd.s32   q10, q14      // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
        vqshrn.s32 d16, q9, #6   // narrow with saturating >>6 into q8
        vqshrn.s32 d17, q10, #6
.endm
|
||||
|
||||
// HEVC luma qpel phase-1 filter {-1, 4, -10, 58, 17, -5, 1, 0} on 8-bit
// taps a..g held in d16-d22; 16-bit result in \out (default q7).
.macro qpel_filter_1 out=q7
        vmov.u8    d24, #58
        vmov.u8    d25, #10
        vshll.u8   q13, d20, #4    // 16*e
        vshll.u8   q14, d21, #2    // 4*f
        vmull.u8   \out, d19, d24  // 58*d
        vaddw.u8   q13, q13, d20   // 17*e
        vmull.u8   q15, d18, d25   // 10*c
        vaddw.u8   q14, q14, d21   // 5*f
        vsubl.u8   q12, d22, d16   // g - a
        vadd.u16   \out, q13       // 58d + 17e
        vshll.u8   q13, d17, #2    // 4*b
        vadd.u16   q15, q14        // 10*c + 5*f
        vadd.s16   q13, q12        // - a + 4*b + g
        vsub.s16   \out, q15       // -10*c + 58*d + 17*e -5*f
        vadd.s16   \out, q13       // -a + 4*b -10*c + 58*d + 17*e -5*f
.endm
|
||||
|
||||
// HEVC luma qpel phase-2 (half-pel) filter {-1, 4, -11, 40, 40, -11, 4, -1}
// on 8-bit taps a..h in d16-d23: computed as
// 4*(b + 10*(d+e) + g) - (a + 11*(c+f) + h).  16-bit result in \out.
.macro qpel_filter_2 out=q7
        vmov.i16   q12, #10
        vmov.i16   q14, #11
        vaddl.u8   q13, d19, d20   // d + e
        vaddl.u8   q15, d18, d21   // c + f
        vmul.u16   q13, q12        // 10 * (d+e)
        vmul.u16   q15, q14        // 11 * ( c + f)
        vaddl.u8   \out, d17, d22  // b + g
        vaddl.u8   q12, d16, d23   // a + h
        vadd.u16   \out, q13       // b + 10 * (d + e) + g
        vadd.s16   q12, q15        // a + 11*(c + f) + h
        vshl.u16   \out, #2        // 4 * (b + 10 * (d + e) + g)
        vsub.s16   \out, q12
.endm
|
||||
|
||||
// HEVC luma qpel phase-3 filter {0, 1, -5, 17, 58, -10, 4, -1} on 8-bit
// taps in d16-d23 — mirror of qpel_filter_1 (tap registers reversed).
// 16-bit result in \out (default q7).
.macro qpel_filter_3 out=q7
        vmov.u8    d24, #58
        vmov.u8    d25, #10
        vshll.u8   q13, d19, #4    // 16*e
        vshll.u8   q14, d18, #2    // 4*f
        vmull.u8   \out, d20, d24  // 58*d
        vaddw.u8   q13, q13, d19   // 17*e
        vmull.u8   q15, d21, d25   // 10*c
        vaddw.u8   q14, q14, d18   // 5*f
        vsubl.u8   q12, d17, d23   // g - a
        vadd.u16   \out, q13       // 58d + 17e
        vshll.u8   q13, d22, #2    // 4*b
        vadd.u16   q15, q14        // 10*c + 5*f
        vadd.s16   q13, q12        // - a + 4*b + g
        vsub.s16   \out, q15       // -10*c + 58*d + 17*e -5*f
        vadd.s16   \out, q13       // -a + 4*b -10*c + 58*d + 17*e -5*f
.endm
|
||||
|
||||
// Vertical qpel interpolation, 16-bit output (no rounding/narrowing):
//   r0 = int16_t *dst, r1 = dststride (in elements; doubled to bytes below)
//   r2 = uint8_t *src, r3 = srcstride
//   [sp+16] = height, [sp+20] = width (multiple of 8, or exactly 4)
// \filter is one of qpel_filter_1/2/3, producing q7 from taps d16-d23.
// Processes the image in 8-pixel-wide columns, sliding the 8-row tap
// window down one row per iteration (regshuffle_d8 + reload of d23).
.macro hevc_put_qpel_vX_neon_8 filter
        push   {r4, r5, r6, r7}
        ldr    r4, [sp, #16] // height
        ldr    r5, [sp, #20] // width
        vpush  {d8-d15}
        sub    r2, r2, r3, lsl #1   // src -= 3 * srcstride (filter history)
        sub    r2, r3
        mov    r12, r4              // r12 = saved height for next column
        mov    r6, r0               // r6  = column start in dst
        mov    r7, r2               // r7  = column start in src
        lsl    r1, #1               // dststride in bytes (16-bit samples)
0:      loadin8
        cmp    r5, #4
        beq    4f
8:      subs   r4, #1
        \filter
        vst1.16  {q7}, [r0], r1
        regshuffle_d8               // slide tap window down one row
        vld1.8   {d23}, [r2], r3
        bne    8b
        subs   r5, #8               // next 8-wide column
        beq    99f
        mov    r4, r12
        add    r6, #16
        mov    r0, r6
        add    r7, #8
        mov    r2, r7
        b      0b
4:      subs   r4, #1               // 4-wide tail: store/load half registers
        \filter
        vst1.16  d14, [r0], r1
        regshuffle_d8
        vld1.32  {d23[0]}, [r2], r3
        bne    4b
99:     vpop   {d8-d15}
        pop    {r4, r5, r6, r7}
        bx     lr
.endm
|
||||
|
||||
// Vertical qpel interpolation with 8-bit ("uw" = unweighted) output:
//   r0 = uint8_t *dst, r1 = dststride, r2 = uint8_t *src, r3 = srcstride
//   [sp+28] = width, [sp+32] = height,
//   [sp+36] = int16_t *src2 (second, bi-prediction source; NULL for uni),
//   [sp+40] = src2stride (in elements; doubled to bytes below)
// Uni path: filter result >>6 with rounding, saturated to u8.
// Bi path (.Lbi\@): adds the 16-bit src2 block, then >>7 with rounding.
.macro hevc_put_qpel_uw_vX_neon_8 filter
        push   {r4-r10}
        ldr    r5, [sp, #28] // width
        ldr    r4, [sp, #32] // height
        ldr    r8, [sp, #36] // src2
        ldr    r9, [sp, #40] // src2stride
        vpush  {d8-d15}
        sub    r2, r2, r3, lsl #1   // src -= 3 * srcstride
        sub    r2, r3
        mov    r12, r4
        mov    r6, r0
        mov    r7, r2
        cmp    r8, #0               // src2 == NULL -> uni-prediction
        bne    .Lbi\@
0:      loadin8
        cmp    r5, #4
        beq    4f
8:      subs   r4, #1
        \filter
        vqrshrun.s16  d0, q7, #6
        vst1.8        d0, [r0], r1
        regshuffle_d8
        vld1.8        {d23}, [r2], r3
        bne    8b
        subs   r5, #8
        beq    99f
        mov    r4, r12
        add    r6, #8
        mov    r0, r6
        add    r7, #8
        mov    r2, r7
        b      0b
4:      subs   r4, #1
        \filter
        vqrshrun.s16  d0, q7, #6
        vst1.32       d0[0], [r0], r1
        regshuffle_d8
        vld1.32       {d23[0]}, [r2], r3
        bne    4b
        b      99f
.Lbi\@: lsl    r9, #1               // src2stride in bytes
        mov    r10, r8              // r10 = column start in src2
0:      loadin8
        cmp    r5, #4
        beq    4f
8:      subs   r4, #1
        \filter
        vld1.16       {q0}, [r8], r9
        vqadd.s16     q0, q7         // add bi-pred source
        vqrshrun.s16  d0, q0, #7
        vst1.8        d0, [r0], r1
        regshuffle_d8
        vld1.8        {d23}, [r2], r3
        bne    8b
        subs   r5, #8
        beq    99f
        mov    r4, r12
        add    r6, #8
        mov    r0, r6
        add    r10, #16
        mov    r8, r10
        add    r7, #8
        mov    r2, r7
        b      0b
4:      subs   r4, #1
        \filter
        vld1.16       d0, [r8], r9
        vqadd.s16     d0, d14
        vqrshrun.s16  d0, q0, #7
        vst1.32       d0[0], [r0], r1
        regshuffle_d8
        vld1.32       {d23[0]}, [r2], r3
        bne    4b
99:     vpop   {d8-d15}
        pop    {r4-r10}
        bx     lr
.endm
|
||||
|
||||
// Instantiate the vertical qpel entry points: one exported function per
// fractional phase (v1/v2/v3), for the 16-bit and the 8-bit ("uw") variants.
function ff_hevc_put_qpel_v1_neon_8, export=1
        hevc_put_qpel_vX_neon_8 qpel_filter_1
endfunc

function ff_hevc_put_qpel_v2_neon_8, export=1
        hevc_put_qpel_vX_neon_8 qpel_filter_2
endfunc

function ff_hevc_put_qpel_v3_neon_8, export=1
        hevc_put_qpel_vX_neon_8 qpel_filter_3
endfunc


function ff_hevc_put_qpel_uw_v1_neon_8, export=1
        hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
endfunc

function ff_hevc_put_qpel_uw_v2_neon_8, export=1
        hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
endfunc

function ff_hevc_put_qpel_uw_v3_neon_8, export=1
        hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
endfunc
|
||||
|
||||
// Horizontal qpel interpolation, 16-bit output:
//   r0 = int16_t *dst, r1 = dststride (elements), r2 = uint8_t *src,
//   r3 = srcstride, [sp+16] = height, [sp+20] = width.
// vextin8 (defined earlier in this file) loads a row and spreads the
// horizontal taps across d16-d23; \filter then produces q7.
.macro hevc_put_qpel_hX_neon_8 filter
        push     {r4, r5, r6, r7}
        ldr      r4, [sp, #16] // height
        ldr      r5, [sp, #20] // width

        vpush    {d8-d15}
        sub      r2, #4             // src -= 3 (horizontal filter history)
        lsl      r1, #1             // dststride in bytes
        mov      r12, r4
        mov      r6, r0
        mov      r7, r2
        cmp      r5, #4
        beq      4f
8:      subs     r4, #1
        vextin8
        \filter
        vst1.16  {q7}, [r0], r1
        bne      8b
        subs     r5, #8             // next 8-wide column
        beq      99f
        mov      r4, r12
        add      r6, #16
        mov      r0, r6
        add      r7, #8
        mov      r2, r7
        cmp      r5, #4
        bne      8b
4:      subs     r4, #1             // 4-wide tail
        vextin8
        \filter
        vst1.16  d14, [r0], r1
        bne      4b
99:     vpop     {d8-d15}
        pop      {r4, r5, r6, r7}
        bx       lr
.endm
|
||||
|
||||
// Horizontal qpel interpolation with 8-bit output; same argument layout as
// hevc_put_qpel_uw_vX_neon_8 ([sp+28]=width, [sp+32]=height, [sp+36]=src2,
// [sp+40]=src2stride).  Uni path: sat-round >>6; bi path: +src2, >>7.
.macro hevc_put_qpel_uw_hX_neon_8 filter
        push     {r4-r10}
        ldr      r5, [sp, #28] // width
        ldr      r4, [sp, #32] // height
        ldr      r8, [sp, #36] // src2
        ldr      r9, [sp, #40] // src2stride
        vpush    {d8-d15}
        sub      r2, #4             // src -= 3
        mov      r12, r4
        mov      r6, r0
        mov      r7, r2
        cmp      r8, #0             // src2 == NULL -> uni-prediction
        bne      .Lbi\@
        cmp      r5, #4
        beq      4f
8:      subs     r4, #1
        vextin8
        \filter
        vqrshrun.s16  d0, q7, #6
        vst1.8        d0, [r0], r1
        bne      8b
        subs     r5, #8
        beq      99f
        mov      r4, r12
        add      r6, #8
        mov      r0, r6
        add      r7, #8
        mov      r2, r7
        cmp      r5, #4
        bne      8b
4:      subs     r4, #1
        vextin8
        \filter
        vqrshrun.s16  d0, q7, #6
        vst1.32       d0[0], [r0], r1
        bne      4b
        b        99f
.Lbi\@:
        lsl      r9, #1             // src2stride in bytes
        cmp      r5, #4
        beq      4f
        mov      r10, r8
8:      subs     r4, #1
        vextin8
        \filter
        vld1.16       {q0}, [r8], r9
        vqadd.s16     q0, q7
        vqrshrun.s16  d0, q0, #7
        vst1.8        d0, [r0], r1
        bne      8b
        subs     r5, #8
        beq      99f
        mov      r4, r12
        add      r6, #8
        add      r10, #16
        mov      r8, r10
        mov      r0, r6
        add      r7, #8
        mov      r2, r7
        cmp      r5, #4
        bne      8b
4:      subs     r4, #1
        vextin8
        \filter
        vld1.16       d0, [r8], r9
        vqadd.s16     d0, d14
        vqrshrun.s16  d0, q0, #7
        vst1.32       d0[0], [r0], r1
        bne      4b
99:     vpop     {d8-d15}
        pop      {r4-r10}
        bx       lr
.endm
|
||||
|
||||
// Instantiate the horizontal qpel entry points for phases h1/h2/h3,
// 16-bit and 8-bit ("uw") variants.
function ff_hevc_put_qpel_h1_neon_8, export=1
        hevc_put_qpel_hX_neon_8 qpel_filter_1
endfunc

function ff_hevc_put_qpel_h2_neon_8, export=1
        hevc_put_qpel_hX_neon_8 qpel_filter_2
endfunc

function ff_hevc_put_qpel_h3_neon_8, export=1
        hevc_put_qpel_hX_neon_8 qpel_filter_3
endfunc


function ff_hevc_put_qpel_uw_h1_neon_8, export=1
        hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
endfunc

function ff_hevc_put_qpel_uw_h2_neon_8, export=1
        hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
endfunc

function ff_hevc_put_qpel_uw_h3_neon_8, export=1
        hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
endfunc
|
||||
|
||||
// Combined 2-D qpel interpolation, 16-bit output: horizontal filter
// \filterh (8-bit taps -> one 16-bit row in q0..q7), then vertical filter
// \filterv (the *_32b variants, 16-bit taps -> q8).  The 8-row window of
// horizontally filtered rows is kept in q0-q7 and slid down one row per
// output line (regshuffle_q8 + one new \filterh row into q7).
//   r0 = int16_t *dst, r1 = dststride, r2 = src, r3 = srcstride
//   [sp+16] = height, [sp+20] = width
.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
        push     {r4, r5, r6, r7}
        ldr      r4, [sp, #16] // height
        ldr      r5, [sp, #20] // width

        vpush    {d8-d15}
        sub      r2, #4             // horizontal history
        sub      r2, r2, r3, lsl #1
        sub      r2, r3             // extra_before 3
        lsl      r1, #1
        mov      r12, r4
        mov      r6, r0
        mov      r7, r2
0:      vextin8                     // prime 8 filtered rows into q0-q7
        \filterh q0
        vextin8
        \filterh q1
        vextin8
        \filterh q2
        vextin8
        \filterh q3
        vextin8
        \filterh q4
        vextin8
        \filterh q5
        vextin8
        \filterh q6
        vextin8
        \filterh q7
        cmp      r5, #4
        beq      4f
8:      subs     r4, #1
        \filterv
        vst1.16  {q8}, [r0], r1
        regshuffle_q8               // slide vertical window down one row
        vextin8
        \filterh q7
        bne      8b
        subs     r5, #8
        beq      99f
        mov      r4, r12
        add      r6, #16
        mov      r0, r6
        add      r7, #8
        mov      r2, r7
        b        0b
4:      subs     r4, #1             // 4-wide tail
        \filterv
        vst1.16  d16, [r0], r1
        regshuffle_q8
        vextin8
        \filterh q7
        bne      4b
99:     vpop     {d8-d15}
        pop      {r4, r5, r6, r7}
        bx       lr
.endm
|
||||
|
||||
// Combined 2-D qpel interpolation with 8-bit output; same pipeline as
// hevc_put_qpel_hXvY_neon_8 plus the uni (sat-round >>6) / bi
// (+src2, sat-round >>7) output stages.
//   [sp+28] = width, [sp+32] = height, [sp+36] = src2, [sp+40] = src2stride
.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
        push     {r4-r10}
        ldr      r5, [sp, #28] // width
        ldr      r4, [sp, #32] // height
        ldr      r8, [sp, #36] // src2
        ldr      r9, [sp, #40] // src2stride
        vpush    {d8-d15}
        sub      r2, #4
        sub      r2, r2, r3, lsl #1
        sub      r2, r3             // extra_before 3
        mov      r12, r4
        mov      r6, r0
        mov      r7, r2
        cmp      r8, #0             // src2 == NULL -> uni-prediction
        bne      .Lbi\@
0:      vextin8
        \filterh q0
        vextin8
        \filterh q1
        vextin8
        \filterh q2
        vextin8
        \filterh q3
        vextin8
        \filterh q4
        vextin8
        \filterh q5
        vextin8
        \filterh q6
        vextin8
        \filterh q7
        cmp      r5, #4
        beq      4f
8:      subs     r4, #1
        \filterv
        vqrshrun.s16  d0, q8, #6
        vst1.8        d0, [r0], r1
        regshuffle_q8
        vextin8
        \filterh q7
        bne      8b
        subs     r5, #8
        beq      99f
        mov      r4, r12
        add      r6, #8
        mov      r0, r6
        add      r7, #8
        mov      r2, r7
        b        0b
4:      subs     r4, #1
        \filterv
        vqrshrun.s16  d0, q8, #6
        vst1.32       d0[0], [r0], r1
        regshuffle_q8
        vextin8
        \filterh q7
        bne      4b
        b        99f
.Lbi\@: lsl      r9, #1
        mov      r10, r8
0:      vextin8
        \filterh q0
        vextin8
        \filterh q1
        vextin8
        \filterh q2
        vextin8
        \filterh q3
        vextin8
        \filterh q4
        vextin8
        \filterh q5
        vextin8
        \filterh q6
        vextin8
        \filterh q7
        cmp      r5, #4
        beq      4f
8:      subs     r4, #1
        \filterv
        vld1.16       {q0}, [r8], r9
        vqadd.s16     q0, q8
        vqrshrun.s16  d0, q0, #7
        vst1.8        d0, [r0], r1
        regshuffle_q8
        vextin8
        \filterh q7
        bne      8b
        subs     r5, #8
        beq      99f
        mov      r4, r12
        add      r6, #8
        mov      r0, r6
        add      r10, #16
        mov      r8, r10
        add      r7, #8
        mov      r2, r7
        b        0b
4:      subs     r4, #1
        \filterv
        vld1.16       d0, [r8], r9
        vqadd.s16     d0, d16
        vqrshrun.s16  d0, q0, #7
        vst1.32       d0[0], [r0], r1
        regshuffle_q8
        vextin8
        \filterh q7
        bne      4b
99:     vpop     {d8-d15}
        pop      {r4-r10}
        bx       lr
.endm
|
||||
|
||||
|
||||
// Instantiate the nine 2-D (hX x vY) qpel entry points, 16-bit and 8-bit
// ("uw") variants: horizontal phase selects the 8-bit filter, vertical
// phase selects the 32-bit-intermediate filter.
function ff_hevc_put_qpel_h1v1_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_1, qpel_filter_1_32b
endfunc

function ff_hevc_put_qpel_h2v1_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_2, qpel_filter_1_32b
endfunc

function ff_hevc_put_qpel_h3v1_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_3, qpel_filter_1_32b
endfunc

function ff_hevc_put_qpel_h1v2_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_1, qpel_filter_2_32b
endfunc

function ff_hevc_put_qpel_h2v2_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_2, qpel_filter_2_32b
endfunc

function ff_hevc_put_qpel_h3v2_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_3, qpel_filter_2_32b
endfunc

function ff_hevc_put_qpel_h1v3_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_1, qpel_filter_3_32b
endfunc

function ff_hevc_put_qpel_h2v3_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_2, qpel_filter_3_32b
endfunc

function ff_hevc_put_qpel_h3v3_neon_8, export=1
        hevc_put_qpel_hXvY_neon_8 qpel_filter_3, qpel_filter_3_32b
endfunc


function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1, qpel_filter_1_32b
endfunc

function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2, qpel_filter_1_32b
endfunc

function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3, qpel_filter_1_32b
endfunc

function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1, qpel_filter_2_32b
endfunc

function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2, qpel_filter_2_32b
endfunc

function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3, qpel_filter_2_32b
endfunc

function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1, qpel_filter_3_32b
endfunc

function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2, qpel_filter_3_32b
endfunc

function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3, qpel_filter_3_32b
endfunc
|
||||
|
||||
// Common prologue for the put_pixels functions: prefetch the first two
// source rows and set r12 to the destination stride in bytes
// (MAX_PB_SIZE 16-bit samples per row).
.macro init_put_pixels
        pld     [r1]
        pld     [r1, r2]
        mov     r12, MAX_PB_SIZE
        lsl     r12, #1
.endm
|
||||
|
||||
// Copy a 2-pixel-wide block into the 16-bit intermediate buffer (<<6).
// Only the first two 16-bit lanes of each dst row may be written, so the
// result is merged into the existing dst contents with vbit under a
// 32-bit mask (d5 = 0x00000000FFFFFFFF after the shift).
//   r0 = int16_t *dst (stride MAX_PB_SIZE), r1 = src, r2 = srcstride,
//   r3 = height
function ff_hevc_put_pixels_w2_neon_8, export=1
        init_put_pixels
        vmov.u8      d5, #255
        vshr.u64     d5, #32          // keep only the low 32 bits
0:      subs         r3, #1
        vld1.32      {d0[0]}, [r1], r2
        pld          [r1]
        vld1.32      d6, [r0]         // existing dst row (for the merge)
        vshll.u8     q0, d0, #6       // pixels << 6
        vbit         d6, d0, d5       // insert the two new samples
        vst1.32      d6, [r0], r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 4-pixel-wide block into the 16-bit intermediate buffer (<<6),
// two rows per iteration.  r0 = int16_t *dst, r1 = src, r2 = srcstride,
// r3 = height (even).
function ff_hevc_put_pixels_w4_neon_8, export=1
        init_put_pixels
0:      subs         r3, #2
        vld1.32      {d0[0]}, [r1], r2
        vld1.32      {d0[1]}, [r1], r2
        pld          [r1]
        pld          [r1, r2]
        vshll.u8     q0, d0, #6
        vst1.64      {d0}, [r0], r12
        vst1.64      {d1}, [r0], r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 6-pixel-wide block into the 16-bit intermediate buffer (<<6).
// Loads 8 pixels per row but must write only 6 samples (96 bits), so the
// shifted row is merged into dst with vbit under a 96-bit mask
// (q10 = ones in the low 96 bits after the d21 shift).
function ff_hevc_put_pixels_w6_neon_8, export=1
        init_put_pixels
        vmov.u8      q10, #255
        vshr.u64     d21, #32         // mask: d20 all-ones, d21 low 32 bits
0:      subs         r3, #1
        vld1.16      {d0}, [r1], r2
        pld          [r1]
        vshll.u8     q0, d0, #6
        vld1.8       {q12}, [r0]      // existing dst row
        vbit         q12, q0, q10     // insert 6 samples, keep the rest
        vst1.8       {q12}, [r0], r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy an 8-pixel-wide block into the 16-bit intermediate buffer (<<6),
// two rows per iteration.
function ff_hevc_put_pixels_w8_neon_8, export=1
        init_put_pixels
0:      subs         r3, #2
        vld1.8       {d0}, [r1], r2
        vld1.8       {d2}, [r1], r2
        pld          [r1]
        pld          [r1, r2]
        vshll.u8     q0, d0, #6
        vshll.u8     q1, d2, #6
        vst1.16      {q0}, [r0], r12
        vst1.16      {q1}, [r0], r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 12-pixel-wide block into the 16-bit intermediate buffer (<<6),
// two rows per iteration: 8 pixels via d0/d2 and the 4-pixel remainders of
// both rows packed into d1 (lane 0 = row 0, lane 1 = row 1).
function ff_hevc_put_pixels_w12_neon_8, export=1
        init_put_pixels
0:      subs         r3, #2
        vld1.64      {d0}, [r1]       // row 0, first 8 pixels
        add          r1, #8
        vld1.32      {d1[0]}, [r1], r2 // row 0, last 4 pixels
        sub          r1, #8
        vld1.64      {d2}, [r1]       // row 1, first 8 pixels
        add          r1, #8
        vld1.32      {d1[1]}, [r1], r2 // row 1, last 4 pixels
        sub          r1, #8
        pld          [r1]
        pld          [r1, r2]
        vshll.u8     q8, d0, #6
        vshll.u8     q9, d1, #6       // q9: d18 = row0 tail, d19 = row1 tail
        vshll.u8     q10, d2, #6
        vmov         d22, d19
        vst1.64      {d16, d17, d18}, [r0], r12
        vst1.64      {d20, d21, d22}, [r0], r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 16-pixel-wide block into the 16-bit intermediate buffer (<<6),
// two rows per iteration.
function ff_hevc_put_pixels_w16_neon_8, export=1
        init_put_pixels
0:      subs         r3, #2
        vld1.8       {q0}, [r1], r2
        vld1.8       {q1}, [r1], r2
        pld          [r1]
        pld          [r1, r2]
        vshll.u8     q8, d0, #6
        vshll.u8     q9, d1, #6
        vshll.u8     q10, d2, #6
        vshll.u8     q11, d3, #6
        vst1.8       {q8, q9}, [r0], r12
        vst1.8       {q10, q11}, [r0], r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 24-pixel-wide block into the 16-bit intermediate buffer (<<6),
// one row per iteration (48 output bytes stored via vstm).
function ff_hevc_put_pixels_w24_neon_8, export=1
        init_put_pixels
0:      subs         r3, #1
        vld1.8       {d0, d1, d2}, [r1], r2
        pld          [r1]
        vshll.u8     q10, d0, #6
        vshll.u8     q11, d1, #6
        vshll.u8     q12, d2, #6
        vstm         r0, {q10, q11, q12}
        add          r0, r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 32-pixel-wide block into the 16-bit intermediate buffer (<<6),
// one row per iteration.
function ff_hevc_put_pixels_w32_neon_8, export=1
        init_put_pixels
0:      subs         r3, #1
        vld1.8       {q0, q1}, [r1], r2
        pld          [r1]
        vshll.u8     q8, d0, #6
        vshll.u8     q9, d1, #6
        vshll.u8     q10, d2, #6
        vshll.u8     q11, d3, #6
        vstm         r0, {q8, q9, q10, q11}
        add          r0, r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 48-pixel-wide block into the 16-bit intermediate buffer (<<6),
// one row per iteration (two loads of 32 + 16 pixels).
function ff_hevc_put_pixels_w48_neon_8, export=1
        init_put_pixels
0:      subs         r3, #1
        vld1.8       {q0, q1}, [r1]   // first 32 pixels
        add          r1, #32
        vld1.8       {q2}, [r1], r2   // last 16 pixels, advance to next row
        sub          r1, #32
        pld          [r1]
        vshll.u8     q8, d0, #6
        vshll.u8     q9, d1, #6
        vshll.u8     q10, d2, #6
        vshll.u8     q11, d3, #6
        vshll.u8     q12, d4, #6
        vshll.u8     q13, d5, #6
        vstm         r0, {q8, q9, q10, q11, q12, q13}
        add          r0, r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// Copy a 64-pixel-wide block into the 16-bit intermediate buffer (<<6),
// one row per iteration (two loads of 32 pixels each).
function ff_hevc_put_pixels_w64_neon_8, export=1
        init_put_pixels
0:      subs         r3, #1
        vld1.8       {q0, q1}, [r1]   // first 32 pixels
        add          r1, #32
        vld1.8       {q2, q3}, [r1], r2 // last 32 pixels, advance row
        sub          r1, #32
        pld          [r1]
        vshll.u8     q8, d0, #6
        vshll.u8     q9, d1, #6
        vshll.u8     q10, d2, #6
        vshll.u8     q11, d3, #6
        vshll.u8     q12, d4, #6
        vshll.u8     q13, d5, #6
        vshll.u8     q14, d6, #6
        vshll.u8     q15, d7, #6
        vstm         r0, {q8, q9, q10, q11, q12, q13, q14, q15}
        add          r0, r12
        bne          0b
        bx           lr
endfunc
|
||||
|
||||
// 8-wide full-pel copy with 8-bit output:
//   r0 = dst, r1 = dststride, r2 = src, r3 = srcstride
//   [sp+24] = width (loaded but unused here — this path is 8 wide),
//   [sp+28] = height, [sp+32] = src2, [sp+36] = src2stride
// Uni (src2 == NULL): straight row copy.  Bi: pixel<<6 + src2, then
// saturating rounded >>7 back to u8.
function ff_hevc_put_qpel_uw_pixels_neon_8, export=1
        push         {r4-r9}
        ldr          r5, [sp, #24] // width
        ldr          r4, [sp, #28] // height
        ldr          r8, [sp, #32] // src2
        ldr          r9, [sp, #36] // src2stride
        vpush        {d8-d15}
        cmp          r8, #0
        bne          2f
1:      subs         r4, #1        // uni: plain copy
        vld1.8       {d0}, [r2], r3
        vst1.8       d0, [r0], r1
        bne          1b
        vpop         {d8-d15}
        pop          {r4-r9}
        bx           lr
2:      subs         r4, #1        // bi: average with src2
        vld1.8       {d0}, [r2], r3
        vld1.16      {q1}, [r8], r9
        vshll.u8     q0, d0, #6
        vqadd.s16    q0, q1
        vqrshrun.s16 d0, q0, #7
        vst1.8       d0, [r0], r1
        bne          2b
        vpop         {d8-d15}
        pop          {r4-r9}
        bx           lr
endfunc
|
||||
|
||||
// Generate ff_hevc_put_qpel_uw_pixels_w<width>_neon_8: a plain uni-pred
// copy for widths whose row fits a single load (\regs.. name the register
// lists for 4 consecutive rows).  r0 = dst, r1 = dststride, r2 = src,
// r3 = srcstride, [sp] = height (multiple of 4).
.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4
function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
        ldr          r12, [sp] // height
1:      subs         r12, #4
        vld1.32      {\regs}  , [r2], r3
        vld1.32      {\regs2} , [r2], r3
        vld1.32      {\regs3} , [r2], r3
        vld1.32      {\regs4} , [r2], r3
        vst1.32      {\regs}  , [r0], r1
        vst1.32      {\regs2} , [r0], r1
        vst1.32      {\regs3} , [r0], r1
        vst1.32      {\regs4} , [r0], r1
        bne          1b
        bx           lr
endfunc
.endm
|
||||
|
||||
// Generate ff_hevc_put_qpel_uw_pixels_w<width>_neon_8 for widths that need
// two loads/stores per row (\regs + \regs2 cover one row, \regs3 + \regs4
// the next).  r4/r5 hold the row base so the post-incremented pointer can
// be rewound to base + stride after each split row.  Height multiple of 2.
.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4
function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
        push         {r4-r5}
        ldr          r12, [sp, #8] // height
1:      subs         r12, #2
        mov          r4, r2          // remember row start
        vld1.32      {\regs}  , [r2]!
        vld1.32      {\regs2} , [r2]
        add          r2, r4, r3      // next src row
        mov          r4, r2
        vld1.32      {\regs3} , [r2]!
        vld1.32      {\regs4} , [r2]
        add          r2, r4, r3
        mov          r5, r0          // remember dst row start
        vst1.32      {\regs}  , [r0]!
        vst1.32      {\regs2} , [r0]
        add          r0, r5, r1      // next dst row
        mov          r5, r0
        vst1.32      {\regs3} , [r0]!
        vst1.32      {\regs4} , [r0]
        add          r0, r5, r1
        bne          1b
        pop          {r4-r5}
        bx           lr
endfunc
.endm
|
||||
|
||||
// Instantiate the uni-pred full-pel copies for every supported width.
// Widths 12/48/64 need the split-row variant (_m); the rest fit one
// load/store per row.
put_qpel_uw_pixels   4,  d0[0],  d0[1],  d1[0],  d1[1]
put_qpel_uw_pixels   8,  d0,     d1,     d2,     d3
put_qpel_uw_pixels_m 12, d0,     d1[0],  d2,     d3[0]
put_qpel_uw_pixels   16, q0,     q1,     q2,     q3
put_qpel_uw_pixels   24, d0-d2,  d3-d5,  d16-d18, d19-d21
put_qpel_uw_pixels   32, q0-q1,  q2-q3,  q8-q9,  q10-q11
put_qpel_uw_pixels_m 48, q0-q1,  q2,     q8-q9,  q10
put_qpel_uw_pixels_m 64, q0-q1,  q2-q3,  q8-q9,  q10-q11
|
||||
177
externals/ffmpeg/libavcodec/arm/hevcdsp_sao_neon.S
vendored
Executable file
177
externals/ffmpeg/libavcodec/arm/hevcdsp_sao_neon.S
vendored
Executable file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Copyright (c) 2017 Meng Wang <wangmeng.kids@bytedance.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
// HEVC SAO band filter, 8-bit:
//   r0 = dst, r1 = src, r2 = dststride, r3 = srcstride
//   [sp+28] = width, [sp+32] = height, [sp+36] = offset_table (32 x s16)
// For each pixel: offset = offset_table[src >> 3]; dst = clip(src + offset).
// The 32-entry 16-bit table lives in q0-q3; a vtbl-style lookup only
// indexes bytes, so each 16-bit entry is fetched as two byte lookups
// (lowIndex = 2*index, highIndex = 2*index+1 in the upper byte lane), in
// two rounds of vtbx (entries 0-15 from q0-q1, 16-31 from q2-q3 after
// subtracting 32 byte positions from the index).
function ff_hevc_sao_band_filter_neon_8, export=1
        push       {r4-r10}
        ldr        r5, [sp, #28] // width
        ldr        r4, [sp, #32] // height
        ldr        r8, [sp, #36] // offset_table
        vpush      {d8-d15}
        mov        r12, r4       // r12 = height
        mov        r6, r0        // r6 = r0 = dst
        mov        r7, r1        // r7 = r1 = src
        vldm       r8, {q0-q3}   // whole offset table in registers
        vmov.u16   q15, #1
        vmov.u8    q14, #32
0:      pld        [r1]
        cmp        r5, #4
        beq        4f
8:      subs       r4, #1
        vld1.8     {d16}, [r1], r3
        vshr.u8    d17, d16, #3        // index = [src>>3]
        vshll.u8   q9, d17, #1         // lowIndex = 2*index
        vadd.u16   q11, q9, q15        // highIndex = (2*index+1) << 8
        vshl.u16   q10, q11, #8        // q10: highIndex; q9: lowIndex;
        vadd.u16   q10, q9             // combine high and low index;
        // Look-up Table Round 1; index range: 0-15
        vtbx.8     d24, {q0-q1}, d20
        vtbx.8     d25, {q0-q1}, d21
        // Look-up Table Round 2; index range: 16-31
        vsub.u8    q10, q14            // Look-up with 8bit
        vtbx.8     d24, {q2-q3}, d20
        vtbx.8     d25, {q2-q3}, d21
        vaddw.u8   q13, q12, d16       // src + offset
        vqmovun.s16 d8, q13            // clip to u8
        vst1.8     d8, [r0], r2
        bne        8b
        subs       r5, #8
        beq        99f
        mov        r4, r12
        add        r6, #8
        mov        r0, r6
        add        r7, #8
        mov        r1, r7
        b          0b
4:      subs       r4, #1               // 4-wide tail, same lookup
        vld1.32    {d16[0]}, [r1], r3
        vshr.u8    d17, d16, #3        // src>>3
        vshll.u8   q9, d17, #1         // lowIndex = 2*index
        vadd.u16   q11, q9, q15        // highIndex = (2*index+1) << 8
        vshl.u16   q10, q11, #8        // q10: highIndex; q9: lowIndex;
        vadd.u16   q10, q9             // combine high and low index;
        // Look-up Table Round 1; index range: 0-15
        vtbx.8     d24, {q0-q1}, d20
        vtbx.8     d25, {q0-q1}, d21
        // Look-up Table Round 2; index range: 16-32
        vsub.u8    q10, q14            // Look-up with 8bit
        vtbx.8     d24, {q2-q3}, d20
        vtbx.8     d25, {q2-q3}, d21
        vaddw.u8   q13, q12, d16
        vqmovun.s16 d14, q13
        vst1.32    d14[0], [r0], r2
        bne        4b
        b          99f
99:
        vpop       {d8-d15}
        pop        {r4-r10}
        bx         lr
endfunc
|
||||
|
||||
// HEVC SAO edge filter, 8-bit:
//   r0 = dst, r1 = src, r2 = dststride, r3 = srcstride
//   [sp+32] = width, [sp+36] = height, [sp+40] = a_stride, [sp+44] = b_stride
//   [sp+48] = sao_offset_val (5 x s16), [sp+52] = edge_idx (5 x u8)
// Per pixel: diff0 = sign(src[x] - src[x+a_stride]),
//            diff1 = sign(src[x] - src[x+b_stride]);
// offset = sao_offset_val[edge_idx[2 + diff0 + diff1]]; dst = clip(src+offset).
// sign() is built from vcgt (gives -1 -> shifted to +1) and vclt (gives -1).
// The 16-bit offset table lookup uses the same split low/high-byte vtbx
// scheme as the band filter.
function ff_hevc_sao_edge_filter_neon_8, export=1
        push       {r4-r11}
        ldr        r5, [sp, #32]  // width
        ldr        r4, [sp, #36]  // height
        ldr        r8, [sp, #40]  // a_stride
        ldr        r9, [sp, #44]  // b_stride
        ldr        r10, [sp, #48] // sao_offset_val
        ldr        r11, [sp, #52] // edge_idx
        vpush      {d8-d15}
        mov        r12, r4        // r12 = height
        mov        r6, r0         // r6 = r0 = dst
        mov        r7, r1         // r7 = r1 = src
        vld1.8     {d0}, [r11]    // edge_idx tabel load in d0 5x8bit
        vld1.16    {q1}, [r10]    // sao_offset_val table load in q1, 5x16bit
        vmov.u8    d1, #2
        vmov.u16   q2, #1
0:      mov        r10, r1
        add        r10, r8        // src[x + a_stride]
        mov        r11, r1
        add        r11, r9        // src[x + b_stride]
        pld        [r1]
        cmp        r5, #4
        beq        4f
8:      subs       r4, #1
        vld1.8     {d16}, [r1], r3   // src[x] 8x8bit
        vld1.8     {d17}, [r10], r3  // src[x + a_stride]
        vld1.8     {d18}, [r11], r3  // src[x + b_stride]
        vcgt.u8    d8, d16, d17
        vshr.u8    d9, d8, #7        // +1 where src > neighbour a
        vclt.u8    d8, d16, d17      // -1 where src < neighbour a
        vadd.u8    d8, d9            // diff0
        vcgt.u8    d10, d16, d18
        vshr.u8    d11, d10, #7
        vclt.u8    d10, d16, d18
        vadd.u8    d10, d11          // diff1
        vadd.s8    d8, d10
        vadd.s8    d8, d1            // index = 2 + diff0 + diff1
        vtbx.8     d9, {d0}, d8      // offset_val
        vshll.u8   q6, d9, #1        // lowIndex
        vadd.u16   q7, q6, q2
        vshl.u16   q10, q7, #8       // highIndex
        vadd.u16   q10, q6           // combine lowIndex and highIndex, offset_val
        vtbx.8     d22, {q1}, d20
        vtbx.8     d23, {q1}, d21
        vaddw.u8   q12, q11, d16     // src + offset
        vqmovun.s16 d26, q12         // clip to u8
        vst1.8     d26, [r0], r2
        bne        8b
        subs       r5, #8
        beq        99f
        mov        r4, r12
        add        r6, #8
        mov        r0, r6
        add        r7, #8
        mov        r1, r7
        b          0b
4:      subs       r4, #1              // 4-wide tail, same computation
        vld1.32    {d16[0]}, [r1], r3
        vld1.32    {d17[0]}, [r10], r3 // src[x + a_stride]
        vld1.32    {d18[0]}, [r11], r3 // src[x + b_stride]
        vcgt.u8    d8, d16, d17
        vshr.u8    d9, d8, #7
        vclt.u8    d8, d16, d17
        vadd.u8    d8, d9            // diff0
        vcgt.u8    d10, d16, d18
        vshr.u8    d11, d10, #7
        vclt.u8    d10, d16, d18
        vadd.u8    d10, d11          // diff1
        vadd.s8    d8, d10
        vadd.s8    d8, d1
        vtbx.8     d9, {d0}, d8      // offset_val
        vshll.u8   q6, d9, #1        // lowIndex
        vadd.u16   q7, q6, q2
        vshl.u16   q10, q7, #8       // highIndex
        vadd.u16   q10, q6           // combine lowIndex and highIndex, offset_val
        vtbx.8     d22, {q1}, d20
        vtbx.8     d23, {q1}, d21
        vaddw.u8   q12, q11, d16
        vqmovun.s16 d26, q12
        vst1.32    d26[0], [r0], r2
        bne        4b
        b          99f
99:
        vpop       {d8-d15}
        pop        {r4-r11}
        bx         lr
endfunc
|
||||
603
externals/ffmpeg/libavcodec/arm/hpeldsp_arm.S
vendored
Executable file
603
externals/ffmpeg/libavcodec/arm/hpeldsp_arm.S
vendored
Executable file
@@ -0,0 +1,603 @@
|
||||
@
|
||||
@ ARMv4-optimized halfpel functions
|
||||
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
|
||||
@
|
||||
@ This file is part of FFmpeg.
|
||||
@
|
||||
@ FFmpeg is free software; you can redistribute it and/or
|
||||
@ modify it under the terms of the GNU Lesser General Public
|
||||
@ License as published by the Free Software Foundation; either
|
||||
@ version 2.1 of the License, or (at your option) any later version.
|
||||
@
|
||||
@ FFmpeg is distributed in the hope that it will be useful,
|
||||
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
@ Lesser General Public License for more details.
|
||||
@
|
||||
@ You should have received a copy of the GNU Lesser General Public
|
||||
@ License along with FFmpeg; if not, write to the Free Software
|
||||
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
@
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
#if !HAVE_ARMV5TE_EXTERNAL
|
||||
#define pld @
|
||||
#endif
|
||||
|
||||
@ Extract a 16-byte value at byte offset \shift (1..3) from the five
@ consecutive words Rn0..Rn4 into Rd0..Rd3 — the word-register equivalent
@ of an unaligned 16-byte load (little-endian shift/or recombination).
.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        mov             \Rd0, \Rn0, lsr #(\shift * 8)
        mov             \Rd1, \Rn1, lsr #(\shift * 8)
        mov             \Rd2, \Rn2, lsr #(\shift * 8)
        mov             \Rd3, \Rn3, lsr #(\shift * 8)
        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
|
||||
@ In-place variant: shift the 8-byte value in R0/R1 right by \shift bytes,
@ pulling the vacated bytes in from R1/R2 (R2 is read only).
.macro  ALIGN_DWORD shift, R0, R1, R2
        mov             \R0, \R0, lsr #(\shift * 8)
        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
        mov             \R1, \R1, lsr #(\shift * 8)
        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
|
||||
@ Non-destructive variant of ALIGN_DWORD: extract the 8-byte value at byte
@ offset \shift from Rsrc0..Rsrc2 into Rdst0/Rdst1.
.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm
|
||||
|
||||
@ Packed bytewise rounding average of two 8-byte values:
@ per byte, Rd = (Rn + Rm + 1) >> 1, computed SWAR-style without
@ cross-byte carries.
.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroy
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        orr             \Rn0, \Rn0, \Rm0
        orr             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        sub             \Rd0, \Rn0, \Rd0, lsr #1
        sub             \Rd1, \Rn1, \Rd1, lsr #1
.endm
|
||||
|
||||
@ Packed bytewise truncating average of two 8-byte values:
@ per byte, Rd = (Rn + Rm) >> 1 (no rounding), SWAR-style.
.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroy
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        and             \Rn0, \Rn0, \Rm0
        and             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        add             \Rd0, \Rn0, \Rd0, lsr #1
        add             \Rd1, \Rn1, \Rd1, lsr #1
.endm
|
||||
|
||||
@ Dispatch on the byte alignment of the source pointer \reg:
@ word-aligns \reg and branches to the caller-defined local label
@ 1f/2f/3f/4f for alignment 0/1/2/3 respectively. \tmp is clobbered.
.macro JMP_ALIGN tmp, reg
        ands \tmp, \reg, #3
        bic \reg, \reg, #3
        beq 1f
        subs \tmp, \tmp, #1
        beq 2f
        subs \tmp, \tmp, #1
        beq 3f
        b 4f
.endm
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
function ff_put_pixels16_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ Copies a 16-pixel-wide block, one row per loop iteration,
        @ with one loop per source alignment (selected by JMP_ALIGN).
        pld [r1]
        push {r4-r11, lr}
        JMP_ALIGN r5, r1
1:
        @ source word-aligned: plain four-word load/store
        ldm r1, {r4-r7}
        add r1, r1, r2
        stm r0, {r4-r7}
        pld [r1]
        subs r3, r3, #1
        add r0, r0, r2
        bne 1b
        pop {r4-r11, pc}
        .align 5
2:
        @ source offset 1: load five words, realign down one byte
        ldm r1, {r4-r8}
        add r1, r1, r2
        ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld [r1]
        subs r3, r3, #1
        stm r0, {r9-r12}
        add r0, r0, r2
        bne 2b
        pop {r4-r11, pc}
        .align 5
3:
        @ source offset 2
        ldm r1, {r4-r8}
        add r1, r1, r2
        ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld [r1]
        subs r3, r3, #1
        stm r0, {r9-r12}
        add r0, r0, r2
        bne 3b
        pop {r4-r11, pc}
        .align 5
4:
        @ source offset 3
        ldm r1, {r4-r8}
        add r1, r1, r2
        ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld [r1]
        subs r3, r3, #1
        stm r0, {r9-r12}
        add r0, r0, r2
        bne 4b
        pop {r4-r11,pc}
endfunc
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
function ff_put_pixels8_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ 8-wide copy; same per-alignment dispatch as the 16-wide version
        @ but with a two-word row (plus one extra word for realignment).
        pld [r1]
        push {r4-r5,lr}
        JMP_ALIGN r5, r1
1:
        @ source word-aligned
        ldm r1, {r4-r5}
        add r1, r1, r2
        subs r3, r3, #1
        pld [r1]
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 1b
        pop {r4-r5,pc}
        .align 5
2:
        @ source offset 1
        ldm r1, {r4-r5, r12}
        add r1, r1, r2
        ALIGN_DWORD 1, r4, r5, r12
        pld [r1]
        subs r3, r3, #1
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 2b
        pop {r4-r5,pc}
        .align 5
3:
        @ source offset 2
        ldm r1, {r4-r5, r12}
        add r1, r1, r2
        ALIGN_DWORD 2, r4, r5, r12
        pld [r1]
        subs r3, r3, #1
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 3b
        pop {r4-r5,pc}
        .align 5
4:
        @ source offset 3
        ldm r1, {r4-r5, r12}
        add r1, r1, r2
        ALIGN_DWORD 3, r4, r5, r12
        pld [r1]
        subs r3, r3, #1
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 4b
        pop {r4-r5,pc}
endfunc
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
function ff_put_pixels8_x2_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ Horizontal half-pel: rounded byte-average of pixels[x] and
        @ pixels[x+1] for each row. r12 holds the 0xFEFEFEFE mask used
        @ by RND_AVG32 throughout.
        pld [r1]
        push {r4-r10,lr}
        ldr r12, =0xfefefefe
        JMP_ALIGN r5, r1
1:
        @ offset 0: a = r4:r5 (bytes 0..7), b = bytes 1..8 via realign
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
        pld [r1]
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        subs r3, r3, #1
        stm r0, {r8-r9}
        add r0, r0, r2
        bne 1b
        pop {r4-r10,pc}
        .align 5
2:
        @ offset 1: a = bytes 1..8, b = bytes 2..9
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
        ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
        pld [r1]
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 2b
        pop {r4-r10,pc}
        .align 5
3:
        @ offset 2: a = bytes 2..9, b = bytes 3..10
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
        ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
        pld [r1]
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 3b
        pop {r4-r10,pc}
        .align 5
4:
        @ offset 3: a = bytes 3..10 (realigned), b = bytes 4..11,
        @ which is simply the raw words r5:r10 — no second realign needed
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
        pld [r1]
        RND_AVG32 r8, r9, r6, r7, r5, r10, r12
        subs r3, r3, #1
        stm r0, {r8-r9}
        add r0, r0, r2
        bne 4b
        pop {r4-r10,pc}
endfunc
|
||||
|
||||
function ff_put_no_rnd_pixels8_x2_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ Identical structure to ff_put_pixels8_x2_arm, but uses the
        @ truncating NO_RND_AVG32 instead of the rounding average.
        pld [r1]
        push {r4-r10,lr}
        ldr r12, =0xfefefefe
        JMP_ALIGN r5, r1
1:
        @ offset 0
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        subs r3, r3, #1
        stm r0, {r8-r9}
        add r0, r0, r2
        bne 1b
        pop {r4-r10,pc}
        .align 5
2:
        @ offset 1
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
        ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 2b
        pop {r4-r10,pc}
        .align 5
3:
        @ offset 2
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
        ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stm r0, {r4-r5}
        add r0, r0, r2
        bne 3b
        pop {r4-r10,pc}
        .align 5
4:
        @ offset 3: second operand is the raw words r5:r10 (offset 4)
        ldm r1, {r4-r5, r10}
        add r1, r1, r2
        ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
        subs r3, r3, #1
        stm r0, {r8-r9}
        add r0, r0, r2
        bne 4b
        pop {r4-r10,pc}
endfunc
|
||||
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
function ff_put_pixels8_y2_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ Vertical half-pel: rounded average of each row with the row
        @ below. Processes two output rows per iteration so each source
        @ row is loaded once; h is halved up front (assumes even h —
        @ standard for the hpeldsp row counts).
        pld [r1]
        push {r4-r11,lr}
        mov r3, r3, lsr #1
        ldr r12, =0xfefefefe
        JMP_ALIGN r5, r1
1:
        @ offset 0: prime r4:r5 with the first row
        ldm r1, {r4-r5}
        add r1, r1, r2
6:      ldm r1, {r6-r7}
        add r1, r1, r2
        pld [r1]
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        ldm r1, {r4-r5}
        add r1, r1, r2
        stm r0, {r8-r9}
        add r0, r0, r2
        pld [r1]
        RND_AVG32 r8, r9, r6, r7, r4, r5, r12
        subs r3, r3, #1
        stm r0, {r8-r9}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
        .align 5
2:
        @ offset 1: every load realigned by one byte before averaging
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 1, r4, r5, r6
6:      ldm r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 1, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 1, r4, r5, r6
        subs r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
        .align 5
3:
        @ offset 2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 2, r4, r5, r6
6:      ldm r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 2, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 2, r4, r5, r6
        subs r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
        .align 5
4:
        @ offset 3
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 3, r4, r5, r6
6:      ldm r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 3, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 3, r4, r5, r6
        subs r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
endfunc
|
||||
|
||||
function ff_put_no_rnd_pixels8_y2_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ Same two-rows-per-iteration vertical averaging as
        @ ff_put_pixels8_y2_arm, with the truncating NO_RND_AVG32.
        pld [r1]
        push {r4-r11,lr}
        mov r3, r3, lsr #1
        ldr r12, =0xfefefefe
        JMP_ALIGN r5, r1
1:
        @ offset 0
        ldm r1, {r4-r5}
        add r1, r1, r2
6:      ldm r1, {r6-r7}
        add r1, r1, r2
        pld [r1]
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        ldm r1, {r4-r5}
        add r1, r1, r2
        stm r0, {r8-r9}
        add r0, r0, r2
        pld [r1]
        NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
        subs r3, r3, #1
        stm r0, {r8-r9}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
        .align 5
2:
        @ offset 1
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 1, r4, r5, r6
6:      ldm r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 1, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 1, r4, r5, r6
        subs r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
        .align 5
3:
        @ offset 2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 2, r4, r5, r6
6:      ldm r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 2, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 2, r4, r5, r6
        subs r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
        .align 5
4:
        @ offset 3
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 3, r4, r5, r6
6:      ldm r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 3, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        ldm r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ALIGN_DWORD 3, r4, r5, r6
        subs r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stm r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        pop {r4-r11,pc}
endfunc
|
||||
|
||||
.ltorg
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
@ One source line of the xy2 (diagonal half-pel) computation:
@ loads 12 bytes, forms a = line[\align..] and b = line[\align+1..]
@ (in r4:r5 / r6:r7), then splits each byte sum into
@   l1 = (a & 0x03030303) + (b & 0x03030303) ?(+ rounding)  -> r8:r9
@   h1 = ((a & 0xFCFCFCFC) >> 2) + ((b & 0xFCFCFCFC) >> 2)  -> r10:r11
@ \rnd selects the rounding constant via `and r14, r14, r14, \rnd #1`:
@ lsl yields 0x02020202 (rounding), lsr yields 0x01010101 (no-rnd);
@ it is added only on alternate iterations (tst r3, #1 / andeq), so the
@ two-line combination in RND_XY2_EXPAND gets the constant once.
@ Also decrements r3, leaving the flags for the caller's loop branch.
.macro RND_XY2_IT align, rnd
        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
.if \align == 0
        ldm r1, {r6-r8}
.elseif \align == 3
        ldm r1, {r5-r7}
.else
        ldm r1, {r8-r10}
.endif
        add r1, r1, r2
        pld [r1]
.if \align == 0
        ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
.elseif \align == 1
        ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
        ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
.elseif \align == 2
        ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
        ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
.elseif \align == 3
        @ Rd overlaps Rsrc here; safe because ALIGN_DWORD_D reads r5
        @ before overwriting it. b (offset 4) stays as raw r6:r7.
        ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
.endif
        ldr r14, =0x03030303
        tst r3, #1
        and r8, r4, r14
        and r9, r5, r14
        and r10, r6, r14
        and r11, r7, r14
        it eq
        andeq r14, r14, r14, \rnd #1
        add r8, r8, r10
        add r9, r9, r11
        ldr r12, =0xfcfcfcfc >> 2
        itt eq
        addeq r8, r8, r14
        addeq r9, r9, r14
        and r4, r12, r4, lsr #2
        and r5, r12, r5, lsr #2
        and r6, r12, r6, lsr #2
        and r7, r12, r7, lsr #2
        add r10, r4, r6
        add r11, r5, r7
        subs r3, r3, #1
.endm
|
||||
|
||||
@ Loop body for the xy2 functions: keeps the previous line's partial
@ sums, computes the next line's, and combines them into the 4-tap
@ average  out = (((l1a + l1b) >> 2) & 0x0F0F0F0F) + h1a + h1b.
@ Loops on the flags left by the subs inside RND_XY2_IT (bge), then
@ returns directly — so expansions placed back-to-back do not fall
@ through into each other.
.macro RND_XY2_EXPAND align, rnd
        RND_XY2_IT \align, \rnd
6:      push {r8-r11}
        RND_XY2_IT \align, \rnd
        pop {r4-r7}
        add r4, r4, r8
        add r5, r5, r9
        ldr r14, =0x0f0f0f0f
        add r6, r6, r10
        add r7, r7, r11
        and r4, r14, r4, lsr #2
        and r5, r14, r5, lsr #2
        add r4, r4, r6
        add r5, r5, r7
        stm r0, {r4-r5}
        add r0, r0, r2
        bge 6b
        pop {r4-r11,pc}
.endm
|
||||
|
||||
function ff_put_pixels8_xy2_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ Diagonal half-pel with rounding (\rnd = lsl selects the
        @ 0x02020202 rounding constant in RND_XY2_IT). JMP_ALIGN picks
        @ one expansion per source alignment; each returns via pop pc.
        pld [r1]
        push {r4-r11,lr} @ R14 is also called LR
        JMP_ALIGN r5, r1
1:      RND_XY2_EXPAND 0, lsl
        .align 5
2:      RND_XY2_EXPAND 1, lsl
        .align 5
3:      RND_XY2_EXPAND 2, lsl
        .align 5
4:      RND_XY2_EXPAND 3, lsl
endfunc
|
||||
|
||||
function ff_put_no_rnd_pixels8_xy2_arm, export=1, align=5
        @ void func(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
        @ block = word aligned, pixels = unaligned
        @ No-rounding variant: \rnd = lsr selects 0x01010101 in
        @ RND_XY2_IT; otherwise identical to ff_put_pixels8_xy2_arm.
        pld [r1]
        push {r4-r11,lr}
        JMP_ALIGN r5, r1
1:      RND_XY2_EXPAND 0, lsr
        .align 5
2:      RND_XY2_EXPAND 1, lsr
        .align 5
3:      RND_XY2_EXPAND 2, lsr
        .align 5
4:      RND_XY2_EXPAND 3, lsr
endfunc
|
||||
29
externals/ffmpeg/libavcodec/arm/hpeldsp_arm.h
vendored
Executable file
29
externals/ffmpeg/libavcodec/arm/hpeldsp_arm.h
vendored
Executable file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_HPELDSP_ARM_H
#define AVCODEC_ARM_HPELDSP_ARM_H

#include "libavcodec/hpeldsp.h"

/* Per-CPU-level initializers: each overwrites entries of the HpelDSP
 * function tables with faster implementations for that ISA level.
 * Both are invoked (guarded by runtime CPU-flag checks) from
 * ff_hpeldsp_init_arm(). */
void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);

#endif /* AVCODEC_ARM_HPELDSP_ARM_H */
|
||||
261
externals/ffmpeg/libavcodec/arm/hpeldsp_armv6.S
vendored
Executable file
261
externals/ffmpeg/libavcodec/arm/hpeldsp_armv6.S
vendored
Executable file
@@ -0,0 +1,261 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ Emit a 16-pixel-wide wrapper around an 8-pixel routine: process the
@ left half with a bl, restore the arguments, advance dst/src by 8 and
@ tail-call for the right half. \subp is the sub-pel suffix and may be
@ empty (as in the `avg` invocation below).
.macro call_2x_pixels type, subp
function ff_\type\()_pixels16\subp\()_armv6, export=1
        push {r0-r3, lr}
        bl X(ff_\type\()_pixels8\subp\()_armv6)
        pop {r0-r3, lr}
        add r0, r0, #8
        add r1, r1, #8
        b X(ff_\type\()_pixels8\subp\()_armv6)
endfunc
.endm

@ Instantiate the 16-wide wrappers (avg gets an empty suffix).
call_2x_pixels avg
call_2x_pixels put, _x2
call_2x_pixels put, _y2
call_2x_pixels put, _x2_no_rnd
call_2x_pixels put, _y2_no_rnd
||||
|
||||
function ff_put_pixels16_armv6, export=1
        @ 16-wide copy, two rows per iteration (h assumed even).
        @ ldr_post/strd_post fold the line_size advance into the
        @ load/store of each row's first pair.
        push {r4-r11}
1:
        ldr r5, [r1, #4]
        ldr r6, [r1, #8]
        ldr r7, [r1, #12]
        ldr_post r4, r1, r2
        strd r6, r7, [r0, #8]
        ldr r9, [r1, #4]
        strd_post r4, r5, r0, r2
        ldr r10, [r1, #8]
        ldr r11, [r1, #12]
        ldr_post r8, r1, r2
        strd r10, r11, [r0, #8]
        subs r3, r3, #2
        strd_post r8, r9, r0, r2
        bne 1b

        pop {r4-r11}
        bx lr
endfunc
|
||||
|
||||
function ff_put_pixels8_armv6, export=1
        @ 8-wide copy, two rows per iteration (h assumed even).
        push {r4-r7}
1:
        ldr r5, [r1, #4]
        ldr_post r4, r1, r2
        ldr r7, [r1, #4]
        strd_post r4, r5, r0, r2
        ldr_post r6, r1, r2
        subs r3, r3, #2
        strd_post r6, r7, r0, r2
        bne 1b

        pop {r4-r7}
        bx lr
endfunc
|
||||
|
||||
function ff_put_pixels8_x2_armv6, export=1
        @ Horizontal half-pel with rounding, two rows per iteration.
        @ uhadd8 gives the truncating per-byte average; the rounding is
        @ restored by adding back (a ^ b) & 0x01010101 (kept in r12)
        @ via uadd8. Loads at [r1, #5] / the lsr-#8+orr pairs build the
        @ one-byte-shifted operand from unaligned memory.
        push {r4-r11, lr}
        mov r12, #1
        orr r12, r12, r12, lsl #8
        orr r12, r12, r12, lsl #16
1:
        ldr r4, [r1]
        subs r3, r3, #2
        ldr r5, [r1, #4]
        ldr r7, [r1, #5]
        lsr r6, r4, #8
        ldr_pre r8, r1, r2
        orr r6, r6, r5, lsl #24
        ldr r9, [r1, #4]
        ldr r11, [r1, #5]
        lsr r10, r8, #8
        add r1, r1, r2
        orr r10, r10, r9, lsl #24
        eor r14, r4, r6
        uhadd8 r4, r4, r6
        eor r6, r5, r7
        uhadd8 r5, r5, r7
        and r14, r14, r12
        and r6, r6, r12
        uadd8 r4, r4, r14
        eor r14, r8, r10
        uadd8 r5, r5, r6
        eor r6, r9, r11
        uhadd8 r8, r8, r10
        and r14, r14, r12
        uhadd8 r9, r9, r11
        and r6, r6, r12
        uadd8 r8, r8, r14
        strd_post r4, r5, r0, r2
        uadd8 r9, r9, r6
        strd_post r8, r9, r0, r2
        bne 1b

        pop {r4-r11, pc}
endfunc
|
||||
|
||||
function ff_put_pixels8_y2_armv6, export=1
        @ Vertical half-pel with rounding, two rows per iteration.
        @ Software-pipelined: the next row pair is loaded while the
        @ current averages are computed; ldrc_pre/ldrne skip the
        @ overrun load on the final iteration. r12 = 0x01010101
        @ rounding-correction mask (see x2 variant).
        push {r4-r11}
        mov r12, #1
        orr r12, r12, r12, lsl #8
        orr r12, r12, r12, lsl #16
        ldr r4, [r1]
        ldr r5, [r1, #4]
        ldr_pre r6, r1, r2
        ldr r7, [r1, #4]
1:
        subs r3, r3, #2
        uhadd8 r8, r4, r6
        eor r10, r4, r6
        uhadd8 r9, r5, r7
        eor r11, r5, r7
        and r10, r10, r12
        ldr_pre r4, r1, r2
        uadd8 r8, r8, r10
        and r11, r11, r12
        uadd8 r9, r9, r11
        ldr r5, [r1, #4]
        uhadd8 r10, r4, r6
        eor r6, r4, r6
        uhadd8 r11, r5, r7
        and r6, r6, r12
        eor r7, r5, r7
        uadd8 r10, r10, r6
        and r7, r7, r12
        ldrc_pre ne, r6, r1, r2
        uadd8 r11, r11, r7
        strd_post r8, r9, r0, r2
        it ne
        ldrne r7, [r1, #4]
        strd_post r10, r11, r0, r2
        bne 1b

        pop {r4-r11}
        bx lr
endfunc
|
||||
|
||||
function ff_put_pixels8_x2_no_rnd_armv6, export=1
        @ Horizontal half-pel, no rounding: plain uhadd8 (truncating
        @ per-byte average), no correction term needed. Two rows per
        @ iteration.
        push {r4-r9, lr}
1:
        subs r3, r3, #2
        ldr r4, [r1]
        ldr r5, [r1, #4]
        ldr r7, [r1, #5]
        ldr_pre r8, r1, r2
        ldr r9, [r1, #4]
        ldr r14, [r1, #5]
        add r1, r1, r2
        lsr r6, r4, #8
        orr r6, r6, r5, lsl #24
        lsr r12, r8, #8
        orr r12, r12, r9, lsl #24
        uhadd8 r4, r4, r6
        uhadd8 r5, r5, r7
        uhadd8 r8, r8, r12
        uhadd8 r9, r9, r14
        stm r0, {r4,r5}
        add r0, r0, r2
        stm r0, {r8,r9}
        add r0, r0, r2
        bne 1b

        pop {r4-r9, pc}
endfunc
|
||||
|
||||
function ff_put_pixels8_y2_no_rnd_armv6, export=1
        @ Vertical half-pel, no rounding: software-pipelined uhadd8
        @ of each row with the next; conditional loads skip the
        @ overrun on the last iteration.
        push {r4-r9, lr}
        ldr r4, [r1]
        ldr r5, [r1, #4]
        ldr_pre r6, r1, r2
        ldr r7, [r1, #4]
1:
        subs r3, r3, #2
        uhadd8 r8, r4, r6
        ldr_pre r4, r1, r2
        uhadd8 r9, r5, r7
        ldr r5, [r1, #4]
        uhadd8 r12, r4, r6
        ldrc_pre ne, r6, r1, r2
        uhadd8 r14, r5, r7
        it ne
        ldrne r7, [r1, #4]
        stm r0, {r8,r9}
        add r0, r0, r2
        stm r0, {r12,r14}
        add r0, r0, r2
        bne 1b

        pop {r4-r9, pc}
endfunc
|
||||
|
||||
function ff_avg_pixels8_armv6, export=1
        @ Rounded average of the source block into the destination
        @ (dst = avg(dst, src)), two rows per iteration, software
        @ pipelined: dst and src rows are loaded ahead, uhadd8 +
        @ (a^b)&0x01010101 correction (mask in lr) gives the rounded
        @ average. The `beq 2f` path drains the final row pair without
        @ issuing the next iteration's loads.
        pld [r1, r2]
        push {r4-r10, lr}
        mov lr, #1
        orr lr, lr, lr, lsl #8
        orr lr, lr, lr, lsl #16
        ldrd r4, r5, [r0]
        ldr r10, [r1, #4]
        ldr_post r9, r1, r2
        subs r3, r3, #2
1:
        pld [r1, r2]
        eor r8, r4, r9
        uhadd8 r4, r4, r9
        eor r12, r5, r10
        ldrd_reg r6, r7, r0, r2
        uhadd8 r5, r5, r10
        and r8, r8, lr
        ldr r10, [r1, #4]
        and r12, r12, lr
        uadd8 r4, r4, r8
        ldr_post r9, r1, r2
        eor r8, r6, r9
        uadd8 r5, r5, r12
        pld [r1, r2, lsl #1]
        eor r12, r7, r10
        uhadd8 r6, r6, r9
        strd_post r4, r5, r0, r2
        uhadd8 r7, r7, r10
        beq 2f
        and r8, r8, lr
        ldrd_reg r4, r5, r0, r2
        uadd8 r6, r6, r8
        ldr r10, [r1, #4]
        and r12, r12, lr
        subs r3, r3, #2
        uadd8 r7, r7, r12
        ldr_post r9, r1, r2
        strd_post r6, r7, r0, r2
        b 1b
2:
        @ epilogue: finish the in-flight second row
        and r8, r8, lr
        and r12, r12, lr
        uadd8 r6, r6, r8
        uadd8 r7, r7, r12
        strd_post r6, r7, r0, r2

        pop {r4-r10, pc}
endfunc
|
||||
71
externals/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c
vendored
Executable file
71
externals/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* ARM-optimized halfpel functions
|
||||
* Copyright (c) 2001 Lionel Ulmer
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/pixels.h"
|
||||
#include "hpeldsp_arm.h"
|
||||
|
||||
void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
|
||||
void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
|
||||
void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
|
||||
|
||||
CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
|
||||
CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
|
||||
CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
|
||||
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
|
||||
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
|
||||
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
|
||||
|
||||
/* Install the ARM half-pel functions into the HpelDSP tables.
 * Table indices: first index 0 = 16-pixel-wide, 1 = 8-pixel-wide
 * blocks; second index 0 = copy, 1 = x2, 2 = y2, 3 = xy2 half-pel.
 * Baseline ARM versions are set unconditionally, then overridden by
 * the ARMv6/NEON initializers when the runtime CPU supports them
 * (order matters: later inits replace entries set earlier). */
av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;

    /* no_rnd copy (index 0) is identical to the rounding copy */
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;

    if (have_armv6(cpu_flags))
        ff_hpeldsp_init_armv6(c, flags);
    if (have_neon(cpu_flags))
        ff_hpeldsp_init_neon(c, flags);
}
|
||||
67
externals/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c
vendored
Executable file
67
externals/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c
vendored
Executable file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "hpeldsp_arm.h"
|
||||
|
||||
void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
/* Override HpelDSP entries with the ARMv6 (uhadd8-based) versions.
 * Only the copy/x2/y2 variants exist; the xy2 slots are left on the
 * baseline ARM implementations (commented-out assignments kept as
 * markers for the missing variants). */
av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
{
    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
    /* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
    /* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */

    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
    /* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
    /* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */

    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
}
|
||||
88
externals/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c
vendored
Executable file
88
externals/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c
vendored
Executable file
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* ARM NEON optimised DSP functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "hpeldsp_arm.h"
|
||||
|
||||
void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
|
||||
|
||||
av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
|
||||
{
|
||||
c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
|
||||
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
|
||||
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
|
||||
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
|
||||
c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
|
||||
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
|
||||
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
|
||||
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
|
||||
|
||||
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
|
||||
c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
|
||||
c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
|
||||
c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
|
||||
c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
|
||||
c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
|
||||
c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
|
||||
c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
|
||||
|
||||
c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
|
||||
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
|
||||
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
|
||||
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
|
||||
c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
|
||||
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
|
||||
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
|
||||
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
|
||||
|
||||
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
|
||||
c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
|
||||
c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
|
||||
c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
|
||||
}
|
||||
410
externals/ffmpeg/libavcodec/arm/hpeldsp_neon.S
vendored
Executable file
410
externals/ffmpeg/libavcodec/arm/hpeldsp_neon.S
vendored
Executable file
@@ -0,0 +1,410 @@
|
||||
/*
|
||||
* ARM NEON optimised DSP functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
.macro pixels16 rnd=1, avg=0
|
||||
.if \avg
|
||||
mov r12, r0
|
||||
.endif
|
||||
1: vld1.8 {q0}, [r1], r2
|
||||
vld1.8 {q1}, [r1], r2
|
||||
vld1.8 {q2}, [r1], r2
|
||||
pld [r1, r2, lsl #2]
|
||||
vld1.8 {q3}, [r1], r2
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
pld [r1, r2, lsl #1]
|
||||
.if \avg
|
||||
vld1.8 {q8}, [r12,:128], r2
|
||||
vrhadd.u8 q0, q0, q8
|
||||
vld1.8 {q9}, [r12,:128], r2
|
||||
vrhadd.u8 q1, q1, q9
|
||||
vld1.8 {q10}, [r12,:128], r2
|
||||
vrhadd.u8 q2, q2, q10
|
||||
vld1.8 {q11}, [r12,:128], r2
|
||||
vrhadd.u8 q3, q3, q11
|
||||
.endif
|
||||
subs r3, r3, #4
|
||||
vst1.64 {q0}, [r0,:128], r2
|
||||
vst1.64 {q1}, [r0,:128], r2
|
||||
vst1.64 {q2}, [r0,:128], r2
|
||||
vst1.64 {q3}, [r0,:128], r2
|
||||
bne 1b
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixels16_x2 rnd=1, avg=0
|
||||
1: vld1.8 {d0-d2}, [r1], r2
|
||||
vld1.8 {d4-d6}, [r1], r2
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
subs r3, r3, #2
|
||||
vext.8 q1, q0, q1, #1
|
||||
avg q0, q0, q1
|
||||
vext.8 q3, q2, q3, #1
|
||||
avg q2, q2, q3
|
||||
.if \avg
|
||||
vld1.8 {q1}, [r0,:128], r2
|
||||
vld1.8 {q3}, [r0,:128]
|
||||
vrhadd.u8 q0, q0, q1
|
||||
vrhadd.u8 q2, q2, q3
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vst1.8 {q0}, [r0,:128], r2
|
||||
vst1.8 {q2}, [r0,:128], r2
|
||||
bne 1b
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixels16_y2 rnd=1, avg=0
|
||||
sub r3, r3, #2
|
||||
vld1.8 {q0}, [r1], r2
|
||||
vld1.8 {q1}, [r1], r2
|
||||
1: subs r3, r3, #2
|
||||
avg q2, q0, q1
|
||||
vld1.8 {q0}, [r1], r2
|
||||
avg q3, q0, q1
|
||||
vld1.8 {q1}, [r1], r2
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
.if \avg
|
||||
vld1.8 {q8}, [r0,:128], r2
|
||||
vld1.8 {q9}, [r0,:128]
|
||||
vrhadd.u8 q2, q2, q8
|
||||
vrhadd.u8 q3, q3, q9
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vst1.8 {q2}, [r0,:128], r2
|
||||
vst1.8 {q3}, [r0,:128], r2
|
||||
bne 1b
|
||||
|
||||
avg q2, q0, q1
|
||||
vld1.8 {q0}, [r1], r2
|
||||
avg q3, q0, q1
|
||||
.if \avg
|
||||
vld1.8 {q8}, [r0,:128], r2
|
||||
vld1.8 {q9}, [r0,:128]
|
||||
vrhadd.u8 q2, q2, q8
|
||||
vrhadd.u8 q3, q3, q9
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vst1.8 {q2}, [r0,:128], r2
|
||||
vst1.8 {q3}, [r0,:128], r2
|
||||
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixels16_xy2 rnd=1, avg=0
|
||||
sub r3, r3, #2
|
||||
vld1.8 {d0-d2}, [r1], r2
|
||||
vld1.8 {d4-d6}, [r1], r2
|
||||
NRND vmov.i16 q13, #1
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
vext.8 q1, q0, q1, #1
|
||||
vext.8 q3, q2, q3, #1
|
||||
vaddl.u8 q8, d0, d2
|
||||
vaddl.u8 q10, d1, d3
|
||||
vaddl.u8 q9, d4, d6
|
||||
vaddl.u8 q11, d5, d7
|
||||
1: subs r3, r3, #2
|
||||
vld1.8 {d0-d2}, [r1], r2
|
||||
vadd.u16 q12, q8, q9
|
||||
pld [r1]
|
||||
NRND vadd.u16 q12, q12, q13
|
||||
vext.8 q15, q0, q1, #1
|
||||
vadd.u16 q1 , q10, q11
|
||||
shrn d28, q12, #2
|
||||
NRND vadd.u16 q1, q1, q13
|
||||
shrn d29, q1, #2
|
||||
.if \avg
|
||||
vld1.8 {q8}, [r0,:128]
|
||||
vrhadd.u8 q14, q14, q8
|
||||
.endif
|
||||
vaddl.u8 q8, d0, d30
|
||||
vld1.8 {d2-d4}, [r1], r2
|
||||
vaddl.u8 q10, d1, d31
|
||||
vst1.8 {q14}, [r0,:128], r2
|
||||
vadd.u16 q12, q8, q9
|
||||
pld [r1, r2]
|
||||
NRND vadd.u16 q12, q12, q13
|
||||
vext.8 q2, q1, q2, #1
|
||||
vadd.u16 q0, q10, q11
|
||||
shrn d30, q12, #2
|
||||
NRND vadd.u16 q0, q0, q13
|
||||
shrn d31, q0, #2
|
||||
.if \avg
|
||||
vld1.8 {q9}, [r0,:128]
|
||||
vrhadd.u8 q15, q15, q9
|
||||
.endif
|
||||
vaddl.u8 q9, d2, d4
|
||||
vaddl.u8 q11, d3, d5
|
||||
vst1.8 {q15}, [r0,:128], r2
|
||||
bgt 1b
|
||||
|
||||
vld1.8 {d0-d2}, [r1], r2
|
||||
vadd.u16 q12, q8, q9
|
||||
NRND vadd.u16 q12, q12, q13
|
||||
vext.8 q15, q0, q1, #1
|
||||
vadd.u16 q1 , q10, q11
|
||||
shrn d28, q12, #2
|
||||
NRND vadd.u16 q1, q1, q13
|
||||
shrn d29, q1, #2
|
||||
.if \avg
|
||||
vld1.8 {q8}, [r0,:128]
|
||||
vrhadd.u8 q14, q14, q8
|
||||
.endif
|
||||
vaddl.u8 q8, d0, d30
|
||||
vaddl.u8 q10, d1, d31
|
||||
vst1.8 {q14}, [r0,:128], r2
|
||||
vadd.u16 q12, q8, q9
|
||||
NRND vadd.u16 q12, q12, q13
|
||||
vadd.u16 q0, q10, q11
|
||||
shrn d30, q12, #2
|
||||
NRND vadd.u16 q0, q0, q13
|
||||
shrn d31, q0, #2
|
||||
.if \avg
|
||||
vld1.8 {q9}, [r0,:128]
|
||||
vrhadd.u8 q15, q15, q9
|
||||
.endif
|
||||
vst1.8 {q15}, [r0,:128], r2
|
||||
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixels8 rnd=1, avg=0
|
||||
1: vld1.8 {d0}, [r1], r2
|
||||
vld1.8 {d1}, [r1], r2
|
||||
vld1.8 {d2}, [r1], r2
|
||||
pld [r1, r2, lsl #2]
|
||||
vld1.8 {d3}, [r1], r2
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
pld [r1, r2, lsl #1]
|
||||
.if \avg
|
||||
vld1.8 {d4}, [r0,:64], r2
|
||||
vrhadd.u8 d0, d0, d4
|
||||
vld1.8 {d5}, [r0,:64], r2
|
||||
vrhadd.u8 d1, d1, d5
|
||||
vld1.8 {d6}, [r0,:64], r2
|
||||
vrhadd.u8 d2, d2, d6
|
||||
vld1.8 {d7}, [r0,:64], r2
|
||||
vrhadd.u8 d3, d3, d7
|
||||
sub r0, r0, r2, lsl #2
|
||||
.endif
|
||||
subs r3, r3, #4
|
||||
vst1.8 {d0}, [r0,:64], r2
|
||||
vst1.8 {d1}, [r0,:64], r2
|
||||
vst1.8 {d2}, [r0,:64], r2
|
||||
vst1.8 {d3}, [r0,:64], r2
|
||||
bne 1b
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixels8_x2 rnd=1, avg=0
|
||||
1: vld1.8 {q0}, [r1], r2
|
||||
vext.8 d1, d0, d1, #1
|
||||
vld1.8 {q1}, [r1], r2
|
||||
vext.8 d3, d2, d3, #1
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
subs r3, r3, #2
|
||||
vswp d1, d2
|
||||
avg q0, q0, q1
|
||||
.if \avg
|
||||
vld1.8 {d4}, [r0,:64], r2
|
||||
vld1.8 {d5}, [r0,:64]
|
||||
vrhadd.u8 q0, q0, q2
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vst1.8 {d0}, [r0,:64], r2
|
||||
vst1.8 {d1}, [r0,:64], r2
|
||||
bne 1b
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixels8_y2 rnd=1, avg=0
|
||||
sub r3, r3, #2
|
||||
vld1.8 {d0}, [r1], r2
|
||||
vld1.8 {d1}, [r1], r2
|
||||
1: subs r3, r3, #2
|
||||
avg d4, d0, d1
|
||||
vld1.8 {d0}, [r1], r2
|
||||
avg d5, d0, d1
|
||||
vld1.8 {d1}, [r1], r2
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
.if \avg
|
||||
vld1.8 {d2}, [r0,:64], r2
|
||||
vld1.8 {d3}, [r0,:64]
|
||||
vrhadd.u8 q2, q2, q1
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vst1.8 {d4}, [r0,:64], r2
|
||||
vst1.8 {d5}, [r0,:64], r2
|
||||
bne 1b
|
||||
|
||||
avg d4, d0, d1
|
||||
vld1.8 {d0}, [r1], r2
|
||||
avg d5, d0, d1
|
||||
.if \avg
|
||||
vld1.8 {d2}, [r0,:64], r2
|
||||
vld1.8 {d3}, [r0,:64]
|
||||
vrhadd.u8 q2, q2, q1
|
||||
sub r0, r0, r2
|
||||
.endif
|
||||
vst1.8 {d4}, [r0,:64], r2
|
||||
vst1.8 {d5}, [r0,:64], r2
|
||||
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixels8_xy2 rnd=1, avg=0
|
||||
sub r3, r3, #2
|
||||
vld1.8 {q0}, [r1], r2
|
||||
vld1.8 {q1}, [r1], r2
|
||||
NRND vmov.i16 q11, #1
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
vext.8 d4, d0, d1, #1
|
||||
vext.8 d6, d2, d3, #1
|
||||
vaddl.u8 q8, d0, d4
|
||||
vaddl.u8 q9, d2, d6
|
||||
1: subs r3, r3, #2
|
||||
vld1.8 {q0}, [r1], r2
|
||||
pld [r1]
|
||||
vadd.u16 q10, q8, q9
|
||||
vext.8 d4, d0, d1, #1
|
||||
NRND vadd.u16 q10, q10, q11
|
||||
vaddl.u8 q8, d0, d4
|
||||
shrn d5, q10, #2
|
||||
vld1.8 {q1}, [r1], r2
|
||||
vadd.u16 q10, q8, q9
|
||||
pld [r1, r2]
|
||||
.if \avg
|
||||
vld1.8 {d7}, [r0,:64]
|
||||
vrhadd.u8 d5, d5, d7
|
||||
.endif
|
||||
NRND vadd.u16 q10, q10, q11
|
||||
vst1.8 {d5}, [r0,:64], r2
|
||||
shrn d7, q10, #2
|
||||
.if \avg
|
||||
vld1.8 {d5}, [r0,:64]
|
||||
vrhadd.u8 d7, d7, d5
|
||||
.endif
|
||||
vext.8 d6, d2, d3, #1
|
||||
vaddl.u8 q9, d2, d6
|
||||
vst1.8 {d7}, [r0,:64], r2
|
||||
bgt 1b
|
||||
|
||||
vld1.8 {q0}, [r1], r2
|
||||
vadd.u16 q10, q8, q9
|
||||
vext.8 d4, d0, d1, #1
|
||||
NRND vadd.u16 q10, q10, q11
|
||||
vaddl.u8 q8, d0, d4
|
||||
shrn d5, q10, #2
|
||||
vadd.u16 q10, q8, q9
|
||||
.if \avg
|
||||
vld1.8 {d7}, [r0,:64]
|
||||
vrhadd.u8 d5, d5, d7
|
||||
.endif
|
||||
NRND vadd.u16 q10, q10, q11
|
||||
vst1.8 {d5}, [r0,:64], r2
|
||||
shrn d7, q10, #2
|
||||
.if \avg
|
||||
vld1.8 {d5}, [r0,:64]
|
||||
vrhadd.u8 d7, d7, d5
|
||||
.endif
|
||||
vst1.8 {d7}, [r0,:64], r2
|
||||
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
.macro pixfunc pfx, name, suf, rnd=1, avg=0
|
||||
.if \rnd
|
||||
.macro avg rd, rn, rm
|
||||
vrhadd.u8 \rd, \rn, \rm
|
||||
.endm
|
||||
.macro shrn rd, rn, rm
|
||||
vrshrn.u16 \rd, \rn, \rm
|
||||
.endm
|
||||
.macro NRND insn:vararg
|
||||
.endm
|
||||
.else
|
||||
.macro avg rd, rn, rm
|
||||
vhadd.u8 \rd, \rn, \rm
|
||||
.endm
|
||||
.macro shrn rd, rn, rm
|
||||
vshrn.u16 \rd, \rn, \rm
|
||||
.endm
|
||||
.macro NRND insn:vararg
|
||||
\insn
|
||||
.endm
|
||||
.endif
|
||||
function ff_\pfx\name\suf\()_neon, export=1
|
||||
\name \rnd, \avg
|
||||
endfunc
|
||||
.purgem avg
|
||||
.purgem shrn
|
||||
.purgem NRND
|
||||
.endm
|
||||
|
||||
.macro pixfunc2 pfx, name, avg=0
|
||||
pixfunc \pfx, \name, rnd=1, avg=\avg
|
||||
pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
|
||||
.endm
|
||||
|
||||
function ff_put_h264_qpel16_mc00_neon, export=1
|
||||
mov r3, #16
|
||||
endfunc
|
||||
|
||||
pixfunc put_, pixels16, avg=0
|
||||
pixfunc2 put_, pixels16_x2, avg=0
|
||||
pixfunc2 put_, pixels16_y2, avg=0
|
||||
pixfunc2 put_, pixels16_xy2, avg=0
|
||||
|
||||
function ff_avg_h264_qpel16_mc00_neon, export=1
|
||||
mov r3, #16
|
||||
endfunc
|
||||
|
||||
pixfunc avg_, pixels16, avg=1
|
||||
pixfunc2 avg_, pixels16_x2, avg=1
|
||||
pixfunc2 avg_, pixels16_y2, avg=1
|
||||
pixfunc2 avg_, pixels16_xy2, avg=1
|
||||
|
||||
function ff_put_h264_qpel8_mc00_neon, export=1
|
||||
mov r3, #8
|
||||
endfunc
|
||||
|
||||
pixfunc put_, pixels8, avg=0
|
||||
pixfunc2 put_, pixels8_x2, avg=0
|
||||
pixfunc2 put_, pixels8_y2, avg=0
|
||||
pixfunc2 put_, pixels8_xy2, avg=0
|
||||
|
||||
function ff_avg_h264_qpel8_mc00_neon, export=1
|
||||
mov r3, #8
|
||||
endfunc
|
||||
|
||||
pixfunc avg_, pixels8, avg=1
|
||||
pixfunc avg_, pixels8_x2, avg=1
|
||||
pixfunc avg_, pixels8_y2, avg=1
|
||||
pixfunc avg_, pixels8_xy2, avg=1
|
||||
41
externals/ffmpeg/libavcodec/arm/idct.h
vendored
Executable file
41
externals/ffmpeg/libavcodec/arm/idct.h
vendored
Executable file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_IDCT_H
|
||||
#define AVCODEC_ARM_IDCT_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
void ff_j_rev_dct_arm(int16_t *data);
|
||||
|
||||
void ff_simple_idct_arm(int16_t *data);
|
||||
|
||||
void ff_simple_idct_armv5te(int16_t *data);
|
||||
void ff_simple_idct_put_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
||||
void ff_simple_idct_add_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
||||
|
||||
void ff_simple_idct_armv6(int16_t *data);
|
||||
void ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
||||
void ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
||||
|
||||
void ff_simple_idct_neon(int16_t *data);
|
||||
void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
||||
void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
||||
|
||||
#endif /* AVCODEC_ARM_IDCT_H */
|
||||
120
externals/ffmpeg/libavcodec/arm/idctdsp_arm.S
vendored
Executable file
120
externals/ffmpeg/libavcodec/arm/idctdsp_arm.S
vendored
Executable file
@@ -0,0 +1,120 @@
|
||||
@
|
||||
@ ARMv4-optimized IDCT functions
|
||||
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
|
||||
@
|
||||
@ This file is part of FFmpeg.
|
||||
@
|
||||
@ FFmpeg is free software; you can redistribute it and/or
|
||||
@ modify it under the terms of the GNU Lesser General Public
|
||||
@ License as published by the Free Software Foundation; either
|
||||
@ version 2.1 of the License, or (at your option) any later version.
|
||||
@
|
||||
@ FFmpeg is distributed in the hope that it will be useful,
|
||||
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
@ Lesser General Public License for more details.
|
||||
@
|
||||
@ You should have received a copy of the GNU Lesser General Public
|
||||
@ License along with FFmpeg; if not, write to the Free Software
|
||||
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
@
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride)
|
||||
function ff_add_pixels_clamped_arm, export=1, align=5
|
||||
push {r4-r10}
|
||||
mov r10, #8
|
||||
1:
|
||||
ldr r4, [r1] /* load dest */
|
||||
/* block[0] and block[1]*/
|
||||
ldrsh r5, [r0]
|
||||
ldrsh r7, [r0, #2]
|
||||
and r6, r4, #0xFF
|
||||
and r8, r4, #0xFF00
|
||||
add r6, r6, r5
|
||||
add r8, r7, r8, lsr #8
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
mov r9, r6
|
||||
ldrsh r5, [r0, #4] /* moved form [A] */
|
||||
orr r9, r9, r8, lsl #8
|
||||
/* block[2] and block[3] */
|
||||
/* [A] */
|
||||
ldrsh r7, [r0, #6]
|
||||
and r6, r4, #0xFF0000
|
||||
and r8, r4, #0xFF000000
|
||||
add r6, r5, r6, lsr #16
|
||||
add r8, r7, r8, lsr #24
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
orr r9, r9, r6, lsl #16
|
||||
ldr r4, [r1, #4] /* moved form [B] */
|
||||
orr r9, r9, r8, lsl #24
|
||||
/* store dest */
|
||||
ldrsh r5, [r0, #8] /* moved form [C] */
|
||||
str r9, [r1]
|
||||
|
||||
/* load dest */
|
||||
/* [B] */
|
||||
/* block[4] and block[5] */
|
||||
/* [C] */
|
||||
ldrsh r7, [r0, #10]
|
||||
and r6, r4, #0xFF
|
||||
and r8, r4, #0xFF00
|
||||
add r6, r6, r5
|
||||
add r8, r7, r8, lsr #8
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
mov r9, r6
|
||||
ldrsh r5, [r0, #12] /* moved from [D] */
|
||||
orr r9, r9, r8, lsl #8
|
||||
/* block[6] and block[7] */
|
||||
/* [D] */
|
||||
ldrsh r7, [r0, #14]
|
||||
and r6, r4, #0xFF0000
|
||||
and r8, r4, #0xFF000000
|
||||
add r6, r5, r6, lsr #16
|
||||
add r8, r7, r8, lsr #24
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
orr r9, r9, r6, lsl #16
|
||||
add r0, r0, #16 /* moved from [E] */
|
||||
orr r9, r9, r8, lsl #24
|
||||
subs r10, r10, #1 /* moved from [F] */
|
||||
/* store dest */
|
||||
str r9, [r1, #4]
|
||||
|
||||
/* [E] */
|
||||
/* [F] */
|
||||
add r1, r1, r2
|
||||
bne 1b
|
||||
|
||||
pop {r4-r10}
|
||||
bx lr
|
||||
endfunc
|
||||
34
externals/ffmpeg/libavcodec/arm/idctdsp_arm.h
vendored
Executable file
34
externals/ffmpeg/libavcodec/arm/idctdsp_arm.h
vendored
Executable file
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_IDCTDSP_ARM_H
|
||||
#define AVCODEC_ARM_IDCTDSP_ARM_H
|
||||
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/idctdsp.h"
|
||||
|
||||
void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth);
|
||||
void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth);
|
||||
void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth);
|
||||
|
||||
#endif /* AVCODEC_ARM_IDCTDSP_ARM_H */
|
||||
48
externals/ffmpeg/libavcodec/arm/idctdsp_armv6.S
vendored
Executable file
48
externals/ffmpeg/libavcodec/arm/idctdsp_armv6.S
vendored
Executable file
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_add_pixels_clamped_armv6, export=1
|
||||
push {r4-r8,lr}
|
||||
mov r3, #8
|
||||
1:
|
||||
ldm r0!, {r4,r5,r12,lr}
|
||||
ldrd r6, r7, [r1]
|
||||
pkhbt r8, r4, r5, lsl #16
|
||||
pkhtb r5, r5, r4, asr #16
|
||||
pkhbt r4, r12, lr, lsl #16
|
||||
pkhtb lr, lr, r12, asr #16
|
||||
pld [r1, r2]
|
||||
uxtab16 r8, r8, r6
|
||||
uxtab16 r5, r5, r6, ror #8
|
||||
uxtab16 r4, r4, r7
|
||||
uxtab16 lr, lr, r7, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r5, #8, r5
|
||||
usat16 r4, #8, r4
|
||||
usat16 lr, #8, lr
|
||||
orr r6, r8, r5, lsl #8
|
||||
orr r7, r4, lr, lsl #8
|
||||
subs r3, r3, #1
|
||||
strd_post r6, r7, r1, r2
|
||||
bgt 1b
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
94
externals/ffmpeg/libavcodec/arm/idctdsp_init_arm.c
vendored
Executable file
94
externals/ffmpeg/libavcodec/arm/idctdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
* ARM-optimized IDCT functions
|
||||
* Copyright (c) 2001 Lionel Ulmer
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/idctdsp.h"
|
||||
#include "idct.h"
|
||||
#include "idctdsp_arm.h"
|
||||
|
||||
void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
|
||||
ptrdiff_t line_size);
|
||||
|
||||
/* XXX: those functions should be suppressed ASAP when all IDCTs are
|
||||
* converted */
|
||||
static void j_rev_dct_arm_put(uint8_t *dest, ptrdiff_t line_size,
|
||||
int16_t *block)
|
||||
{
|
||||
ff_j_rev_dct_arm(block);
|
||||
ff_put_pixels_clamped_c(block, dest, line_size);
|
||||
}
|
||||
|
||||
static void j_rev_dct_arm_add(uint8_t *dest, ptrdiff_t line_size,
|
||||
int16_t *block)
|
||||
{
|
||||
ff_j_rev_dct_arm(block);
|
||||
ff_add_pixels_clamped_arm(block, dest, line_size);
|
||||
}
|
||||
|
||||
static void simple_idct_arm_put(uint8_t *dest, ptrdiff_t line_size,
|
||||
int16_t *block)
|
||||
{
|
||||
ff_simple_idct_arm(block);
|
||||
ff_put_pixels_clamped_c(block, dest, line_size);
|
||||
}
|
||||
|
||||
static void simple_idct_arm_add(uint8_t *dest, ptrdiff_t line_size,
|
||||
int16_t *block)
|
||||
{
|
||||
ff_simple_idct_arm(block);
|
||||
ff_add_pixels_clamped_arm(block, dest, line_size);
|
||||
}
|
||||
|
||||
av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (!avctx->lowres && !high_bit_depth) {
|
||||
if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
|
||||
avctx->idct_algo == FF_IDCT_ARM) {
|
||||
c->idct_put = j_rev_dct_arm_put;
|
||||
c->idct_add = j_rev_dct_arm_add;
|
||||
c->idct = ff_j_rev_dct_arm;
|
||||
c->perm_type = FF_IDCT_PERM_LIBMPEG2;
|
||||
} else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) {
|
||||
c->idct_put = simple_idct_arm_put;
|
||||
c->idct_add = simple_idct_arm_add;
|
||||
c->idct = ff_simple_idct_arm;
|
||||
c->perm_type = FF_IDCT_PERM_NONE;
|
||||
}
|
||||
}
|
||||
|
||||
c->add_pixels_clamped = ff_add_pixels_clamped_arm;
|
||||
|
||||
if (have_armv5te(cpu_flags))
|
||||
ff_idctdsp_init_armv5te(c, avctx, high_bit_depth);
|
||||
if (have_armv6(cpu_flags))
|
||||
ff_idctdsp_init_armv6(c, avctx, high_bit_depth);
|
||||
if (have_neon(cpu_flags))
|
||||
ff_idctdsp_init_neon(c, avctx, high_bit_depth);
|
||||
}
|
||||
41
externals/ffmpeg/libavcodec/arm/idctdsp_init_armv5te.c
vendored
Executable file
41
externals/ffmpeg/libavcodec/arm/idctdsp_init_armv5te.c
vendored
Executable file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/idctdsp.h"
|
||||
#include "idct.h"
|
||||
#include "idctdsp_arm.h"
|
||||
|
||||
av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
if (!avctx->lowres && !high_bit_depth &&
|
||||
(avctx->idct_algo == FF_IDCT_AUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
|
||||
c->idct_put = ff_simple_idct_put_armv5te;
|
||||
c->idct_add = ff_simple_idct_add_armv5te;
|
||||
c->idct = ff_simple_idct_armv5te;
|
||||
c->perm_type = FF_IDCT_PERM_NONE;
|
||||
}
|
||||
}
|
||||
45
externals/ffmpeg/libavcodec/arm/idctdsp_init_armv6.c
vendored
Executable file
45
externals/ffmpeg/libavcodec/arm/idctdsp_init_armv6.c
vendored
Executable file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/idctdsp.h"
|
||||
#include "idct.h"
|
||||
#include "idctdsp_arm.h"
|
||||
|
||||
void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
|
||||
ptrdiff_t line_size);
|
||||
|
||||
av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
if (!avctx->lowres && !high_bit_depth) {
|
||||
if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
|
||||
c->idct_put = ff_simple_idct_put_armv6;
|
||||
c->idct_add = ff_simple_idct_add_armv6;
|
||||
c->idct = ff_simple_idct_armv6;
|
||||
c->perm_type = FF_IDCT_PERM_LIBMPEG2;
|
||||
}
|
||||
}
|
||||
c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
|
||||
}
|
||||
51
externals/ffmpeg/libavcodec/arm/idctdsp_init_neon.c
vendored
Executable file
51
externals/ffmpeg/libavcodec/arm/idctdsp_init_neon.c
vendored
Executable file
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
* ARM-NEON-optimized IDCT functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/idctdsp.h"
|
||||
#include "idct.h"
|
||||
#include "idctdsp_arm.h"
|
||||
|
||||
void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
|
||||
void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
|
||||
void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
|
||||
|
||||
av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
if (!avctx->lowres && !high_bit_depth) {
|
||||
if (avctx->idct_algo == FF_IDCT_AUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLENEON) {
|
||||
c->idct_put = ff_simple_idct_put_neon;
|
||||
c->idct_add = ff_simple_idct_add_neon;
|
||||
c->idct = ff_simple_idct_neon;
|
||||
c->perm_type = FF_IDCT_PERM_PARTTRANS;
|
||||
}
|
||||
}
|
||||
|
||||
c->add_pixels_clamped = ff_add_pixels_clamped_neon;
|
||||
c->put_pixels_clamped = ff_put_pixels_clamped_neon;
|
||||
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
|
||||
}
|
||||
128
externals/ffmpeg/libavcodec/arm/idctdsp_neon.S
vendored
Executable file
128
externals/ffmpeg/libavcodec/arm/idctdsp_neon.S
vendored
Executable file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* ARM-NEON-optimized IDCT functions
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_put_pixels_clamped_neon, export=1
|
||||
vld1.16 {d16-d19}, [r0,:128]!
|
||||
vqmovun.s16 d0, q8
|
||||
vld1.16 {d20-d23}, [r0,:128]!
|
||||
vqmovun.s16 d1, q9
|
||||
vld1.16 {d24-d27}, [r0,:128]!
|
||||
vqmovun.s16 d2, q10
|
||||
vld1.16 {d28-d31}, [r0,:128]!
|
||||
vqmovun.s16 d3, q11
|
||||
vst1.8 {d0}, [r1,:64], r2
|
||||
vqmovun.s16 d4, q12
|
||||
vst1.8 {d1}, [r1,:64], r2
|
||||
vqmovun.s16 d5, q13
|
||||
vst1.8 {d2}, [r1,:64], r2
|
||||
vqmovun.s16 d6, q14
|
||||
vst1.8 {d3}, [r1,:64], r2
|
||||
vqmovun.s16 d7, q15
|
||||
vst1.8 {d4}, [r1,:64], r2
|
||||
vst1.8 {d5}, [r1,:64], r2
|
||||
vst1.8 {d6}, [r1,:64], r2
|
||||
vst1.8 {d7}, [r1,:64], r2
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_put_signed_pixels_clamped_neon, export=1
|
||||
vmov.u8 d31, #128
|
||||
vld1.16 {d16-d17}, [r0,:128]!
|
||||
vqmovn.s16 d0, q8
|
||||
vld1.16 {d18-d19}, [r0,:128]!
|
||||
vqmovn.s16 d1, q9
|
||||
vld1.16 {d16-d17}, [r0,:128]!
|
||||
vqmovn.s16 d2, q8
|
||||
vld1.16 {d18-d19}, [r0,:128]!
|
||||
vadd.u8 d0, d0, d31
|
||||
vld1.16 {d20-d21}, [r0,:128]!
|
||||
vadd.u8 d1, d1, d31
|
||||
vld1.16 {d22-d23}, [r0,:128]!
|
||||
vadd.u8 d2, d2, d31
|
||||
vst1.8 {d0}, [r1,:64], r2
|
||||
vqmovn.s16 d3, q9
|
||||
vst1.8 {d1}, [r1,:64], r2
|
||||
vqmovn.s16 d4, q10
|
||||
vst1.8 {d2}, [r1,:64], r2
|
||||
vqmovn.s16 d5, q11
|
||||
vld1.16 {d24-d25}, [r0,:128]!
|
||||
vadd.u8 d3, d3, d31
|
||||
vld1.16 {d26-d27}, [r0,:128]!
|
||||
vadd.u8 d4, d4, d31
|
||||
vadd.u8 d5, d5, d31
|
||||
vst1.8 {d3}, [r1,:64], r2
|
||||
vqmovn.s16 d6, q12
|
||||
vst1.8 {d4}, [r1,:64], r2
|
||||
vqmovn.s16 d7, q13
|
||||
vst1.8 {d5}, [r1,:64], r2
|
||||
vadd.u8 d6, d6, d31
|
||||
vadd.u8 d7, d7, d31
|
||||
vst1.8 {d6}, [r1,:64], r2
|
||||
vst1.8 {d7}, [r1,:64], r2
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_add_pixels_clamped_neon, export=1
|
||||
mov r3, r1
|
||||
vld1.8 {d16}, [r1,:64], r2
|
||||
vld1.16 {d0-d1}, [r0,:128]!
|
||||
vaddw.u8 q0, q0, d16
|
||||
vld1.8 {d17}, [r1,:64], r2
|
||||
vld1.16 {d2-d3}, [r0,:128]!
|
||||
vqmovun.s16 d0, q0
|
||||
vld1.8 {d18}, [r1,:64], r2
|
||||
vaddw.u8 q1, q1, d17
|
||||
vld1.16 {d4-d5}, [r0,:128]!
|
||||
vaddw.u8 q2, q2, d18
|
||||
vst1.8 {d0}, [r3,:64], r2
|
||||
vqmovun.s16 d2, q1
|
||||
vld1.8 {d19}, [r1,:64], r2
|
||||
vld1.16 {d6-d7}, [r0,:128]!
|
||||
vaddw.u8 q3, q3, d19
|
||||
vqmovun.s16 d4, q2
|
||||
vst1.8 {d2}, [r3,:64], r2
|
||||
vld1.8 {d16}, [r1,:64], r2
|
||||
vqmovun.s16 d6, q3
|
||||
vld1.16 {d0-d1}, [r0,:128]!
|
||||
vaddw.u8 q0, q0, d16
|
||||
vst1.8 {d4}, [r3,:64], r2
|
||||
vld1.8 {d17}, [r1,:64], r2
|
||||
vld1.16 {d2-d3}, [r0,:128]!
|
||||
vaddw.u8 q1, q1, d17
|
||||
vst1.8 {d6}, [r3,:64], r2
|
||||
vqmovun.s16 d0, q0
|
||||
vld1.8 {d18}, [r1,:64], r2
|
||||
vld1.16 {d4-d5}, [r0,:128]!
|
||||
vaddw.u8 q2, q2, d18
|
||||
vst1.8 {d0}, [r3,:64], r2
|
||||
vqmovun.s16 d2, q1
|
||||
vld1.8 {d19}, [r1,:64], r2
|
||||
vqmovun.s16 d4, q2
|
||||
vld1.16 {d6-d7}, [r0,:128]!
|
||||
vaddw.u8 q3, q3, d19
|
||||
vst1.8 {d2}, [r3,:64], r2
|
||||
vqmovun.s16 d6, q3
|
||||
vst1.8 {d4}, [r3,:64], r2
|
||||
vst1.8 {d6}, [r3,:64], r2
|
||||
bx lr
|
||||
endfunc
|
||||
51
externals/ffmpeg/libavcodec/arm/int_neon.S
vendored
Executable file
51
externals/ffmpeg/libavcodec/arm/int_neon.S
vendored
Executable file
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
* ARM NEON optimised integer operations
|
||||
* Copyright (c) 2009 Konstantin Shishkov
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_scalarproduct_int16_neon, export=1
|
||||
vmov.i16 q0, #0
|
||||
vmov.i16 q1, #0
|
||||
vmov.i16 q2, #0
|
||||
vmov.i16 q3, #0
|
||||
1: vld1.16 {d16-d17}, [r0]!
|
||||
vld1.16 {d20-d21}, [r1,:128]!
|
||||
vmlal.s16 q0, d16, d20
|
||||
vld1.16 {d18-d19}, [r0]!
|
||||
vmlal.s16 q1, d17, d21
|
||||
vld1.16 {d22-d23}, [r1,:128]!
|
||||
vmlal.s16 q2, d18, d22
|
||||
vmlal.s16 q3, d19, d23
|
||||
subs r2, r2, #16
|
||||
bgt 1b
|
||||
|
||||
vpadd.s32 d16, d0, d1
|
||||
vpadd.s32 d17, d2, d3
|
||||
vpadd.s32 d18, d4, d5
|
||||
vpadd.s32 d19, d6, d7
|
||||
vpadd.s32 d0, d16, d17
|
||||
vpadd.s32 d1, d18, d19
|
||||
vpadd.s32 d2, d0, d1
|
||||
vpaddl.s32 d3, d2
|
||||
vmov.32 r0, d3[0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
383
externals/ffmpeg/libavcodec/arm/jrevdct_arm.S
vendored
Executable file
383
externals/ffmpeg/libavcodec/arm/jrevdct_arm.S
vendored
Executable file
@@ -0,0 +1,383 @@
|
||||
/*
|
||||
C-like prototype :
|
||||
void j_rev_dct_arm(DCTBLOCK data)
|
||||
|
||||
With DCTBLOCK being a pointer to an array of 64 'signed shorts'
|
||||
|
||||
Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
#define FIX_0_298631336 2446
|
||||
#define FIX_0_541196100 4433
|
||||
#define FIX_0_765366865 6270
|
||||
#define FIX_1_175875602 9633
|
||||
#define FIX_1_501321110 12299
|
||||
#define FIX_2_053119869 16819
|
||||
#define FIX_3_072711026 25172
|
||||
#define FIX_M_0_390180644 -3196
|
||||
#define FIX_M_0_899976223 -7373
|
||||
#define FIX_M_1_847759065 -15137
|
||||
#define FIX_M_1_961570560 -16069
|
||||
#define FIX_M_2_562915447 -20995
|
||||
#define FIX_0xFFFF 0xFFFF
|
||||
|
||||
#define FIX_0_298631336_ID 0
|
||||
#define FIX_0_541196100_ID 4
|
||||
#define FIX_0_765366865_ID 8
|
||||
#define FIX_1_175875602_ID 12
|
||||
#define FIX_1_501321110_ID 16
|
||||
#define FIX_2_053119869_ID 20
|
||||
#define FIX_3_072711026_ID 24
|
||||
#define FIX_M_0_390180644_ID 28
|
||||
#define FIX_M_0_899976223_ID 32
|
||||
#define FIX_M_1_847759065_ID 36
|
||||
#define FIX_M_1_961570560_ID 40
|
||||
#define FIX_M_2_562915447_ID 44
|
||||
#define FIX_0xFFFF_ID 48
|
||||
|
||||
function ff_j_rev_dct_arm, export=1
|
||||
push {r0, r4 - r11, lr}
|
||||
|
||||
mov lr, r0 @ lr = pointer to the current row
|
||||
mov r12, #8 @ r12 = row-counter
|
||||
movrel r11, const_array @ r11 = base pointer to the constants array
|
||||
row_loop:
|
||||
ldrsh r0, [lr, # 0] @ r0 = 'd0'
|
||||
ldrsh r2, [lr, # 2] @ r2 = 'd2'
|
||||
|
||||
@ Optimization for row that have all items except the first set to 0
|
||||
@ (this works as the int16_t are always 4-byte aligned)
|
||||
ldr r5, [lr, # 0]
|
||||
ldr r6, [lr, # 4]
|
||||
ldr r3, [lr, # 8]
|
||||
ldr r4, [lr, #12]
|
||||
orr r3, r3, r4
|
||||
orr r3, r3, r6
|
||||
orrs r5, r3, r5
|
||||
beq end_of_row_loop @ nothing to be done as ALL of them are '0'
|
||||
orrs r3, r3, r2
|
||||
beq empty_row
|
||||
|
||||
ldrsh r1, [lr, # 8] @ r1 = 'd1'
|
||||
ldrsh r4, [lr, # 4] @ r4 = 'd4'
|
||||
ldrsh r6, [lr, # 6] @ r6 = 'd6'
|
||||
|
||||
ldr r3, [r11, #FIX_0_541196100_ID]
|
||||
add r7, r2, r6
|
||||
ldr r5, [r11, #FIX_M_1_847759065_ID]
|
||||
mul r7, r3, r7 @ r7 = z1
|
||||
ldr r3, [r11, #FIX_0_765366865_ID]
|
||||
mla r6, r5, r6, r7 @ r6 = tmp2
|
||||
add r5, r0, r4 @ r5 = tmp0
|
||||
mla r2, r3, r2, r7 @ r2 = tmp3
|
||||
sub r3, r0, r4 @ r3 = tmp1
|
||||
|
||||
add r0, r2, r5, lsl #13 @ r0 = tmp10
|
||||
rsb r2, r2, r5, lsl #13 @ r2 = tmp13
|
||||
add r4, r6, r3, lsl #13 @ r4 = tmp11
|
||||
rsb r3, r6, r3, lsl #13 @ r3 = tmp12
|
||||
|
||||
push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11
|
||||
|
||||
ldrsh r3, [lr, #10] @ r3 = 'd3'
|
||||
ldrsh r5, [lr, #12] @ r5 = 'd5'
|
||||
ldrsh r7, [lr, #14] @ r7 = 'd7'
|
||||
|
||||
add r0, r3, r5 @ r0 = 'z2'
|
||||
add r2, r1, r7 @ r2 = 'z1'
|
||||
add r4, r3, r7 @ r4 = 'z3'
|
||||
add r6, r1, r5 @ r6 = 'z4'
|
||||
ldr r9, [r11, #FIX_1_175875602_ID]
|
||||
add r8, r4, r6 @ r8 = z3 + z4
|
||||
ldr r10, [r11, #FIX_M_0_899976223_ID]
|
||||
mul r8, r9, r8 @ r8 = 'z5'
|
||||
ldr r9, [r11, #FIX_M_2_562915447_ID]
|
||||
mul r2, r10, r2 @ r2 = 'z1'
|
||||
ldr r10, [r11, #FIX_M_1_961570560_ID]
|
||||
mul r0, r9, r0 @ r0 = 'z2'
|
||||
ldr r9, [r11, #FIX_M_0_390180644_ID]
|
||||
mla r4, r10, r4, r8 @ r4 = 'z3'
|
||||
ldr r10, [r11, #FIX_0_298631336_ID]
|
||||
mla r6, r9, r6, r8 @ r6 = 'z4'
|
||||
ldr r9, [r11, #FIX_2_053119869_ID]
|
||||
mla r7, r10, r7, r2 @ r7 = tmp0 + z1
|
||||
ldr r10, [r11, #FIX_3_072711026_ID]
|
||||
mla r5, r9, r5, r0 @ r5 = tmp1 + z2
|
||||
ldr r9, [r11, #FIX_1_501321110_ID]
|
||||
mla r3, r10, r3, r0 @ r3 = tmp2 + z2
|
||||
add r7, r7, r4 @ r7 = tmp0
|
||||
mla r1, r9, r1, r2 @ r1 = tmp3 + z1
|
||||
add r5, r5, r6 @ r5 = tmp1
|
||||
add r3, r3, r4 @ r3 = tmp2
|
||||
add r1, r1, r6 @ r1 = tmp3
|
||||
|
||||
pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
|
||||
@ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
|
||||
|
||||
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
|
||||
add r8, r0, r1
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, # 0]
|
||||
|
||||
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
|
||||
sub r8, r0, r1
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, #14]
|
||||
|
||||
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
|
||||
add r8, r6, r3
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, # 2]
|
||||
|
||||
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
|
||||
sub r8, r6, r3
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, #12]
|
||||
|
||||
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
|
||||
add r8, r4, r5
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, # 4]
|
||||
|
||||
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
|
||||
sub r8, r4, r5
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, #10]
|
||||
|
||||
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
|
||||
add r8, r2, r7
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, # 6]
|
||||
|
||||
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
|
||||
sub r8, r2, r7
|
||||
add r8, r8, #(1<<10)
|
||||
mov r8, r8, asr #11
|
||||
strh r8, [lr, # 8]
|
||||
|
||||
@ End of row loop
|
||||
add lr, lr, #16
|
||||
subs r12, r12, #1
|
||||
bne row_loop
|
||||
beq start_column_loop
|
||||
|
||||
empty_row:
|
||||
ldr r1, [r11, #FIX_0xFFFF_ID]
|
||||
mov r0, r0, lsl #2
|
||||
and r0, r0, r1
|
||||
add r0, r0, r0, lsl #16
|
||||
str r0, [lr, # 0]
|
||||
str r0, [lr, # 4]
|
||||
str r0, [lr, # 8]
|
||||
str r0, [lr, #12]
|
||||
|
||||
end_of_row_loop:
|
||||
@ End of loop
|
||||
add lr, lr, #16
|
||||
subs r12, r12, #1
|
||||
bne row_loop
|
||||
|
||||
start_column_loop:
|
||||
@ Start of column loop
|
||||
pop {lr}
|
||||
mov r12, #8
|
||||
column_loop:
|
||||
ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
|
||||
ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
|
||||
ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
|
||||
ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
|
||||
|
||||
ldr r3, [r11, #FIX_0_541196100_ID]
|
||||
add r1, r2, r6
|
||||
ldr r5, [r11, #FIX_M_1_847759065_ID]
|
||||
mul r1, r3, r1 @ r1 = z1
|
||||
ldr r3, [r11, #FIX_0_765366865_ID]
|
||||
mla r6, r5, r6, r1 @ r6 = tmp2
|
||||
add r5, r0, r4 @ r5 = tmp0
|
||||
mla r2, r3, r2, r1 @ r2 = tmp3
|
||||
sub r3, r0, r4 @ r3 = tmp1
|
||||
|
||||
add r0, r2, r5, lsl #13 @ r0 = tmp10
|
||||
rsb r2, r2, r5, lsl #13 @ r2 = tmp13
|
||||
add r4, r6, r3, lsl #13 @ r4 = tmp11
|
||||
rsb r6, r6, r3, lsl #13 @ r6 = tmp12
|
||||
|
||||
ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
|
||||
ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
|
||||
ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
|
||||
ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
|
||||
|
||||
@ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
|
||||
orr r9, r1, r3
|
||||
orr r10, r5, r7
|
||||
orrs r10, r9, r10
|
||||
beq empty_odd_column
|
||||
|
||||
push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11
|
||||
|
||||
add r0, r3, r5 @ r0 = 'z2'
|
||||
add r2, r1, r7 @ r2 = 'z1'
|
||||
add r4, r3, r7 @ r4 = 'z3'
|
||||
add r6, r1, r5 @ r6 = 'z4'
|
||||
ldr r9, [r11, #FIX_1_175875602_ID]
|
||||
add r8, r4, r6
|
||||
ldr r10, [r11, #FIX_M_0_899976223_ID]
|
||||
mul r8, r9, r8 @ r8 = 'z5'
|
||||
ldr r9, [r11, #FIX_M_2_562915447_ID]
|
||||
mul r2, r10, r2 @ r2 = 'z1'
|
||||
ldr r10, [r11, #FIX_M_1_961570560_ID]
|
||||
mul r0, r9, r0 @ r0 = 'z2'
|
||||
ldr r9, [r11, #FIX_M_0_390180644_ID]
|
||||
mla r4, r10, r4, r8 @ r4 = 'z3'
|
||||
ldr r10, [r11, #FIX_0_298631336_ID]
|
||||
mla r6, r9, r6, r8 @ r6 = 'z4'
|
||||
ldr r9, [r11, #FIX_2_053119869_ID]
|
||||
mla r7, r10, r7, r2 @ r7 = tmp0 + z1
|
||||
ldr r10, [r11, #FIX_3_072711026_ID]
|
||||
mla r5, r9, r5, r0 @ r5 = tmp1 + z2
|
||||
ldr r9, [r11, #FIX_1_501321110_ID]
|
||||
mla r3, r10, r3, r0 @ r3 = tmp2 + z2
|
||||
add r7, r7, r4 @ r7 = tmp0
|
||||
mla r1, r9, r1, r2 @ r1 = tmp3 + z1
|
||||
add r5, r5, r6 @ r5 = tmp1
|
||||
add r3, r3, r4 @ r3 = tmp2
|
||||
add r1, r1, r6 @ r1 = tmp3
|
||||
|
||||
pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
|
||||
@ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
|
||||
|
||||
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
|
||||
add r8, r0, r1
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #( 0*8)]
|
||||
|
||||
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
|
||||
sub r8, r0, r1
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #(14*8)]
|
||||
|
||||
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
|
||||
add r8, r4, r3
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #( 2*8)]
|
||||
|
||||
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
|
||||
sub r8, r4, r3
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #(12*8)]
|
||||
|
||||
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
|
||||
add r8, r6, r5
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #( 4*8)]
|
||||
|
||||
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
|
||||
sub r8, r6, r5
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #(10*8)]
|
||||
|
||||
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
|
||||
add r8, r2, r7
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #( 6*8)]
|
||||
|
||||
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
|
||||
sub r8, r2, r7
|
||||
add r8, r8, #(1<<17)
|
||||
mov r8, r8, asr #18
|
||||
strh r8, [lr, #( 8*8)]
|
||||
|
||||
@ End of row loop
|
||||
add lr, lr, #2
|
||||
subs r12, r12, #1
|
||||
bne column_loop
|
||||
beq the_end
|
||||
|
||||
empty_odd_column:
|
||||
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
|
||||
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
|
||||
add r0, r0, #(1<<17)
|
||||
mov r0, r0, asr #18
|
||||
strh r0, [lr, #( 0*8)]
|
||||
strh r0, [lr, #(14*8)]
|
||||
|
||||
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
|
||||
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
|
||||
add r4, r4, #(1<<17)
|
||||
mov r4, r4, asr #18
|
||||
strh r4, [lr, #( 2*8)]
|
||||
strh r4, [lr, #(12*8)]
|
||||
|
||||
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
|
||||
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
|
||||
add r6, r6, #(1<<17)
|
||||
mov r6, r6, asr #18
|
||||
strh r6, [lr, #( 4*8)]
|
||||
strh r6, [lr, #(10*8)]
|
||||
|
||||
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
|
||||
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
|
||||
add r2, r2, #(1<<17)
|
||||
mov r2, r2, asr #18
|
||||
strh r2, [lr, #( 6*8)]
|
||||
strh r2, [lr, #( 8*8)]
|
||||
|
||||
@ End of row loop
|
||||
add lr, lr, #2
|
||||
subs r12, r12, #1
|
||||
bne column_loop
|
||||
|
||||
the_end:
|
||||
@ The end....
|
||||
pop {r4 - r11, pc}
|
||||
endfunc
|
||||
|
||||
const const_array
|
||||
.word FIX_0_298631336
|
||||
.word FIX_0_541196100
|
||||
.word FIX_0_765366865
|
||||
.word FIX_1_175875602
|
||||
.word FIX_1_501321110
|
||||
.word FIX_2_053119869
|
||||
.word FIX_3_072711026
|
||||
.word FIX_M_0_390180644
|
||||
.word FIX_M_0_899976223
|
||||
.word FIX_M_1_847759065
|
||||
.word FIX_M_1_961570560
|
||||
.word FIX_M_2_562915447
|
||||
.word FIX_0xFFFF
|
||||
endconst
|
||||
38
externals/ffmpeg/libavcodec/arm/lossless_audiodsp_init_arm.c
vendored
Executable file
38
externals/ffmpeg/libavcodec/arm/lossless_audiodsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/lossless_audiodsp.h"
|
||||
|
||||
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3, int len, int mul);
|
||||
|
||||
av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
|
||||
}
|
||||
}
|
||||
62
externals/ffmpeg/libavcodec/arm/lossless_audiodsp_neon.S
vendored
Executable file
62
externals/ffmpeg/libavcodec/arm/lossless_audiodsp_neon.S
vendored
Executable file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* ARM NEON optimised integer operations
|
||||
* Copyright (c) 2009 Kostya Shishkov
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
|
||||
function ff_scalarproduct_and_madd_int16_neon, export=1
|
||||
vld1.16 {d28[],d29[]}, [sp]
|
||||
vmov.i16 q0, #0
|
||||
vmov.i16 q1, #0
|
||||
vmov.i16 q2, #0
|
||||
vmov.i16 q3, #0
|
||||
mov r12, r0
|
||||
|
||||
1: vld1.16 {d16-d17}, [r0,:128]!
|
||||
vld1.16 {d18-d19}, [r1]!
|
||||
vld1.16 {d20-d21}, [r2]!
|
||||
vld1.16 {d22-d23}, [r0,:128]!
|
||||
vld1.16 {d24-d25}, [r1]!
|
||||
vld1.16 {d26-d27}, [r2]!
|
||||
vmul.s16 q10, q10, q14
|
||||
vmul.s16 q13, q13, q14
|
||||
vmlal.s16 q0, d16, d18
|
||||
vmlal.s16 q1, d17, d19
|
||||
vadd.s16 q10, q8, q10
|
||||
vadd.s16 q13, q11, q13
|
||||
vmlal.s16 q2, d22, d24
|
||||
vmlal.s16 q3, d23, d25
|
||||
vst1.16 {q10}, [r12,:128]!
|
||||
subs r3, r3, #16
|
||||
vst1.16 {q13}, [r12,:128]!
|
||||
bgt 1b
|
||||
|
||||
vpadd.s32 d16, d0, d1
|
||||
vpadd.s32 d17, d2, d3
|
||||
vpadd.s32 d18, d4, d5
|
||||
vpadd.s32 d19, d6, d7
|
||||
vpadd.s32 d0, d16, d17
|
||||
vpadd.s32 d1, d18, d19
|
||||
vpadd.s32 d2, d0, d1
|
||||
vpaddl.s32 d3, d2
|
||||
vmov.32 r0, d3[0]
|
||||
bx lr
|
||||
endfunc
|
||||
108
externals/ffmpeg/libavcodec/arm/mathops.h
vendored
Executable file
108
externals/ffmpeg/libavcodec/arm/mathops.h
vendored
Executable file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
* simple math operations
|
||||
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_MATHOPS_H
|
||||
#define AVCODEC_ARM_MATHOPS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "config.h"
|
||||
#include "libavutil/common.h"
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
|
||||
#if HAVE_ARMV6_INLINE
|
||||
#define MULH MULH
|
||||
static inline av_const int MULH(int a, int b)
|
||||
{
|
||||
int r;
|
||||
__asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
|
||||
return r;
|
||||
}
|
||||
|
||||
#define FASTDIV FASTDIV
|
||||
static av_always_inline av_const int FASTDIV(int a, int b)
|
||||
{
|
||||
int r;
|
||||
__asm__ ("cmp %2, #2 \n\t"
|
||||
"ldr %0, [%3, %2, lsl #2] \n\t"
|
||||
"ite le \n\t"
|
||||
"lsrle %0, %1, #1 \n\t"
|
||||
"smmulgt %0, %0, %1 \n\t"
|
||||
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
|
||||
return r;
|
||||
}
|
||||
|
||||
#else /* HAVE_ARMV6_INLINE */
|
||||
|
||||
#define FASTDIV FASTDIV
|
||||
static av_always_inline av_const int FASTDIV(int a, int b)
|
||||
{
|
||||
int r, t;
|
||||
__asm__ ("umull %1, %0, %2, %3"
|
||||
: "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b]));
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
|
||||
#define MLS64(d, a, b) MAC64(d, -(a), b)
|
||||
|
||||
#if HAVE_ARMV5TE_INLINE
|
||||
|
||||
/* signed 16x16 -> 32 multiply add accumulate */
|
||||
# define MAC16(rt, ra, rb) \
|
||||
__asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
|
||||
|
||||
/* signed 16x16 -> 32 multiply */
|
||||
# define MUL16 MUL16
|
||||
static inline av_const int MUL16(int ra, int rb)
|
||||
{
|
||||
int rt;
|
||||
__asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
|
||||
return rt;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define mid_pred mid_pred
|
||||
static inline av_const int mid_pred(int a, int b, int c)
|
||||
{
|
||||
int m;
|
||||
__asm__ (
|
||||
"mov %0, %2 \n\t"
|
||||
"cmp %1, %2 \n\t"
|
||||
"itt gt \n\t"
|
||||
"movgt %0, %1 \n\t"
|
||||
"movgt %1, %2 \n\t"
|
||||
"cmp %1, %3 \n\t"
|
||||
"it le \n\t"
|
||||
"movle %1, %3 \n\t"
|
||||
"cmp %0, %1 \n\t"
|
||||
"it gt \n\t"
|
||||
"movgt %0, %1 \n\t"
|
||||
: "=&r"(m), "+r"(a)
|
||||
: "r"(b), "r"(c)
|
||||
: "cc");
|
||||
return m;
|
||||
}
|
||||
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
|
||||
#endif /* AVCODEC_ARM_MATHOPS_H */
|
||||
193
externals/ffmpeg/libavcodec/arm/mdct_fixed_neon.S
vendored
Executable file
193
externals/ffmpeg/libavcodec/arm/mdct_fixed_neon.S
vendored
Executable file
@@ -0,0 +1,193 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
.macro prerot dst, rt
|
||||
lsr r3, r6, #2 @ n4
|
||||
add \rt, r4, r6, lsr #1 @ revtab + n4
|
||||
add r9, r3, r3, lsl #1 @ n3
|
||||
add r8, r7, r6 @ tcos + n4
|
||||
add r3, r2, r6, lsr #1 @ in + n4
|
||||
add r9, r2, r9, lsl #1 @ in + n3
|
||||
sub r8, r8, #16
|
||||
sub r10, r3, #16
|
||||
sub r11, r9, #16
|
||||
mov r12, #-16
|
||||
1:
|
||||
vld2.16 {d0,d1}, [r9, :128]!
|
||||
vld2.16 {d2,d3}, [r11,:128], r12
|
||||
vld2.16 {d4,d5}, [r3, :128]!
|
||||
vld2.16 {d6,d7}, [r10,:128], r12
|
||||
vld2.16 {d16,d17},[r7, :128]! @ cos, sin
|
||||
vld2.16 {d18,d19},[r8, :128], r12
|
||||
vrev64.16 q1, q1
|
||||
vrev64.16 q3, q3
|
||||
vrev64.16 q9, q9
|
||||
vneg.s16 d0, d0
|
||||
vneg.s16 d2, d2
|
||||
vneg.s16 d16, d16
|
||||
vneg.s16 d18, d18
|
||||
vhsub.s16 d0, d0, d3 @ re
|
||||
vhsub.s16 d4, d7, d4 @ im
|
||||
vhsub.s16 d6, d6, d5
|
||||
vhsub.s16 d2, d2, d1
|
||||
vmull.s16 q10, d0, d16
|
||||
vmlsl.s16 q10, d4, d17
|
||||
vmull.s16 q11, d0, d17
|
||||
vmlal.s16 q11, d4, d16
|
||||
vmull.s16 q12, d6, d18
|
||||
vmlsl.s16 q12, d2, d19
|
||||
vmull.s16 q13, d6, d19
|
||||
vmlal.s16 q13, d2, d18
|
||||
vshrn.s32 d0, q10, #15
|
||||
vshrn.s32 d1, q11, #15
|
||||
vshrn.s32 d2, q12, #15
|
||||
vshrn.s32 d3, q13, #15
|
||||
vzip.16 d0, d1
|
||||
vzip.16 d2, d3
|
||||
ldrh lr, [r4], #2
|
||||
ldrh r2, [\rt, #-2]!
|
||||
add lr, \dst, lr, lsl #2
|
||||
add r2, \dst, r2, lsl #2
|
||||
vst1.32 {d0[0]}, [lr,:32]
|
||||
vst1.32 {d2[0]}, [r2,:32]
|
||||
ldrh lr, [r4], #2
|
||||
ldrh r2, [\rt, #-2]!
|
||||
add lr, \dst, lr, lsl #2
|
||||
add r2, \dst, r2, lsl #2
|
||||
vst1.32 {d0[1]}, [lr,:32]
|
||||
vst1.32 {d2[1]}, [r2,:32]
|
||||
ldrh lr, [r4], #2
|
||||
ldrh r2, [\rt, #-2]!
|
||||
add lr, \dst, lr, lsl #2
|
||||
add r2, \dst, r2, lsl #2
|
||||
vst1.32 {d1[0]}, [lr,:32]
|
||||
vst1.32 {d3[0]}, [r2,:32]
|
||||
ldrh lr, [r4], #2
|
||||
ldrh r2, [\rt, #-2]!
|
||||
add lr, \dst, lr, lsl #2
|
||||
add r2, \dst, r2, lsl #2
|
||||
vst1.32 {d1[1]}, [lr,:32]
|
||||
vst1.32 {d3[1]}, [r2,:32]
|
||||
subs r6, r6, #32
|
||||
bgt 1b
|
||||
.endm
|
||||
|
||||
function ff_mdct_fixed_calc_neon, export=1
|
||||
push {r1,r4-r11,lr}
|
||||
|
||||
ldr r4, [r0, #8] @ revtab
|
||||
ldr r6, [r0, #16] @ mdct_size; n
|
||||
ldr r7, [r0, #24] @ tcos
|
||||
|
||||
prerot r1, r5
|
||||
|
||||
mov r4, r0
|
||||
bl X(ff_fft_fixed_calc_neon)
|
||||
|
||||
pop {r5}
|
||||
mov r12, #-16
|
||||
ldr r6, [r4, #16] @ mdct_size; n
|
||||
ldr r7, [r4, #24] @ tcos
|
||||
add r5, r5, r6, lsr #1
|
||||
add r7, r7, r6, lsr #1
|
||||
sub r1, r5, #16
|
||||
sub r2, r7, #16
|
||||
1:
|
||||
vld2.16 {d4,d5}, [r7,:128]!
|
||||
vld2.16 {d6,d7}, [r2,:128], r12
|
||||
vld2.16 {d0,d1}, [r5,:128]
|
||||
vld2.16 {d2,d3}, [r1,:128]
|
||||
vrev64.16 q3, q3
|
||||
vrev64.16 q1, q1
|
||||
vneg.s16 q3, q3
|
||||
vneg.s16 q2, q2
|
||||
vmull.s16 q11, d2, d6
|
||||
vmlal.s16 q11, d3, d7
|
||||
vmull.s16 q8, d0, d5
|
||||
vmlsl.s16 q8, d1, d4
|
||||
vmull.s16 q9, d0, d4
|
||||
vmlal.s16 q9, d1, d5
|
||||
vmull.s16 q10, d2, d7
|
||||
vmlsl.s16 q10, d3, d6
|
||||
vshrn.s32 d0, q11, #15
|
||||
vshrn.s32 d1, q8, #15
|
||||
vshrn.s32 d2, q9, #15
|
||||
vshrn.s32 d3, q10, #15
|
||||
vrev64.16 q0, q0
|
||||
vst2.16 {d2,d3}, [r5,:128]!
|
||||
vst2.16 {d0,d1}, [r1,:128], r12
|
||||
subs r6, r6, #32
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
|
||||
function ff_mdct_fixed_calcw_neon, export=1
|
||||
push {r1,r4-r11,lr}
|
||||
|
||||
ldrd r4, r5, [r0, #8] @ revtab, tmp_buf
|
||||
ldr r6, [r0, #16] @ mdct_size; n
|
||||
ldr r7, [r0, #24] @ tcos
|
||||
|
||||
prerot r5, r1
|
||||
|
||||
mov r4, r0
|
||||
mov r1, r5
|
||||
bl X(ff_fft_fixed_calc_neon)
|
||||
|
||||
pop {r7}
|
||||
mov r12, #-16
|
||||
ldr r6, [r4, #16] @ mdct_size; n
|
||||
ldr r9, [r4, #24] @ tcos
|
||||
add r5, r5, r6, lsr #1
|
||||
add r7, r7, r6
|
||||
add r9, r9, r6, lsr #1
|
||||
sub r3, r5, #16
|
||||
sub r1, r7, #16
|
||||
sub r2, r9, #16
|
||||
1:
|
||||
vld2.16 {d4,d5}, [r9,:128]!
|
||||
vld2.16 {d6,d7}, [r2,:128], r12
|
||||
vld2.16 {d0,d1}, [r5,:128]!
|
||||
vld2.16 {d2,d3}, [r3,:128], r12
|
||||
vrev64.16 q3, q3
|
||||
vrev64.16 q1, q1
|
||||
vneg.s16 q3, q3
|
||||
vneg.s16 q2, q2
|
||||
vmull.s16 q8, d2, d6
|
||||
vmlal.s16 q8, d3, d7
|
||||
vmull.s16 q9, d0, d5
|
||||
vmlsl.s16 q9, d1, d4
|
||||
vmull.s16 q10, d0, d4
|
||||
vmlal.s16 q10, d1, d5
|
||||
vmull.s16 q11, d2, d7
|
||||
vmlsl.s16 q11, d3, d6
|
||||
vrev64.32 q8, q8
|
||||
vrev64.32 q9, q9
|
||||
vst2.32 {q10,q11},[r7,:128]!
|
||||
vst2.32 {d16,d18},[r1,:128], r12
|
||||
vst2.32 {d17,d19},[r1,:128], r12
|
||||
subs r6, r6, #32
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
301
externals/ffmpeg/libavcodec/arm/mdct_neon.S
vendored
Executable file
301
externals/ffmpeg/libavcodec/arm/mdct_neon.S
vendored
Executable file
@@ -0,0 +1,301 @@
|
||||
/*
|
||||
* ARM NEON optimised MDCT
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
#define ff_fft_calc_neon X(ff_fft_calc_neon)
|
||||
|
||||
function ff_imdct_half_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r0, #20] @ mdct_bits
|
||||
ldr r4, [r0, #24] @ tcos
|
||||
ldr r3, [r0, #8] @ revtab
|
||||
lsl r12, r12, lr @ n = 1 << nbits
|
||||
lsr lr, r12, #2 @ n4 = n >> 2
|
||||
add r7, r2, r12, lsl #1
|
||||
mov r12, #-16
|
||||
sub r7, r7, #16
|
||||
|
||||
vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
|
||||
vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
|
||||
vrev64.32 d17, d17
|
||||
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
|
||||
vmul.f32 d6, d17, d2
|
||||
vmul.f32 d7, d0, d2
|
||||
1:
|
||||
subs lr, lr, #2
|
||||
ldr r6, [r3], #4
|
||||
vmul.f32 d4, d0, d3
|
||||
vmul.f32 d5, d17, d3
|
||||
vsub.f32 d4, d6, d4
|
||||
vadd.f32 d5, d5, d7
|
||||
uxth r8, r6, ror #16
|
||||
uxth r6, r6
|
||||
add r8, r1, r8, lsl #3
|
||||
add r6, r1, r6, lsl #3
|
||||
beq 1f
|
||||
vld2.32 {d16-d17},[r7,:128],r12
|
||||
vld2.32 {d0-d1}, [r2,:128]!
|
||||
vrev64.32 d17, d17
|
||||
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
|
||||
vmul.f32 d6, d17, d2
|
||||
vmul.f32 d7, d0, d2
|
||||
vst2.32 {d4[0],d5[0]}, [r6,:64]
|
||||
vst2.32 {d4[1],d5[1]}, [r8,:64]
|
||||
b 1b
|
||||
1:
|
||||
vst2.32 {d4[0],d5[0]}, [r6,:64]
|
||||
vst2.32 {d4[1],d5[1]}, [r8,:64]
|
||||
|
||||
mov r4, r0
|
||||
mov r6, r1
|
||||
bl ff_fft_calc_neon
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r4, #20] @ mdct_bits
|
||||
ldr r4, [r4, #24] @ tcos
|
||||
lsl r12, r12, lr @ n = 1 << nbits
|
||||
lsr lr, r12, #3 @ n8 = n >> 3
|
||||
|
||||
add r4, r4, lr, lsl #3
|
||||
add r6, r6, lr, lsl #3
|
||||
sub r1, r4, #16
|
||||
sub r3, r6, #16
|
||||
|
||||
mov r7, #-16
|
||||
mov r8, r6
|
||||
mov r0, r3
|
||||
|
||||
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
|
||||
vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
|
||||
1:
|
||||
subs lr, lr, #2
|
||||
vmul.f32 d7, d0, d18
|
||||
vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
|
||||
vmul.f32 d4, d1, d18
|
||||
vmul.f32 d5, d21, d19
|
||||
vmul.f32 d6, d20, d19
|
||||
vmul.f32 d22, d1, d16
|
||||
vmul.f32 d23, d21, d17
|
||||
vmul.f32 d24, d0, d16
|
||||
vmul.f32 d25, d20, d17
|
||||
vadd.f32 d7, d7, d22
|
||||
vadd.f32 d6, d6, d23
|
||||
vsub.f32 d4, d4, d24
|
||||
vsub.f32 d5, d5, d25
|
||||
beq 1f
|
||||
vld2.32 {d0-d1}, [r3,:128], r7
|
||||
vld2.32 {d20-d21},[r6,:128]!
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128], r7
|
||||
vst2.32 {d5,d7}, [r8,:128]!
|
||||
b 1b
|
||||
1:
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128]
|
||||
vst2.32 {d5,d7}, [r8,:128]
|
||||
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
|
||||
function ff_imdct_calc_neon, export=1
|
||||
push {r4-r6,lr}
|
||||
|
||||
ldr r3, [r0, #20]
|
||||
mov r4, #1
|
||||
mov r5, r1
|
||||
lsl r4, r4, r3
|
||||
add r1, r1, r4
|
||||
|
||||
bl X(ff_imdct_half_neon)
|
||||
|
||||
add r0, r5, r4, lsl #2
|
||||
add r1, r5, r4, lsl #1
|
||||
sub r0, r0, #8
|
||||
sub r2, r1, #16
|
||||
mov r3, #-16
|
||||
mov r6, #-8
|
||||
vmov.i32 d30, #1<<31
|
||||
1:
|
||||
vld1.32 {d0-d1}, [r2,:128], r3
|
||||
pld [r0, #-16]
|
||||
vrev64.32 q0, q0
|
||||
vld1.32 {d2-d3}, [r1,:128]!
|
||||
veor d4, d1, d30
|
||||
pld [r2, #-16]
|
||||
vrev64.32 q1, q1
|
||||
veor d5, d0, d30
|
||||
vst1.32 {d2}, [r0,:64], r6
|
||||
vst1.32 {d3}, [r0,:64], r6
|
||||
vst1.32 {d4-d5}, [r5,:128]!
|
||||
subs r4, r4, #16
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r6,pc}
|
||||
endfunc
|
||||
|
||||
function ff_mdct_calc_neon, export=1
|
||||
push {r4-r10,lr}
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r0, #20] @ mdct_bits
|
||||
ldr r4, [r0, #24] @ tcos
|
||||
ldr r3, [r0, #8] @ revtab
|
||||
lsl lr, r12, lr @ n = 1 << nbits
|
||||
add r7, r2, lr @ in4u
|
||||
sub r9, r7, #16 @ in4d
|
||||
add r2, r7, lr, lsl #1 @ in3u
|
||||
add r8, r9, lr, lsl #1 @ in3d
|
||||
add r5, r4, lr, lsl #1
|
||||
sub r5, r5, #16
|
||||
sub r3, r3, #4
|
||||
mov r12, #-16
|
||||
|
||||
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
|
||||
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
|
||||
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
|
||||
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
|
||||
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
|
||||
vsub.f32 d0, d18, d0 @ in4d-in4u I
|
||||
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
|
||||
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
|
||||
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
|
||||
vadd.f32 d1, d1, d19 @ in3u+in3d -R
|
||||
vsub.f32 d16, d16, d2 @ in0u-in2d R
|
||||
vadd.f32 d17, d17, d3 @ in2u+in1d -I
|
||||
1:
|
||||
vmul.f32 d7, d0, d21 @ I*s
|
||||
A ldr r10, [r3, lr, lsr #1]
|
||||
T lsr r10, lr, #1
|
||||
T ldr r10, [r3, r10]
|
||||
vmul.f32 d6, d1, d20 @ -R*c
|
||||
ldr r6, [r3, #4]!
|
||||
vmul.f32 d4, d1, d21 @ -R*s
|
||||
vmul.f32 d5, d0, d20 @ I*c
|
||||
vmul.f32 d24, d16, d30 @ R*c
|
||||
vmul.f32 d25, d17, d31 @ -I*s
|
||||
vmul.f32 d22, d16, d31 @ R*s
|
||||
vmul.f32 d23, d17, d30 @ I*c
|
||||
subs lr, lr, #16
|
||||
vsub.f32 d6, d6, d7 @ -R*c-I*s
|
||||
vadd.f32 d7, d4, d5 @ -R*s+I*c
|
||||
vsub.f32 d24, d25, d24 @ I*s-R*c
|
||||
vadd.f32 d25, d22, d23 @ R*s-I*c
|
||||
beq 1f
|
||||
mov r12, #-16
|
||||
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
|
||||
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
|
||||
vneg.f32 d7, d7 @ R*s-I*c
|
||||
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
|
||||
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
|
||||
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
|
||||
vsub.f32 d0, d18, d0 @ in4d-in4u I
|
||||
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
|
||||
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
|
||||
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
|
||||
vadd.f32 d1, d1, d19 @ in3u+in3d -R
|
||||
vsub.f32 d16, d16, d2 @ in0u-in2d R
|
||||
vadd.f32 d17, d17, d3 @ in2u+in1d -I
|
||||
uxth r12, r6, ror #16
|
||||
uxth r6, r6
|
||||
add r12, r1, r12, lsl #3
|
||||
add r6, r1, r6, lsl #3
|
||||
vst2.32 {d6[0],d7[0]}, [r6,:64]
|
||||
vst2.32 {d6[1],d7[1]}, [r12,:64]
|
||||
uxth r6, r10, ror #16
|
||||
uxth r10, r10
|
||||
add r6 , r1, r6, lsl #3
|
||||
add r10, r1, r10, lsl #3
|
||||
vst2.32 {d24[0],d25[0]},[r10,:64]
|
||||
vst2.32 {d24[1],d25[1]},[r6,:64]
|
||||
b 1b
|
||||
1:
|
||||
vneg.f32 d7, d7 @ R*s-I*c
|
||||
uxth r12, r6, ror #16
|
||||
uxth r6, r6
|
||||
add r12, r1, r12, lsl #3
|
||||
add r6, r1, r6, lsl #3
|
||||
vst2.32 {d6[0],d7[0]}, [r6,:64]
|
||||
vst2.32 {d6[1],d7[1]}, [r12,:64]
|
||||
uxth r6, r10, ror #16
|
||||
uxth r10, r10
|
||||
add r6 , r1, r6, lsl #3
|
||||
add r10, r1, r10, lsl #3
|
||||
vst2.32 {d24[0],d25[0]},[r10,:64]
|
||||
vst2.32 {d24[1],d25[1]},[r6,:64]
|
||||
|
||||
mov r4, r0
|
||||
mov r6, r1
|
||||
bl ff_fft_calc_neon
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r4, #20] @ mdct_bits
|
||||
ldr r4, [r4, #24] @ tcos
|
||||
lsl r12, r12, lr @ n = 1 << nbits
|
||||
lsr lr, r12, #3 @ n8 = n >> 3
|
||||
|
||||
add r4, r4, lr, lsl #3
|
||||
add r6, r6, lr, lsl #3
|
||||
sub r1, r4, #16
|
||||
sub r3, r6, #16
|
||||
|
||||
mov r7, #-16
|
||||
mov r8, r6
|
||||
mov r0, r3
|
||||
|
||||
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
|
||||
vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
|
||||
1:
|
||||
subs lr, lr, #2
|
||||
vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
|
||||
vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
|
||||
vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
|
||||
vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
|
||||
vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
|
||||
vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
|
||||
vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
|
||||
vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
|
||||
vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
|
||||
vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
|
||||
vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
|
||||
vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
|
||||
vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
|
||||
vneg.f32 q2, q2
|
||||
beq 1f
|
||||
vld2.32 {d0-d1}, [r3,:128], r7
|
||||
vld2.32 {d20-d21},[r6,:128]!
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128], r7
|
||||
vst2.32 {d5,d7}, [r8,:128]!
|
||||
b 1b
|
||||
1:
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128]
|
||||
vst2.32 {d5,d7}, [r8,:128]
|
||||
|
||||
pop {r4-r10,pc}
|
||||
endfunc
|
||||
347
externals/ffmpeg/libavcodec/arm/mdct_vfp.S
vendored
Executable file
347
externals/ffmpeg/libavcodec/arm/mdct_vfp.S
vendored
Executable file
@@ -0,0 +1,347 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
CONTEXT .req a1
|
||||
ORIGOUT .req a2
|
||||
IN .req a3
|
||||
OUT .req v1
|
||||
REVTAB .req v2
|
||||
TCOS .req v3
|
||||
TSIN .req v4
|
||||
OLDFPSCR .req v5
|
||||
J0 .req a2
|
||||
J1 .req a4
|
||||
J2 .req ip
|
||||
J3 .req lr
|
||||
REVTAB_HI .req v5
|
||||
IN_HI .req v6
|
||||
OUT_HI .req v6
|
||||
TCOS_HI .req sl
|
||||
TSIN_HI .req fp
|
||||
|
||||
.macro prerotation_innerloop
|
||||
.set trig_lo, k
|
||||
.set trig_hi, n4 - k - 2
|
||||
.set in_lo, trig_lo * 2
|
||||
.set in_hi, trig_hi * 2
|
||||
vldr d8, [TCOS, #trig_lo*4] @ s16,s17
|
||||
vldr d9, [TCOS, #trig_hi*4] @ s18,s19
|
||||
vldr s0, [IN, #in_hi*4 + 12]
|
||||
vldr s1, [IN, #in_hi*4 + 4]
|
||||
vldr s2, [IN, #in_lo*4 + 12]
|
||||
vldr s3, [IN, #in_lo*4 + 4]
|
||||
vmul.f s8, s0, s16 @ vector operation
|
||||
vldr d10, [TSIN, #trig_lo*4] @ s20,s21
|
||||
vldr d11, [TSIN, #trig_hi*4] @ s22,s23
|
||||
vldr s4, [IN, #in_lo*4]
|
||||
vldr s5, [IN, #in_lo*4 + 8]
|
||||
vldr s6, [IN, #in_hi*4]
|
||||
vldr s7, [IN, #in_hi*4 + 8]
|
||||
ldr J0, [REVTAB, #trig_lo*2]
|
||||
vmul.f s12, s0, s20 @ vector operation
|
||||
ldr J2, [REVTAB, #trig_hi*2]
|
||||
mov J1, J0, lsr #16
|
||||
and J0, J0, #255 @ halfword value will be < n4
|
||||
vmls.f s8, s4, s20 @ vector operation
|
||||
mov J3, J2, lsr #16
|
||||
and J2, J2, #255 @ halfword value will be < n4
|
||||
add J0, OUT, J0, lsl #3
|
||||
vmla.f s12, s4, s16 @ vector operation
|
||||
add J1, OUT, J1, lsl #3
|
||||
add J2, OUT, J2, lsl #3
|
||||
add J3, OUT, J3, lsl #3
|
||||
vstr s8, [J0]
|
||||
vstr s9, [J1]
|
||||
vstr s10, [J2]
|
||||
vstr s11, [J3]
|
||||
vstr s12, [J0, #4]
|
||||
vstr s13, [J1, #4]
|
||||
vstr s14, [J2, #4]
|
||||
vstr s15, [J3, #4]
|
||||
.set k, k + 2
|
||||
.endm
|
||||
|
||||
.macro prerotation_innerloop_rolled
|
||||
vldmia TCOS!, {s16,s17}
|
||||
vldmdb TCOS_HI!, {s18,s19}
|
||||
vldr s0, [IN_HI, #-4]
|
||||
vldr s1, [IN_HI, #-12]
|
||||
vldr s2, [IN, #12]
|
||||
vldr s3, [IN, #4]
|
||||
vmul.f s8, s0, s16 @ vector operation
|
||||
vldmia TSIN!, {s20,s21}
|
||||
vldmdb TSIN_HI!, {s22,s23}
|
||||
vldr s4, [IN]
|
||||
vldr s5, [IN, #8]
|
||||
vldr s6, [IN_HI, #-16]
|
||||
vldr s7, [IN_HI, #-8]
|
||||
vmul.f s12, s0, s20 @ vector operation
|
||||
add IN, IN, #16
|
||||
sub IN_HI, IN_HI, #16
|
||||
ldrh J0, [REVTAB], #2
|
||||
ldrh J1, [REVTAB], #2
|
||||
vmls.f s8, s4, s20 @ vector operation
|
||||
ldrh J3, [REVTAB_HI, #-2]!
|
||||
ldrh J2, [REVTAB_HI, #-2]!
|
||||
add J0, OUT, J0, lsl #3
|
||||
vmla.f s12, s4, s16 @ vector operation
|
||||
add J1, OUT, J1, lsl #3
|
||||
add J2, OUT, J2, lsl #3
|
||||
add J3, OUT, J3, lsl #3
|
||||
vstr s8, [J0]
|
||||
vstr s9, [J1]
|
||||
vstr s10, [J2]
|
||||
vstr s11, [J3]
|
||||
vstr s12, [J0, #4]
|
||||
vstr s13, [J1, #4]
|
||||
vstr s14, [J2, #4]
|
||||
vstr s15, [J3, #4]
|
||||
.endm
|
||||
|
||||
.macro postrotation_innerloop tail, head
|
||||
.set trig_lo_head, n8 - k - 2
|
||||
.set trig_hi_head, n8 + k
|
||||
.set out_lo_head, trig_lo_head * 2
|
||||
.set out_hi_head, trig_hi_head * 2
|
||||
.set trig_lo_tail, n8 - (k - 2) - 2
|
||||
.set trig_hi_tail, n8 + (k - 2)
|
||||
.set out_lo_tail, trig_lo_tail * 2
|
||||
.set out_hi_tail, trig_hi_tail * 2
|
||||
.if (k & 2) == 0
|
||||
TCOS_D0_HEAD .req d10 @ s20,s21
|
||||
TCOS_D1_HEAD .req d11 @ s22,s23
|
||||
TCOS_S0_TAIL .req s24
|
||||
.else
|
||||
TCOS_D0_HEAD .req d12 @ s24,s25
|
||||
TCOS_D1_HEAD .req d13 @ s26,s27
|
||||
TCOS_S0_TAIL .req s20
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
|
||||
vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
|
||||
vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldr s0, [OUT, #out_lo_head*4]
|
||||
vldr s1, [OUT, #out_lo_head*4 + 8]
|
||||
vldr s2, [OUT, #out_hi_head*4]
|
||||
vldr s3, [OUT, #out_hi_head*4 + 8]
|
||||
vldr s4, [OUT, #out_lo_head*4 + 4]
|
||||
vldr s5, [OUT, #out_lo_head*4 + 12]
|
||||
vldr s6, [OUT, #out_hi_head*4 + 4]
|
||||
vldr s7, [OUT, #out_hi_head*4 + 12]
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s8, [OUT, #out_lo_tail*4]
|
||||
vstr s9, [OUT, #out_lo_tail*4 + 8]
|
||||
vstr s10, [OUT, #out_hi_tail*4]
|
||||
vstr s11, [OUT, #out_hi_tail*4 + 8]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s8, s4, s16 @ vector operation
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s12, [OUT, #out_hi_tail*4 + 12]
|
||||
vstr s13, [OUT, #out_hi_tail*4 + 4]
|
||||
vstr s14, [OUT, #out_lo_tail*4 + 12]
|
||||
vstr s15, [OUT, #out_lo_tail*4 + 4]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s12, s0, s16 @ vector operation
|
||||
vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
|
||||
.endif
|
||||
.unreq TCOS_D0_HEAD
|
||||
.unreq TCOS_D1_HEAD
|
||||
.unreq TCOS_S0_TAIL
|
||||
.ifnc "\head",""
|
||||
.set k, k + 2
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
|
||||
.ifnc "\tail",""
|
||||
vmls.f s8, s0, \tcos_s0_tail @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldmia TSIN!, {s16,s17}
|
||||
vldmdb TSIN_HI!, {s18,s19}
|
||||
vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vmla.f s12, s4, \tcos_s0_tail @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldr s0, [OUT, #+\out_offset_head+0]
|
||||
vldr s1, [OUT, #+\out_offset_head+8]
|
||||
vldr s2, [OUT_HI, #-\out_offset_head-16]
|
||||
vldr s3, [OUT_HI, #-\out_offset_head-8]
|
||||
vldr s4, [OUT, #+\out_offset_head+4]
|
||||
vldr s5, [OUT, #+\out_offset_head+12]
|
||||
vldr s6, [OUT_HI, #-\out_offset_head-12]
|
||||
vldr s7, [OUT_HI, #-\out_offset_head-4]
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s8, [OUT, #+\out_offset_tail+0]
|
||||
vstr s9, [OUT, #+\out_offset_tail+8]
|
||||
vstr s10, [OUT_HI, #-\out_offset_tail-16]
|
||||
vstr s11, [OUT_HI, #-\out_offset_tail-8]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s8, s4, s16 @ vector operation
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s12, [OUT_HI, #-\out_offset_tail-4]
|
||||
vstr s13, [OUT_HI, #-\out_offset_tail-12]
|
||||
vstr s14, [OUT, #+\out_offset_tail+12]
|
||||
vstr s15, [OUT, #+\out_offset_tail+4]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s12, s0, s16 @ vector operation
|
||||
vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
||||
/* void ff_imdct_half_vfp(FFTContext *s,
|
||||
* FFTSample *output,
|
||||
* const FFTSample *input)
|
||||
*/
|
||||
function ff_imdct_half_vfp, export=1
|
||||
ldr ip, [CONTEXT, #5*4] @ mdct_bits
|
||||
teq ip, #6
|
||||
bne 10f
|
||||
|
||||
.set n, 1<<6
|
||||
.set n2, n/2
|
||||
.set n4, n/4
|
||||
.set n8, n/8
|
||||
|
||||
push {v1-v5,lr}
|
||||
vpush {s16-s27}
|
||||
fmrx OLDFPSCR, FPSCR
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
mov OUT, ORIGOUT
|
||||
ldr REVTAB, [CONTEXT, #2*4]
|
||||
ldr TCOS, [CONTEXT, #6*4]
|
||||
ldr TSIN, [CONTEXT, #7*4]
|
||||
|
||||
.set k, 0
|
||||
.rept n8/2
|
||||
prerotation_innerloop
|
||||
.endr
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
mov a1, OUT
|
||||
bl X(ff_fft16_vfp)
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
|
||||
.set k, 0
|
||||
postrotation_innerloop , head
|
||||
.rept n8/2 - 1
|
||||
postrotation_innerloop tail, head
|
||||
.endr
|
||||
postrotation_innerloop tail
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
vpop {s16-s27}
|
||||
pop {v1-v5,pc}
|
||||
|
||||
10:
|
||||
push {v1-v6,sl,fp,lr}
|
||||
vpush {s16-s27}
|
||||
fmrx OLDFPSCR, FPSCR
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
mov lr, #1
|
||||
mov OUT, ORIGOUT
|
||||
ldr REVTAB, [CONTEXT, #2*4]
|
||||
ldr TCOS, [CONTEXT, #6*4]
|
||||
ldr TSIN, [CONTEXT, #7*4]
|
||||
mov lr, lr, lsl ip
|
||||
|
||||
push {CONTEXT,OLDFPSCR}
|
||||
add IN_HI, IN, lr, lsl #1
|
||||
add REVTAB_HI, REVTAB, lr, lsr #1
|
||||
add TCOS_HI, TCOS, lr
|
||||
add TSIN_HI, TSIN, lr
|
||||
0: prerotation_innerloop_rolled
|
||||
teq IN, IN_HI
|
||||
bne 0b
|
||||
ldmia sp, {CONTEXT,OLDFPSCR}
|
||||
|
||||
mov ORIGOUT, OUT
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
ldr ip, [CONTEXT, #9*4]
|
||||
blx ip @ s->fft_calc(s, output)
|
||||
|
||||
pop {CONTEXT,OLDFPSCR}
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
ldr ip, [CONTEXT, #5*4] @ mdct_bits
|
||||
fmxr FPSCR, lr
|
||||
mov lr, #1
|
||||
mov lr, lr, lsl ip
|
||||
sub TCOS, TCOS, lr, lsr #1
|
||||
sub TSIN, TSIN, lr, lsr #1
|
||||
add OUT_HI, OUT, lr, lsl #1
|
||||
add TCOS_HI, TCOS, lr
|
||||
add TSIN_HI, TSIN, lr
|
||||
postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
|
||||
b 1f
|
||||
0: add OUT, OUT, #32
|
||||
sub OUT_HI, OUT_HI, #32
|
||||
postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
|
||||
1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
|
||||
teq TSIN, TSIN_HI
|
||||
bne 0b
|
||||
postrotation_innerloop_rolled tail,,,,,, s24,, 16
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
vpop {s16-s27}
|
||||
pop {v1-v6,sl,fp,pc}
|
||||
endfunc
|
||||
|
||||
.unreq CONTEXT
|
||||
.unreq ORIGOUT
|
||||
.unreq IN
|
||||
.unreq OUT
|
||||
.unreq REVTAB
|
||||
.unreq TCOS
|
||||
.unreq TSIN
|
||||
.unreq OLDFPSCR
|
||||
.unreq J0
|
||||
.unreq J1
|
||||
.unreq J2
|
||||
.unreq J3
|
||||
.unreq REVTAB_HI
|
||||
.unreq IN_HI
|
||||
.unreq OUT_HI
|
||||
.unreq TCOS_HI
|
||||
.unreq TSIN_HI
|
||||
244
externals/ffmpeg/libavcodec/arm/me_cmp_armv6.S
vendored
Executable file
244
externals/ffmpeg/libavcodec/arm/me_cmp_armv6.S
vendored
Executable file
@@ -0,0 +1,244 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_pix_abs16_armv6, export=1
|
||||
ldr r0, [sp]
|
||||
push {r4-r9, lr}
|
||||
mov r12, #0
|
||||
mov lr, #0
|
||||
ldm r1, {r4-r7}
|
||||
ldr r8, [r2]
|
||||
1:
|
||||
ldr r9, [r2, #4]
|
||||
pld [r1, r3]
|
||||
usada8 r12, r4, r8, r12
|
||||
ldr r8, [r2, #8]
|
||||
pld [r2, r3]
|
||||
usada8 lr, r5, r9, lr
|
||||
ldr r9, [r2, #12]
|
||||
usada8 r12, r6, r8, r12
|
||||
subs r0, r0, #1
|
||||
usada8 lr, r7, r9, lr
|
||||
beq 2f
|
||||
add r1, r1, r3
|
||||
ldm r1, {r4-r7}
|
||||
add r2, r2, r3
|
||||
ldr r8, [r2]
|
||||
b 1b
|
||||
2:
|
||||
add r0, r12, lr
|
||||
pop {r4-r9, pc}
|
||||
endfunc
|
||||
|
||||
function ff_pix_abs16_x2_armv6, export=1
|
||||
ldr r12, [sp]
|
||||
push {r4-r11, lr}
|
||||
mov r0, #0
|
||||
mov lr, #1
|
||||
orr lr, lr, lr, lsl #8
|
||||
orr lr, lr, lr, lsl #16
|
||||
1:
|
||||
ldr r8, [r2]
|
||||
ldr r9, [r2, #4]
|
||||
lsr r10, r8, #8
|
||||
ldr r4, [r1]
|
||||
lsr r6, r9, #8
|
||||
orr r10, r10, r9, lsl #24
|
||||
ldr r5, [r2, #8]
|
||||
eor r11, r8, r10
|
||||
uhadd8 r7, r8, r10
|
||||
orr r6, r6, r5, lsl #24
|
||||
and r11, r11, lr
|
||||
uadd8 r7, r7, r11
|
||||
ldr r8, [r1, #4]
|
||||
usada8 r0, r4, r7, r0
|
||||
eor r7, r9, r6
|
||||
lsr r10, r5, #8
|
||||
and r7, r7, lr
|
||||
uhadd8 r4, r9, r6
|
||||
ldr r6, [r2, #12]
|
||||
uadd8 r4, r4, r7
|
||||
pld [r1, r3]
|
||||
orr r10, r10, r6, lsl #24
|
||||
usada8 r0, r8, r4, r0
|
||||
ldr r4, [r1, #8]
|
||||
eor r11, r5, r10
|
||||
ldrb r7, [r2, #16]
|
||||
and r11, r11, lr
|
||||
uhadd8 r8, r5, r10
|
||||
ldr r5, [r1, #12]
|
||||
uadd8 r8, r8, r11
|
||||
pld [r2, r3]
|
||||
lsr r10, r6, #8
|
||||
usada8 r0, r4, r8, r0
|
||||
orr r10, r10, r7, lsl #24
|
||||
subs r12, r12, #1
|
||||
eor r11, r6, r10
|
||||
add r1, r1, r3
|
||||
uhadd8 r9, r6, r10
|
||||
and r11, r11, lr
|
||||
uadd8 r9, r9, r11
|
||||
add r2, r2, r3
|
||||
usada8 r0, r5, r9, r0
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r11, pc}
|
||||
endfunc
|
||||
|
||||
.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
|
||||
ldr \n0, [r2]
|
||||
eor \n1, \p0, \n0
|
||||
uhadd8 \p0, \p0, \n0
|
||||
and \n1, \n1, lr
|
||||
ldr \n2, [r1]
|
||||
uadd8 \p0, \p0, \n1
|
||||
ldr \n1, [r2, #4]
|
||||
usada8 r0, \p0, \n2, r0
|
||||
pld [r1, r3]
|
||||
eor \n3, \p1, \n1
|
||||
uhadd8 \p1, \p1, \n1
|
||||
and \n3, \n3, lr
|
||||
ldr \p0, [r1, #4]
|
||||
uadd8 \p1, \p1, \n3
|
||||
ldr \n2, [r2, #8]
|
||||
usada8 r0, \p1, \p0, r0
|
||||
pld [r2, r3]
|
||||
eor \p0, \p2, \n2
|
||||
uhadd8 \p2, \p2, \n2
|
||||
and \p0, \p0, lr
|
||||
ldr \p1, [r1, #8]
|
||||
uadd8 \p2, \p2, \p0
|
||||
ldr \n3, [r2, #12]
|
||||
usada8 r0, \p2, \p1, r0
|
||||
eor \p1, \p3, \n3
|
||||
uhadd8 \p3, \p3, \n3
|
||||
and \p1, \p1, lr
|
||||
ldr \p0, [r1, #12]
|
||||
uadd8 \p3, \p3, \p1
|
||||
add r1, r1, r3
|
||||
usada8 r0, \p3, \p0, r0
|
||||
add r2, r2, r3
|
||||
.endm
|
||||
|
||||
function ff_pix_abs16_y2_armv6, export=1
|
||||
pld [r1]
|
||||
pld [r2]
|
||||
ldr r12, [sp]
|
||||
push {r4-r11, lr}
|
||||
mov r0, #0
|
||||
mov lr, #1
|
||||
orr lr, lr, lr, lsl #8
|
||||
orr lr, lr, lr, lsl #16
|
||||
ldr r4, [r2]
|
||||
ldr r5, [r2, #4]
|
||||
ldr r6, [r2, #8]
|
||||
ldr r7, [r2, #12]
|
||||
add r2, r2, r3
|
||||
1:
|
||||
usad_y2 r4, r5, r6, r7, r8, r9, r10, r11
|
||||
subs r12, r12, #2
|
||||
usad_y2 r8, r9, r10, r11, r4, r5, r6, r7
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_pix_abs8_armv6, export=1
|
||||
pld [r2, r3]
|
||||
ldr r12, [sp]
|
||||
push {r4-r9, lr}
|
||||
mov r0, #0
|
||||
mov lr, #0
|
||||
ldrd_post r4, r5, r1, r3
|
||||
1:
|
||||
subs r12, r12, #2
|
||||
ldr r7, [r2, #4]
|
||||
ldr_post r6, r2, r3
|
||||
ldrd_post r8, r9, r1, r3
|
||||
usada8 r0, r4, r6, r0
|
||||
pld [r2, r3]
|
||||
usada8 lr, r5, r7, lr
|
||||
ldr r7, [r2, #4]
|
||||
ldr_post r6, r2, r3
|
||||
beq 2f
|
||||
ldrd_post r4, r5, r1, r3
|
||||
usada8 r0, r8, r6, r0
|
||||
pld [r2, r3]
|
||||
usada8 lr, r9, r7, lr
|
||||
b 1b
|
||||
2:
|
||||
usada8 r0, r8, r6, r0
|
||||
usada8 lr, r9, r7, lr
|
||||
add r0, r0, lr
|
||||
pop {r4-r9, pc}
|
||||
endfunc
|
||||
|
||||
function ff_sse16_armv6, export=1
|
||||
ldr r12, [sp]
|
||||
push {r4-r9, lr}
|
||||
mov r0, #0
|
||||
1:
|
||||
ldrd r4, r5, [r1]
|
||||
ldr r8, [r2]
|
||||
uxtb16 lr, r4
|
||||
uxtb16 r4, r4, ror #8
|
||||
uxtb16 r9, r8
|
||||
uxtb16 r8, r8, ror #8
|
||||
ldr r7, [r2, #4]
|
||||
usub16 lr, lr, r9
|
||||
usub16 r4, r4, r8
|
||||
smlad r0, lr, lr, r0
|
||||
uxtb16 r6, r5
|
||||
uxtb16 lr, r5, ror #8
|
||||
uxtb16 r8, r7
|
||||
uxtb16 r9, r7, ror #8
|
||||
smlad r0, r4, r4, r0
|
||||
ldrd r4, r5, [r1, #8]
|
||||
usub16 r6, r6, r8
|
||||
usub16 r8, lr, r9
|
||||
ldr r7, [r2, #8]
|
||||
smlad r0, r6, r6, r0
|
||||
uxtb16 lr, r4
|
||||
uxtb16 r4, r4, ror #8
|
||||
uxtb16 r9, r7
|
||||
uxtb16 r7, r7, ror #8
|
||||
smlad r0, r8, r8, r0
|
||||
ldr r8, [r2, #12]
|
||||
usub16 lr, lr, r9
|
||||
usub16 r4, r4, r7
|
||||
smlad r0, lr, lr, r0
|
||||
uxtb16 r6, r5
|
||||
uxtb16 r5, r5, ror #8
|
||||
uxtb16 r9, r8
|
||||
uxtb16 r8, r8, ror #8
|
||||
smlad r0, r4, r4, r0
|
||||
usub16 r6, r6, r9
|
||||
usub16 r5, r5, r8
|
||||
smlad r0, r6, r6, r0
|
||||
add r1, r1, r3
|
||||
add r2, r2, r3
|
||||
subs r12, r12, #1
|
||||
smlad r0, r5, r5, r0
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r9, pc}
|
||||
endfunc
|
||||
57
externals/ffmpeg/libavcodec/arm/me_cmp_init_arm.c
vendored
Executable file
57
externals/ffmpeg/libavcodec/arm/me_cmp_init_arm.c
vendored
Executable file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/me_cmp.h"
|
||||
#include "libavcodec/mpegvideo.h"
|
||||
|
||||
int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
|
||||
ptrdiff_t stride, int h);
|
||||
|
||||
int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
|
||||
ptrdiff_t stride, int h);
|
||||
|
||||
int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
|
||||
ptrdiff_t stride, int h);
|
||||
|
||||
av_cold void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_armv6(cpu_flags)) {
|
||||
c->pix_abs[0][0] = ff_pix_abs16_armv6;
|
||||
c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
|
||||
c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
|
||||
|
||||
c->pix_abs[1][0] = ff_pix_abs8_armv6;
|
||||
|
||||
c->sad[0] = ff_pix_abs16_armv6;
|
||||
c->sad[1] = ff_pix_abs8_armv6;
|
||||
|
||||
c->sse[0] = ff_sse16_armv6;
|
||||
}
|
||||
}
|
||||
662
externals/ffmpeg/libavcodec/arm/mlpdsp_armv5te.S
vendored
Executable file
662
externals/ffmpeg/libavcodec/arm/mlpdsp_armv5te.S
vendored
Executable file
@@ -0,0 +1,662 @@
|
||||
/*
|
||||
* Copyright (c) 2014 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
#define MAX_CHANNELS 8
|
||||
#define MAX_FIR_ORDER 8
|
||||
#define MAX_IIR_ORDER 4
|
||||
#define MAX_RATEFACTOR 4
|
||||
#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
|
||||
|
||||
PST .req a1
|
||||
PCO .req a2
|
||||
AC0 .req a3
|
||||
AC1 .req a4
|
||||
CO0 .req v1
|
||||
CO1 .req v2
|
||||
CO2 .req v3
|
||||
CO3 .req v4
|
||||
ST0 .req v5
|
||||
ST1 .req v6
|
||||
ST2 .req sl
|
||||
ST3 .req fp
|
||||
I .req ip
|
||||
PSAMP .req lr
|
||||
|
||||
|
||||
.macro branch_pic_label first, remainder:vararg
|
||||
A .word \first - 4
|
||||
T .hword (\first) / 2
|
||||
.ifnb \remainder
|
||||
branch_pic_label \remainder
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Some macros that do loads/multiplies where the register number is determined
|
||||
// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
|
||||
|
||||
.macro load group, index, base, offset
|
||||
.altmacro
|
||||
load_ \group, %(\index), \base, \offset
|
||||
.noaltmacro
|
||||
.endm
|
||||
|
||||
.macro load_ group, index, base, offset
|
||||
ldr \group\index, [\base, #\offset]
|
||||
.endm
|
||||
|
||||
.macro loadd group, index, base, offset
|
||||
.altmacro
|
||||
loadd_ \group, %(\index), %(\index+1), \base, \offset
|
||||
.noaltmacro
|
||||
.endm
|
||||
|
||||
.macro loadd_ group, index0, index1, base, offset
|
||||
A .if \offset >= 256
|
||||
A ldr \group\index0, [\base, #\offset]
|
||||
A ldr \group\index1, [\base, #(\offset) + 4]
|
||||
A .else
|
||||
ldrd \group\index0, \group\index1, [\base, #\offset]
|
||||
A .endif
|
||||
.endm
|
||||
|
||||
.macro multiply index, accumulate, long
|
||||
.altmacro
|
||||
multiply_ %(\index), \accumulate, \long
|
||||
.noaltmacro
|
||||
.endm
|
||||
|
||||
.macro multiply_ index, accumulate, long
|
||||
.if \long
|
||||
.if \accumulate
|
||||
smlal AC0, AC1, CO\index, ST\index
|
||||
.else
|
||||
smull AC0, AC1, CO\index, ST\index
|
||||
.endif
|
||||
.else
|
||||
.if \accumulate
|
||||
mla AC0, CO\index, ST\index, AC0
|
||||
.else
|
||||
mul AC0, CO\index, ST\index
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// A macro to update the load register number and load offsets
|
||||
|
||||
.macro inc howmany
|
||||
.set LOAD_REG, (LOAD_REG + \howmany) & 3
|
||||
.set OFFSET_CO, OFFSET_CO + 4 * \howmany
|
||||
.set OFFSET_ST, OFFSET_ST + 4 * \howmany
|
||||
.if FIR_REMAIN > 0
|
||||
.set FIR_REMAIN, FIR_REMAIN - \howmany
|
||||
.if FIR_REMAIN == 0
|
||||
.set OFFSET_CO, 4 * MAX_FIR_ORDER
|
||||
.set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
|
||||
.endif
|
||||
.elseif IIR_REMAIN > 0
|
||||
.set IIR_REMAIN, IIR_REMAIN - \howmany
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Macro to implement the inner loop for one specific combination of parameters
|
||||
|
||||
.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
|
||||
.set TOTAL_TAPS, \iir_taps + \fir_taps
|
||||
|
||||
// Deal with register allocation...
|
||||
.set DEFINED_SHIFT, 0
|
||||
.set DEFINED_MASK, 0
|
||||
.set SHUFFLE_SHIFT, 0
|
||||
.set SHUFFLE_MASK, 0
|
||||
.set SPILL_SHIFT, 0
|
||||
.set SPILL_MASK, 0
|
||||
.if TOTAL_TAPS == 0
|
||||
// Little register pressure in this case - just keep MASK where it was
|
||||
.if !\mask_minus1
|
||||
MASK .req ST1
|
||||
.set DEFINED_MASK, 1
|
||||
.endif
|
||||
.else
|
||||
.if \shift_0
|
||||
.if !\mask_minus1
|
||||
// AC1 is unused with shift 0
|
||||
MASK .req AC1
|
||||
.set DEFINED_MASK, 1
|
||||
.set SHUFFLE_MASK, 1
|
||||
.endif
|
||||
.elseif \shift_8
|
||||
.if !\mask_minus1
|
||||
.if TOTAL_TAPS <= 4
|
||||
// All coefficients are preloaded (so pointer not needed)
|
||||
MASK .req PCO
|
||||
.set DEFINED_MASK, 1
|
||||
.set SHUFFLE_MASK, 1
|
||||
.else
|
||||
.set SPILL_MASK, 1
|
||||
.endif
|
||||
.endif
|
||||
.else // shift not 0 or 8
|
||||
.if TOTAL_TAPS <= 3
|
||||
// All coefficients are preloaded, and at least one CO register is unused
|
||||
.if \fir_taps & 1
|
||||
SHIFT .req CO0
|
||||
.set DEFINED_SHIFT, 1
|
||||
.set SHUFFLE_SHIFT, 1
|
||||
.else
|
||||
SHIFT .req CO3
|
||||
.set DEFINED_SHIFT, 1
|
||||
.set SHUFFLE_SHIFT, 1
|
||||
.endif
|
||||
.if !\mask_minus1
|
||||
MASK .req PCO
|
||||
.set DEFINED_MASK, 1
|
||||
.set SHUFFLE_MASK, 1
|
||||
.endif
|
||||
.elseif TOTAL_TAPS == 4
|
||||
// All coefficients are preloaded
|
||||
SHIFT .req PCO
|
||||
.set DEFINED_SHIFT, 1
|
||||
.set SHUFFLE_SHIFT, 1
|
||||
.if !\mask_minus1
|
||||
.set SPILL_MASK, 1
|
||||
.endif
|
||||
.else
|
||||
.set SPILL_SHIFT, 1
|
||||
.if !\mask_minus1
|
||||
.set SPILL_MASK, 1
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.if SPILL_SHIFT
|
||||
SHIFT .req ST0
|
||||
.set DEFINED_SHIFT, 1
|
||||
.endif
|
||||
.if SPILL_MASK
|
||||
MASK .req ST1
|
||||
.set DEFINED_MASK, 1
|
||||
.endif
|
||||
|
||||
// Preload coefficients if possible
|
||||
.if TOTAL_TAPS <= 4
|
||||
.set OFFSET_CO, 0
|
||||
.if \fir_taps & 1
|
||||
.set LOAD_REG, 1
|
||||
.else
|
||||
.set LOAD_REG, 0
|
||||
.endif
|
||||
.rept \fir_taps
|
||||
load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
.set LOAD_REG, (LOAD_REG + 1) & 3
|
||||
.set OFFSET_CO, OFFSET_CO + 4
|
||||
.endr
|
||||
.set OFFSET_CO, 4 * MAX_FIR_ORDER
|
||||
.rept \iir_taps
|
||||
load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
.set LOAD_REG, (LOAD_REG + 1) & 3
|
||||
.set OFFSET_CO, OFFSET_CO + 4
|
||||
.endr
|
||||
.endif
|
||||
|
||||
// Move mask/shift to final positions if necessary
|
||||
// Need to do this after preloading, because in some cases we
|
||||
// reuse the coefficient pointer register
|
||||
.if SHUFFLE_SHIFT
|
||||
mov SHIFT, ST0
|
||||
.endif
|
||||
.if SHUFFLE_MASK
|
||||
mov MASK, ST1
|
||||
.endif
|
||||
|
||||
// Begin loop
|
||||
01:
|
||||
.if TOTAL_TAPS == 0
|
||||
// Things simplify a lot in this case
|
||||
// In fact this could be pipelined further if it's worth it...
|
||||
ldr ST0, [PSAMP]
|
||||
subs I, I, #1
|
||||
.if !\mask_minus1
|
||||
and ST0, ST0, MASK
|
||||
.endif
|
||||
str ST0, [PST, #-4]!
|
||||
str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
|
||||
str ST0, [PSAMP], #4 * MAX_CHANNELS
|
||||
bne 01b
|
||||
.else
|
||||
.if \fir_taps & 1
|
||||
.set LOAD_REG, 1
|
||||
.else
|
||||
.set LOAD_REG, 0
|
||||
.endif
|
||||
.set LOAD_BANK, 0
|
||||
.set FIR_REMAIN, \fir_taps
|
||||
.set IIR_REMAIN, \iir_taps
|
||||
.if FIR_REMAIN == 0 // only IIR terms
|
||||
.set OFFSET_CO, 4 * MAX_FIR_ORDER
|
||||
.set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
|
||||
.else
|
||||
.set OFFSET_CO, 0
|
||||
.set OFFSET_ST, 0
|
||||
.endif
|
||||
.set MUL_REG, LOAD_REG
|
||||
.set COUNTER, 0
|
||||
.rept TOTAL_TAPS + 2
|
||||
// Do load(s)
|
||||
.if FIR_REMAIN != 0 || IIR_REMAIN != 0
|
||||
.if COUNTER == 0
|
||||
.if TOTAL_TAPS > 4
|
||||
load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
.endif
|
||||
load ST, LOAD_REG, PST, OFFSET_ST
|
||||
inc 1
|
||||
.elseif COUNTER == 1 && (\fir_taps & 1) == 0
|
||||
.if TOTAL_TAPS > 4
|
||||
load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
.endif
|
||||
load ST, LOAD_REG, PST, OFFSET_ST
|
||||
inc 1
|
||||
.elseif LOAD_BANK == 0
|
||||
.if TOTAL_TAPS > 4
|
||||
.if FIR_REMAIN == 0 && IIR_REMAIN == 1
|
||||
load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
.else
|
||||
loadd CO, LOAD_REG, PCO, OFFSET_CO
|
||||
.endif
|
||||
.endif
|
||||
.set LOAD_BANK, 1
|
||||
.else
|
||||
.if FIR_REMAIN == 0 && IIR_REMAIN == 1
|
||||
load ST, LOAD_REG, PST, OFFSET_ST
|
||||
inc 1
|
||||
.else
|
||||
loadd ST, LOAD_REG, PST, OFFSET_ST
|
||||
inc 2
|
||||
.endif
|
||||
.set LOAD_BANK, 0
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// Do interleaved multiplies, slightly delayed
|
||||
.if COUNTER >= 2
|
||||
multiply MUL_REG, COUNTER > 2, !\shift_0
|
||||
.set MUL_REG, (MUL_REG + 1) & 3
|
||||
.endif
|
||||
.set COUNTER, COUNTER + 1
|
||||
.endr
|
||||
|
||||
// Post-process the result of the multiplies
|
||||
.if SPILL_SHIFT
|
||||
ldr SHIFT, [sp, #9*4 + 0*4]
|
||||
.endif
|
||||
.if SPILL_MASK
|
||||
ldr MASK, [sp, #9*4 + 1*4]
|
||||
.endif
|
||||
ldr ST2, [PSAMP]
|
||||
subs I, I, #1
|
||||
.if \shift_8
|
||||
mov AC0, AC0, lsr #8
|
||||
orr AC0, AC0, AC1, lsl #24
|
||||
.elseif !\shift_0
|
||||
rsb ST3, SHIFT, #32
|
||||
mov AC0, AC0, lsr SHIFT
|
||||
A orr AC0, AC0, AC1, lsl ST3
|
||||
T mov AC1, AC1, lsl ST3
|
||||
T orr AC0, AC0, AC1
|
||||
.endif
|
||||
.if \mask_minus1
|
||||
add ST3, ST2, AC0
|
||||
.else
|
||||
add ST2, ST2, AC0
|
||||
and ST3, ST2, MASK
|
||||
sub ST2, ST3, AC0
|
||||
.endif
|
||||
str ST3, [PST, #-4]!
|
||||
str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
|
||||
str ST3, [PSAMP], #4 * MAX_CHANNELS
|
||||
bne 01b
|
||||
.endif
|
||||
b 99f
|
||||
|
||||
.if DEFINED_SHIFT
|
||||
.unreq SHIFT
|
||||
.endif
|
||||
.if DEFINED_MASK
|
||||
.unreq MASK
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
|
||||
A ldr CO0, [pc, a3, lsl #2] // firorder is in range 0-(8-iir_taps)
|
||||
A add pc, pc, CO0
|
||||
T tbh [pc, a3, lsl #1]
|
||||
0:
|
||||
branch_pic_label (70f - 0b), (71f - 0b), (72f - 0b), (73f - 0b)
|
||||
branch_pic_label (74f - 0b)
|
||||
.if \iir_taps <= 3
|
||||
branch_pic_label (75f - 0b)
|
||||
.if \iir_taps <= 2
|
||||
branch_pic_label (76f - 0b)
|
||||
.if \iir_taps <= 1
|
||||
branch_pic_label (77f - 0b)
|
||||
.if \iir_taps == 0
|
||||
branch_pic_label (78f - 0b)
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
|
||||
71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
|
||||
72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
|
||||
73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
|
||||
74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
|
||||
.if \iir_taps <= 3
|
||||
75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
|
||||
.if \iir_taps <= 2
|
||||
76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
|
||||
.if \iir_taps <= 1
|
||||
77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
|
||||
.if \iir_taps == 0
|
||||
78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
|
||||
A ldr CO0, [pc, a4, lsl #2] // irorder is in range 0-4
|
||||
A add pc, pc, CO0
|
||||
T tbh [pc, a4, lsl #1]
|
||||
0:
|
||||
branch_pic_label (60f - 0b), (61f - 0b), (62f - 0b), (63f - 0b)
|
||||
branch_pic_label (64f - 0b)
|
||||
60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
|
||||
61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
|
||||
62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
|
||||
63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
|
||||
64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
|
||||
.endm
|
||||
|
||||
/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
|
||||
* int firorder, int iirorder,
|
||||
* unsigned int filter_shift, int32_t mask,
|
||||
* int blocksize, int32_t *sample_buffer);
|
||||
*/
|
||||
function ff_mlp_filter_channel_arm, export=1
|
||||
push {v1-fp,lr}
|
||||
add v1, sp, #9*4 // point at arguments on stack
|
||||
ldm v1, {ST0,ST1,I,PSAMP}
|
||||
cmp ST1, #-1
|
||||
bne 30f
|
||||
movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
|
||||
bne 20f
|
||||
bcs 10f
|
||||
switch_on_iir_taps 1, 1, 0
|
||||
10: switch_on_iir_taps 1, 0, 1
|
||||
20: switch_on_iir_taps 1, 0, 0
|
||||
30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
|
||||
bne 50f
|
||||
bcs 40f
|
||||
switch_on_iir_taps 0, 1, 0
|
||||
40: switch_on_iir_taps 0, 0, 1
|
||||
50: switch_on_iir_taps 0, 0, 0
|
||||
99: pop {v1-fp,pc}
|
||||
endfunc
|
||||
|
||||
.unreq PST
|
||||
.unreq PCO
|
||||
.unreq AC0
|
||||
.unreq AC1
|
||||
.unreq CO0
|
||||
.unreq CO1
|
||||
.unreq CO2
|
||||
.unreq CO3
|
||||
.unreq ST0
|
||||
.unreq ST1
|
||||
.unreq ST2
|
||||
.unreq ST3
|
||||
.unreq I
|
||||
.unreq PSAMP
|
||||
|
||||
/********************************************************************/
|
||||
|
||||
PSA .req a1 // samples
|
||||
PCO .req a2 // coeffs
|
||||
PBL .req a3 // bypassed_lsbs
|
||||
INDEX .req a4
|
||||
CO0 .req v1
|
||||
CO1 .req v2
|
||||
CO2 .req v3
|
||||
CO3 .req v4
|
||||
SA0 .req v5
|
||||
SA1 .req v6
|
||||
SA2 .req sl
|
||||
SA3 .req fp
|
||||
AC0 .req ip
|
||||
AC1 .req lr
|
||||
NOISE .req SA0
|
||||
LSB .req SA1
|
||||
DCH .req SA2 // dest_ch
|
||||
MASK .req SA3
|
||||
|
||||
// INDEX is used as follows:
|
||||
// bits 0..6 index2 (values up to 17, but wider so that we can
|
||||
// add to index field without needing to mask)
|
||||
// bits 7..14 i (values up to 160)
|
||||
// bit 15 underflow detect for i
|
||||
// bits 25..31 (if access_unit_size_pow2 == 128) \ index
|
||||
// bits 26..31 (if access_unit_size_pow2 == 64) /
|
||||
|
||||
.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
|
||||
.if \maxchan == 1
|
||||
// We can just leave the coefficients in registers in this case
|
||||
ldrd CO0, CO1, [PCO]
|
||||
.endif
|
||||
1:
|
||||
.if \maxchan == 1
|
||||
ldrd SA0, SA1, [PSA]
|
||||
smull AC0, AC1, CO0, SA0
|
||||
.elseif \maxchan == 5
|
||||
ldr CO0, [PCO, #0]
|
||||
ldr SA0, [PSA, #0]
|
||||
ldr CO1, [PCO, #4]
|
||||
ldr SA1, [PSA, #4]
|
||||
ldrd CO2, CO3, [PCO, #8]
|
||||
smull AC0, AC1, CO0, SA0
|
||||
ldrd SA2, SA3, [PSA, #8]
|
||||
smlal AC0, AC1, CO1, SA1
|
||||
ldrd CO0, CO1, [PCO, #16]
|
||||
smlal AC0, AC1, CO2, SA2
|
||||
ldrd SA0, SA1, [PSA, #16]
|
||||
smlal AC0, AC1, CO3, SA3
|
||||
smlal AC0, AC1, CO0, SA0
|
||||
.else // \maxchan == 7
|
||||
ldr CO2, [PCO, #0]
|
||||
ldr SA2, [PSA, #0]
|
||||
ldr CO3, [PCO, #4]
|
||||
ldr SA3, [PSA, #4]
|
||||
ldrd CO0, CO1, [PCO, #8]
|
||||
smull AC0, AC1, CO2, SA2
|
||||
ldrd SA0, SA1, [PSA, #8]
|
||||
smlal AC0, AC1, CO3, SA3
|
||||
ldrd CO2, CO3, [PCO, #16]
|
||||
smlal AC0, AC1, CO0, SA0
|
||||
ldrd SA2, SA3, [PSA, #16]
|
||||
smlal AC0, AC1, CO1, SA1
|
||||
ldrd CO0, CO1, [PCO, #24]
|
||||
smlal AC0, AC1, CO2, SA2
|
||||
ldrd SA0, SA1, [PSA, #24]
|
||||
smlal AC0, AC1, CO3, SA3
|
||||
smlal AC0, AC1, CO0, SA0
|
||||
.endif
|
||||
ldm sp, {NOISE, DCH, MASK}
|
||||
smlal AC0, AC1, CO1, SA1
|
||||
.if \shift != 0
|
||||
.if \index_mask == 63
|
||||
add NOISE, NOISE, INDEX, lsr #32-6
|
||||
ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
ldrsb NOISE, [NOISE]
|
||||
add INDEX, INDEX, INDEX, lsl #32-6
|
||||
.else // \index_mask == 127
|
||||
add NOISE, NOISE, INDEX, lsr #32-7
|
||||
ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
ldrsb NOISE, [NOISE]
|
||||
add INDEX, INDEX, INDEX, lsl #32-7
|
||||
.endif
|
||||
sub INDEX, INDEX, #1<<7
|
||||
adds AC0, AC0, NOISE, lsl #\shift + 7
|
||||
adc AC1, AC1, NOISE, asr #31
|
||||
.else
|
||||
ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
sub INDEX, INDEX, #1<<7
|
||||
.endif
|
||||
add PSA, PSA, #MAX_CHANNELS*4
|
||||
mov AC0, AC0, lsr #14
|
||||
orr AC0, AC0, AC1, lsl #18
|
||||
.if !\mask_minus1
|
||||
and AC0, AC0, MASK
|
||||
.endif
|
||||
add AC0, AC0, LSB
|
||||
tst INDEX, #1<<15
|
||||
str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
|
||||
beq 1b
|
||||
b 98f
|
||||
.endm
|
||||
|
||||
.macro switch_on_maxchan shift, index_mask, mask_minus1
|
||||
cmp v4, #5
|
||||
blo 51f
|
||||
beq 50f
|
||||
implement_rematrix \shift, \index_mask, \mask_minus1, 7
|
||||
50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
|
||||
51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
|
||||
.endm
|
||||
|
||||
.macro switch_on_mask shift, index_mask
|
||||
cmp sl, #-1
|
||||
bne 40f
|
||||
switch_on_maxchan \shift, \index_mask, 1
|
||||
40: switch_on_maxchan \shift, \index_mask, 0
|
||||
.endm
|
||||
|
||||
.macro switch_on_au_size shift
|
||||
.if \shift == 0
|
||||
switch_on_mask \shift, undefined
|
||||
.else
|
||||
teq v6, #64
|
||||
bne 30f
|
||||
orr INDEX, INDEX, v1, lsl #32-6
|
||||
switch_on_mask \shift, 63
|
||||
30: orr INDEX, INDEX, v1, lsl #32-7
|
||||
switch_on_mask \shift, 127
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
|
||||
* const int32_t *coeffs,
|
||||
* const uint8_t *bypassed_lsbs,
|
||||
* const int8_t *noise_buffer,
|
||||
* int index,
|
||||
* unsigned int dest_ch,
|
||||
* uint16_t blockpos,
|
||||
* unsigned int maxchan,
|
||||
* int matrix_noise_shift,
|
||||
* int access_unit_size_pow2,
|
||||
* int32_t mask);
|
||||
*/
|
||||
function ff_mlp_rematrix_channel_arm, export=1
|
||||
push {v1-fp,lr}
|
||||
add v1, sp, #9*4 // point at arguments on stack
|
||||
ldm v1, {v1-sl}
|
||||
teq v4, #1
|
||||
itt ne
|
||||
teqne v4, #5
|
||||
teqne v4, #7
|
||||
bne 99f
|
||||
teq v6, #64
|
||||
it ne
|
||||
teqne v6, #128
|
||||
bne 99f
|
||||
sub v2, v2, #MAX_CHANNELS
|
||||
push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
|
||||
movs INDEX, v3, lsl #7
|
||||
beq 98f // just in case, do nothing if blockpos = 0
|
||||
subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
|
||||
adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
|
||||
orr INDEX, INDEX, lr
|
||||
// Switch on matrix_noise_shift: values 0 and 1 are
|
||||
// disproportionately common so do those in a form the branch
|
||||
// predictor can accelerate. Values can only go up to 15.
|
||||
cmp v5, #1
|
||||
beq 11f
|
||||
blo 10f
|
||||
A ldr v5, [pc, v5, lsl #2]
|
||||
A add pc, pc, v5
|
||||
T tbh [pc, v5, lsl #1]
|
||||
0:
|
||||
branch_pic_label 0, 0, (12f - 0b), (13f - 0b)
|
||||
branch_pic_label (14f - 0b), (15f - 0b), (16f - 0b), (17f - 0b)
|
||||
branch_pic_label (18f - 0b), (19f - 0b), (20f - 0b), (21f - 0b)
|
||||
branch_pic_label (22f - 0b), (23f - 0b), (24f - 0b), (25f - 0b)
|
||||
10: switch_on_au_size 0
|
||||
11: switch_on_au_size 1
|
||||
12: switch_on_au_size 2
|
||||
13: switch_on_au_size 3
|
||||
14: switch_on_au_size 4
|
||||
15: switch_on_au_size 5
|
||||
16: switch_on_au_size 6
|
||||
17: switch_on_au_size 7
|
||||
18: switch_on_au_size 8
|
||||
19: switch_on_au_size 9
|
||||
20: switch_on_au_size 10
|
||||
21: switch_on_au_size 11
|
||||
22: switch_on_au_size 12
|
||||
23: switch_on_au_size 13
|
||||
24: switch_on_au_size 14
|
||||
25: switch_on_au_size 15
|
||||
|
||||
98: add sp, sp, #3*4
|
||||
pop {v1-fp,pc}
|
||||
99: // Can't handle these parameters, drop back to C
|
||||
pop {v1-fp,lr}
|
||||
b X(ff_mlp_rematrix_channel)
|
||||
endfunc
|
||||
|
||||
.unreq PSA
|
||||
.unreq PCO
|
||||
.unreq PBL
|
||||
.unreq INDEX
|
||||
.unreq CO0
|
||||
.unreq CO1
|
||||
.unreq CO2
|
||||
.unreq CO3
|
||||
.unreq SA0
|
||||
.unreq SA1
|
||||
.unreq SA2
|
||||
.unreq SA3
|
||||
.unreq AC0
|
||||
.unreq AC1
|
||||
.unreq NOISE
|
||||
.unreq LSB
|
||||
.unreq DCH
|
||||
.unreq MASK
|
||||
533
externals/ffmpeg/libavcodec/arm/mlpdsp_armv6.S
vendored
Executable file
533
externals/ffmpeg/libavcodec/arm/mlpdsp_armv6.S
vendored
Executable file
@@ -0,0 +1,533 @@
|
||||
/*
|
||||
* Copyright (c) 2014 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
.macro loadregoffsh2 group, index, base, offgroup, offindex
|
||||
.altmacro
|
||||
loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex)
|
||||
.noaltmacro
|
||||
.endm
|
||||
|
||||
.macro loadregoffsh2_ group, index, base, offgroup, offindex
|
||||
ldr \group\index, [\base, \offgroup\offindex, lsl #2]
|
||||
.endm
|
||||
|
||||
.macro eorlslreg check, data, group, index
|
||||
.altmacro
|
||||
eorlslreg_ \check, \data, \group, %(\index)
|
||||
.noaltmacro
|
||||
.endm
|
||||
|
||||
.macro eorlslreg_ check, data, group, index
|
||||
eor \check, \check, \data, lsl \group\index
|
||||
.endm
|
||||
|
||||
.macro decr_modulo var, by, modulus
|
||||
.set \var, \var - \by
|
||||
.if \var == 0
|
||||
.set \var, \modulus
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0
|
||||
.if \size == 2
|
||||
ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4
|
||||
.else // size == 4
|
||||
.if IDX1 > 4 || \channels==8
|
||||
ldm IN!, {\r0, \r1, \r2, \r3}
|
||||
.else
|
||||
ldm IN, {\r0, \r1, \r2, \r3}
|
||||
.if !\pointer_dead
|
||||
add IN, IN, #(4 + 8 - \channels) * 4
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
decr_modulo IDX1, \size, \channels
|
||||
.endm
|
||||
|
||||
.macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0
|
||||
.if \size == 2
|
||||
.if IDX1 > 2
|
||||
ldm IN!, {\r2, \r3}
|
||||
.else
|
||||
//A .ifc \r2, ip
|
||||
//A .if \pointer_dead
|
||||
//A ldm IN, {\r2, \r3}
|
||||
//A .else
|
||||
//A ldr \r2, [IN], #4
|
||||
//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4
|
||||
//A .endif
|
||||
//A .else
|
||||
ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4
|
||||
//A .endif
|
||||
.endif
|
||||
.endif
|
||||
decr_modulo IDX1, \size, \channels
|
||||
.endm
|
||||
|
||||
.macro implement_pack inorder, channels, shift
|
||||
.if \inorder
|
||||
.ifc \shift, mixed
|
||||
|
||||
CHECK .req a1
|
||||
COUNT .req a2
|
||||
IN .req a3
|
||||
OUT .req a4
|
||||
DAT0 .req v1
|
||||
DAT1 .req v2
|
||||
DAT2 .req v3
|
||||
DAT3 .req v4
|
||||
SHIFT0 .req v5
|
||||
SHIFT1 .req v6
|
||||
SHIFT2 .req sl
|
||||
SHIFT3 .req fp
|
||||
SHIFT4 .req ip
|
||||
SHIFT5 .req lr
|
||||
|
||||
.macro output4words
|
||||
.set SIZE_GROUP1, IDX1
|
||||
.if SIZE_GROUP1 > 4
|
||||
.set SIZE_GROUP1, 4
|
||||
.endif
|
||||
.set SIZE_GROUP2, 4 - SIZE_GROUP1
|
||||
load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
|
||||
load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
|
||||
.if \channels == 2
|
||||
lsl DAT0, SHIFT0
|
||||
lsl DAT1, SHIFT1
|
||||
lsl DAT2, SHIFT0
|
||||
lsl DAT3, SHIFT1
|
||||
.elseif \channels == 6
|
||||
.if IDX2 == 6
|
||||
lsl DAT0, SHIFT0
|
||||
lsl DAT1, SHIFT1
|
||||
lsl DAT2, SHIFT2
|
||||
lsl DAT3, SHIFT3
|
||||
.elseif IDX2 == 2
|
||||
lsl DAT0, SHIFT4
|
||||
lsl DAT1, SHIFT5
|
||||
lsl DAT2, SHIFT0
|
||||
lsl DAT3, SHIFT1
|
||||
.else // IDX2 == 4
|
||||
lsl DAT0, SHIFT2
|
||||
lsl DAT1, SHIFT3
|
||||
lsl DAT2, SHIFT4
|
||||
lsl DAT3, SHIFT5
|
||||
.endif
|
||||
.elseif \channels == 8
|
||||
.if IDX2 == 8
|
||||
uxtb SHIFT0, SHIFT4, ror #0
|
||||
uxtb SHIFT1, SHIFT4, ror #8
|
||||
uxtb SHIFT2, SHIFT4, ror #16
|
||||
uxtb SHIFT3, SHIFT4, ror #24
|
||||
.else
|
||||
uxtb SHIFT0, SHIFT5, ror #0
|
||||
uxtb SHIFT1, SHIFT5, ror #8
|
||||
uxtb SHIFT2, SHIFT5, ror #16
|
||||
uxtb SHIFT3, SHIFT5, ror #24
|
||||
.endif
|
||||
lsl DAT0, SHIFT0
|
||||
lsl DAT1, SHIFT1
|
||||
lsl DAT2, SHIFT2
|
||||
lsl DAT3, SHIFT3
|
||||
.endif
|
||||
eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2)
|
||||
eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2)
|
||||
decr_modulo IDX2, 2, \channels
|
||||
eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
|
||||
eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
|
||||
decr_modulo IDX2, 2, \channels
|
||||
stm OUT!, {DAT0 - DAT3}
|
||||
.endm
|
||||
|
||||
.set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
|
||||
.if (WORDS_PER_LOOP % 2) == 0
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
.endif
|
||||
.if (WORDS_PER_LOOP % 2) == 0
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
.endif
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
|
||||
.set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
|
||||
|
||||
function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
|
||||
.if SAMPLES_PER_LOOP > 1
|
||||
tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
|
||||
it ne
|
||||
bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
|
||||
.endif
|
||||
teq COUNT, #0
|
||||
it eq
|
||||
bxeq lr
|
||||
push {v1-v6,sl,fp,lr}
|
||||
ldr SHIFT0, [sp, #(9+1)*4] // get output_shift from stack
|
||||
ldr SHIFT1, =0x08080808
|
||||
ldr SHIFT4, [SHIFT0]
|
||||
.if \channels == 2
|
||||
uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
|
||||
uxtb SHIFT0, SHIFT4, ror #0
|
||||
uxtb SHIFT1, SHIFT4, ror #8
|
||||
.else
|
||||
ldr SHIFT5, [SHIFT0, #4]
|
||||
uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
|
||||
uadd8 SHIFT5, SHIFT5, SHIFT1
|
||||
.if \channels == 6
|
||||
uxtb SHIFT0, SHIFT4, ror #0
|
||||
uxtb SHIFT1, SHIFT4, ror #8
|
||||
uxtb SHIFT2, SHIFT4, ror #16
|
||||
uxtb SHIFT3, SHIFT4, ror #24
|
||||
uxtb SHIFT4, SHIFT5, ror #0
|
||||
uxtb SHIFT5, SHIFT5, ror #8
|
||||
.endif
|
||||
.endif
|
||||
.set IDX1, \channels
|
||||
.set IDX2, \channels
|
||||
0:
|
||||
.rept WORDS_PER_LOOP / 4
|
||||
output4words
|
||||
.endr
|
||||
subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
bne 0b
|
||||
pop {v1-v6,sl,fp,pc}
|
||||
.ltorg
|
||||
endfunc
|
||||
.purgem output4words
|
||||
|
||||
.unreq CHECK
|
||||
.unreq COUNT
|
||||
.unreq IN
|
||||
.unreq OUT
|
||||
.unreq DAT0
|
||||
.unreq DAT1
|
||||
.unreq DAT2
|
||||
.unreq DAT3
|
||||
.unreq SHIFT0
|
||||
.unreq SHIFT1
|
||||
.unreq SHIFT2
|
||||
.unreq SHIFT3
|
||||
.unreq SHIFT4
|
||||
.unreq SHIFT5
|
||||
|
||||
.else // not mixed
|
||||
|
||||
CHECK .req a1
|
||||
COUNT .req a2
|
||||
IN .req a3
|
||||
OUT .req a4
|
||||
DAT0 .req v1
|
||||
DAT1 .req v2
|
||||
DAT2 .req v3
|
||||
DAT3 .req v4
|
||||
DAT4 .req v5
|
||||
DAT5 .req v6
|
||||
DAT6 .req sl // use these rather than the otherwise unused
|
||||
DAT7 .req fp // ip and lr so that we can load them using LDRD
|
||||
|
||||
.macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0
|
||||
.if \head
|
||||
.set SIZE_GROUP1, IDX1
|
||||
.if SIZE_GROUP1 > 4
|
||||
.set SIZE_GROUP1, 4
|
||||
.endif
|
||||
.set SIZE_GROUP2, 4 - SIZE_GROUP1
|
||||
load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
|
||||
.endif
|
||||
.if \tail
|
||||
eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2)
|
||||
eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
|
||||
decr_modulo IDX2, 2, \channels
|
||||
.endif
|
||||
.if \head
|
||||
load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
|
||||
.endif
|
||||
.if \tail
|
||||
eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
|
||||
eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
|
||||
decr_modulo IDX2, 2, \channels
|
||||
stm OUT!, {\r4, \r5, \r6, \r7}
|
||||
.endif
|
||||
.if \head
|
||||
lsl \r0, #8 + \shift
|
||||
lsl \r1, #8 + \shift
|
||||
lsl \r2, #8 + \shift
|
||||
lsl \r3, #8 + \shift
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8)
|
||||
.if (WORDS_PER_LOOP % 2) == 0
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
.endif
|
||||
.if (WORDS_PER_LOOP % 2) == 0
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
.endif
|
||||
.if (WORDS_PER_LOOP % 2) == 0
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
.endif
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
|
||||
.set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
|
||||
|
||||
function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1
|
||||
.if SAMPLES_PER_LOOP > 1
|
||||
tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
|
||||
it ne
|
||||
bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
|
||||
.endif
|
||||
subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
it lo
|
||||
bxlo lr
|
||||
push {v1-v6,sl,fp,lr}
|
||||
.set IDX1, \channels
|
||||
.set IDX2, \channels
|
||||
output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
0: beq 1f
|
||||
.rept WORDS_PER_LOOP / 8
|
||||
output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
|
||||
output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
.endr
|
||||
subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
bne 0b
|
||||
1:
|
||||
.rept WORDS_PER_LOOP / 8 - 1
|
||||
output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
|
||||
output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
.endr
|
||||
output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1
|
||||
output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
pop {v1-v6,sl,fp,pc}
|
||||
endfunc
|
||||
.purgem output4words
|
||||
|
||||
.unreq CHECK
|
||||
.unreq COUNT
|
||||
.unreq IN
|
||||
.unreq OUT
|
||||
.unreq DAT0
|
||||
.unreq DAT1
|
||||
.unreq DAT2
|
||||
.unreq DAT3
|
||||
.unreq DAT4
|
||||
.unreq DAT5
|
||||
.unreq DAT6
|
||||
.unreq DAT7
|
||||
|
||||
.endif // mixed
|
||||
.else // not inorder
|
||||
.ifc \shift, mixed
|
||||
|
||||
// This case not currently handled
|
||||
|
||||
.else // not mixed
|
||||
|
||||
#if !CONFIG_THUMB
|
||||
|
||||
CHECK .req a1
|
||||
COUNT .req a2
|
||||
IN .req a3
|
||||
OUT .req a4
|
||||
DAT0 .req v1
|
||||
DAT1 .req v2
|
||||
DAT2 .req v3
|
||||
DAT3 .req v4
|
||||
CHAN0 .req v5
|
||||
CHAN1 .req v6
|
||||
CHAN2 .req sl
|
||||
CHAN3 .req fp
|
||||
CHAN4 .req ip
|
||||
CHAN5 .req lr
|
||||
|
||||
.macro output4words
|
||||
.if \channels == 8
|
||||
.if IDX1 == 8
|
||||
uxtb CHAN0, CHAN4, ror #0
|
||||
uxtb CHAN1, CHAN4, ror #8
|
||||
uxtb CHAN2, CHAN4, ror #16
|
||||
uxtb CHAN3, CHAN4, ror #24
|
||||
.else
|
||||
uxtb CHAN0, CHAN5, ror #0
|
||||
uxtb CHAN1, CHAN5, ror #8
|
||||
uxtb CHAN2, CHAN5, ror #16
|
||||
uxtb CHAN3, CHAN5, ror #24
|
||||
.endif
|
||||
ldr DAT0, [IN, CHAN0, lsl #2]
|
||||
ldr DAT1, [IN, CHAN1, lsl #2]
|
||||
ldr DAT2, [IN, CHAN2, lsl #2]
|
||||
ldr DAT3, [IN, CHAN3, lsl #2]
|
||||
.if IDX1 == 4
|
||||
add IN, IN, #8*4
|
||||
.endif
|
||||
decr_modulo IDX1, 4, \channels
|
||||
.else
|
||||
.set SIZE_GROUP1, IDX1
|
||||
.if SIZE_GROUP1 > 4
|
||||
.set SIZE_GROUP1, 4
|
||||
.endif
|
||||
.set SIZE_GROUP2, 4 - SIZE_GROUP1
|
||||
.if SIZE_GROUP1 == 2
|
||||
loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
|
||||
loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
|
||||
add IN, IN, #8*4
|
||||
.else // SIZE_GROUP1 == 4
|
||||
loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
|
||||
loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
|
||||
loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
|
||||
loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
|
||||
.if IDX1 == 4
|
||||
add IN, IN, #8*4
|
||||
.endif
|
||||
.endif
|
||||
decr_modulo IDX1, SIZE_GROUP1, \channels
|
||||
.if SIZE_GROUP2 == 2
|
||||
loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
|
||||
loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
|
||||
.if IDX1 == 2
|
||||
add IN, IN, #8*4
|
||||
.endif
|
||||
.endif
|
||||
decr_modulo IDX1, SIZE_GROUP2, \channels
|
||||
.endif
|
||||
.if \channels == 8 // in this case we can corrupt CHAN0-3
|
||||
rsb CHAN0, CHAN0, #8
|
||||
rsb CHAN1, CHAN1, #8
|
||||
rsb CHAN2, CHAN2, #8
|
||||
rsb CHAN3, CHAN3, #8
|
||||
lsl DAT0, #8 + \shift
|
||||
lsl DAT1, #8 + \shift
|
||||
lsl DAT2, #8 + \shift
|
||||
lsl DAT3, #8 + \shift
|
||||
eor CHECK, CHECK, DAT0, lsr CHAN0
|
||||
eor CHECK, CHECK, DAT1, lsr CHAN1
|
||||
eor CHECK, CHECK, DAT2, lsr CHAN2
|
||||
eor CHECK, CHECK, DAT3, lsr CHAN3
|
||||
.else
|
||||
.if \shift != 0
|
||||
lsl DAT0, #\shift
|
||||
lsl DAT1, #\shift
|
||||
lsl DAT2, #\shift
|
||||
lsl DAT3, #\shift
|
||||
.endif
|
||||
bic DAT0, DAT0, #0xff000000
|
||||
bic DAT1, DAT1, #0xff000000
|
||||
bic DAT2, DAT2, #0xff000000
|
||||
bic DAT3, DAT3, #0xff000000
|
||||
eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
|
||||
eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
|
||||
decr_modulo IDX2, 2, \channels
|
||||
eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
|
||||
eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
|
||||
decr_modulo IDX2, 2, \channels
|
||||
lsl DAT0, #8
|
||||
lsl DAT1, #8
|
||||
lsl DAT2, #8
|
||||
lsl DAT3, #8
|
||||
.endif
|
||||
stm OUT!, {DAT0 - DAT3}
|
||||
.endm
|
||||
|
||||
.set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
|
||||
.if (WORDS_PER_LOOP % 2) == 0
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
.endif
|
||||
.if (WORDS_PER_LOOP % 2) == 0
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
.endif
|
||||
.set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
|
||||
.set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
|
||||
|
||||
function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1
|
||||
.if SAMPLES_PER_LOOP > 1
|
||||
tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
|
||||
it ne
|
||||
bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
|
||||
.endif
|
||||
teq COUNT, #0
|
||||
it eq
|
||||
bxeq lr
|
||||
push {v1-v6,sl,fp,lr}
|
||||
ldr CHAN0, [sp, #(9+0)*4] // get ch_assign from stack
|
||||
ldr CHAN4, [CHAN0]
|
||||
.if \channels == 2
|
||||
uxtb CHAN0, CHAN4, ror #0
|
||||
uxtb CHAN1, CHAN4, ror #8
|
||||
.else
|
||||
ldr CHAN5, [CHAN0, #4]
|
||||
.if \channels == 6
|
||||
uxtb CHAN0, CHAN4, ror #0
|
||||
uxtb CHAN1, CHAN4, ror #8
|
||||
uxtb CHAN2, CHAN4, ror #16
|
||||
uxtb CHAN3, CHAN4, ror #24
|
||||
uxtb CHAN4, CHAN5, ror #0
|
||||
uxtb CHAN5, CHAN5, ror #8
|
||||
.endif
|
||||
.endif
|
||||
.set IDX1, \channels
|
||||
.set IDX2, \channels
|
||||
0:
|
||||
.rept WORDS_PER_LOOP / 4
|
||||
output4words
|
||||
.endr
|
||||
subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
bne 0b
|
||||
pop {v1-v6,sl,fp,pc}
|
||||
.ltorg
|
||||
endfunc
|
||||
.purgem output4words
|
||||
|
||||
.unreq CHECK
|
||||
.unreq COUNT
|
||||
.unreq IN
|
||||
.unreq OUT
|
||||
.unreq DAT0
|
||||
.unreq DAT1
|
||||
.unreq DAT2
|
||||
.unreq DAT3
|
||||
.unreq CHAN0
|
||||
.unreq CHAN1
|
||||
.unreq CHAN2
|
||||
.unreq CHAN3
|
||||
.unreq CHAN4
|
||||
.unreq CHAN5
|
||||
|
||||
#endif // !CONFIG_THUMB
|
||||
|
||||
.endif // mixed
|
||||
.endif // inorder
|
||||
.endm // implement_pack
|
||||
|
||||
.macro pack_channels inorder, channels
|
||||
implement_pack \inorder, \channels, 0
|
||||
implement_pack \inorder, \channels, 1
|
||||
implement_pack \inorder, \channels, 2
|
||||
implement_pack \inorder, \channels, 3
|
||||
implement_pack \inorder, \channels, 4
|
||||
implement_pack \inorder, \channels, 5
|
||||
implement_pack \inorder, \channels, mixed
|
||||
.endm
|
||||
|
||||
.macro pack_order inorder
|
||||
pack_channels \inorder, 2
|
||||
pack_channels \inorder, 6
|
||||
pack_channels \inorder, 8
|
||||
.endm
|
||||
|
||||
pack_order 0
|
||||
pack_order 1
|
||||
146
externals/ffmpeg/libavcodec/arm/mlpdsp_init_arm.c
vendored
Executable file
146
externals/ffmpeg/libavcodec/arm/mlpdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2014 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/mlpdsp.h"
|
||||
|
||||
void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
|
||||
int firorder, int iirorder,
|
||||
unsigned int filter_shift, int32_t mask,
|
||||
int blocksize, int32_t *sample_buffer);
|
||||
void ff_mlp_rematrix_channel_arm(int32_t *samples,
|
||||
const int32_t *coeffs,
|
||||
const uint8_t *bypassed_lsbs,
|
||||
const int8_t *noise_buffer,
|
||||
int index,
|
||||
unsigned int dest_ch,
|
||||
uint16_t blockpos,
|
||||
unsigned int maxchan,
|
||||
int matrix_noise_shift,
|
||||
int access_unit_size_pow2,
|
||||
int32_t mask);
|
||||
|
||||
/* Helper macros that expand, for every (order, channels, shift) combination
 * implemented in assembly, either a prototype (DECLARE_PACK) or a table
 * initializer entry (ENUMERATE_PACK).  The expansion order — order-major,
 * then channels, then shift — must match the index arithmetic
 * (inorder*3+ch_index)*7+shift used in mlp_select_pack_output_armv6. */
#define DECLARE_PACK(order,channels,shift) \
    int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
#define ENUMERATE_PACK(order,channels,shift) \
    ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6,
/* Shifts 0-5 plus the "mixed" variant: 7 entries per channel count. */
#define PACK_CHANNELS(macro,order,channels) \
    macro(order,channels,0) \
    macro(order,channels,1) \
    macro(order,channels,2) \
    macro(order,channels,3) \
    macro(order,channels,4) \
    macro(order,channels,5) \
    macro(order,channels,mixed)
/* Channel counts 2, 6 and 8: 3 entries per order. */
#define PACK_ORDER(macro,order) \
    PACK_CHANNELS(macro,order,2) \
    PACK_CHANNELS(macro,order,6) \
    PACK_CHANNELS(macro,order,8)
/* Orders "outoforder" then "inorder": 2*3*7 = 42 combinations total. */
#define PACK_ALL(macro) \
    PACK_ORDER(macro,outof) \
    PACK_ORDER(macro,in)
PACK_ALL(DECLARE_PACK)
|
||||
|
||||
/* Variants that are never assembled are stubbed out as 0 (null pointer) so
 * the routine[] table below stays fully populated.  The selector filters
 * these cases out before indexing, so a stub entry is never returned:
 * mixed-shift + out-of-order is rejected in all builds, and in Thumb builds
 * all out-of-order variants are rejected. */
#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0
#if CONFIG_THUMB
/* The out-of-order implementations exist only in ARM mode (the assembly is
 * wrapped in !CONFIG_THUMB), so stub every out-of-order entry here. */
#define ff_mlp_pack_output_outoforder_2ch_0shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_1shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_2shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_3shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_4shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_5shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_0shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_1shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_2shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_3shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_4shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_5shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_0shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_1shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_2shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_3shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_4shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_5shift_armv6 0
#endif
|
||||
|
||||
/**
 * Choose the best ARMv6 output-packing routine for the stream's channel
 * assignment and per-channel output shifts, falling back to the generic C
 * ff_mlp_pack_output whenever no specialised variant applies.
 *
 * The specialised routines are indexed by three properties:
 *  - order:    whether ch_assign[] is the identity permutation ("inorder");
 *  - channels: 2, 6 or 8 (derived from max_matrix_channel 1/5/7);
 *  - shift:    a single shift 0-5 shared by all channels, or 6 ("mixed").
 */
static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign,
                                              int8_t *output_shift,
                                              uint8_t max_matrix_channel,
                                              int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
{
    int ch_index;
    /* Start from channel 0's shift; out-of-range values map to 6 (mixed). */
    int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0];
    int inorder = 1;
    /* 2 orders * 3 channel counts * 7 shift variants, in PACK_ALL order. */
    static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = {
        PACK_ALL(ENUMERATE_PACK)
    };
    int i;

    if (!is32) // don't support 16-bit output (it's not used by TrueHD)
        return ff_mlp_pack_output;

    switch (max_matrix_channel) {
    case 1:
        ch_index = 0;
        break;
    case 5:
        ch_index = 1;
        break;
    case 7:
        ch_index = 2;
        break;
    default:
        /* No assembly variant for this channel count. */
        return ff_mlp_pack_output;
    }

    /* Scan all channels to classify the shift pattern and channel order. */
    for (i = 0; i <= max_matrix_channel; i++) {
        if (shift != 6 && output_shift[i] != shift)
            shift = 6; // indicate mixed shifts
        if (ch_assign[i] != i)
            inorder = 0;
    }
#if CONFIG_THUMB
    if (!inorder)
        return ff_mlp_pack_output; // can't currently handle an order array except in ARM mode
#else
    if (shift == 6 && !inorder)
        return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
#endif

    /* All remaining combinations have a real (non-stub) table entry. */
    return routine[(inorder*3+ch_index)*7+shift];
}
|
||||
|
||||
/* Install the ARM-optimised MLP/TrueHD routines that the detected CPU
 * features support; fields not set here keep their generic C defaults. */
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
    const int flags = av_get_cpu_flags();

    if (have_armv5te(flags)) {
        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
        c->mlp_filter_channel   = ff_mlp_filter_channel_arm;
    }

    if (have_armv6(flags)) {
        c->mlp_select_pack_output = mlp_select_pack_output_armv6;
    }
}
|
||||
143
externals/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
vendored
Executable file
143
externals/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
vendored
Executable file
@@ -0,0 +1,143 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ No-op macro that swallows all of its arguments.  Passed as the default
@ "rsb" parameter of sum8 below, so that operand negation can be disabled.
.macro skip args:vararg
.endm
|
||||
|
||||
@ Accumulate 8 window/sample products into the 64-bit pair \lo:\hi.
@   \w    - window coefficient base pointer (stride 64 words between taps,
@           plus a constant word offset \offs)
@   \p    - sample pointer, pre-incremented by 4 on the first load
@   \t1-\t4 - scratch registers
@   \rsb  - either "rsb" (negate each window coefficient) or "skip" (don't)
@ Loads are interleaved with the smlal multiplies to hide load latency.
.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0
        ldr             \t1, [\w, #4*\offs]
        ldr             \t2, [\p, #4]!
        \rsb            \t1, \t1, #0
  .irpc i, 135
        ldr             \t3, [\w, #4*64*\i+4*\offs]
        ldr             \t4, [\p, #4*64*\i]
        smlal           \lo, \hi, \t1, \t2
        \rsb            \t3, \t3, #0
        ldr             \t1, [\w, #4*64*(\i+1)+4*\offs]
        ldr             \t2, [\p, #4*64*(\i+1)]
        smlal           \lo, \hi, \t3, \t4
        \rsb            \t1, \t1, #0
  .endr
        @ Final tap (i == 7) handled outside the loop.
        ldr             \t3, [\w, #4*64*7+4*\offs]
        ldr             \t4, [\p, #4*64*7]
        smlal           \lo, \hi, \t1, \t2
        \rsb            \t3, \t3, #0
        smlal           \lo, \hi, \t3, \t4
.endm
|
||||
|
||||
@ Convert the 64-bit accumulator \lo:\hi into a saturated 16-bit sample \rd:
@ take bits [24..39] (i.e. drop 24 fraction bits), saturate to 16 bits, and
@ leave the low 24 fraction bits in \lo with \hi cleared so the remainder
@ carries into the next accumulation (dithering state).
.macro round rd, lo, hi
        lsr             \rd, \lo, #24
        bic             \lo, \lo, #0xff000000
        orr             \rd, \rd, \hi, lsl #8
        mov             \hi, #0
        ssat            \rd, #16, \rd
.endm
|
||||
|
||||
@ void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
@                                         int *dither, int16_t *out,
@                                         ptrdiff_t incr)
@ Fixed-point MPEG-audio synthesis windowing.  Mirrors the first 128 words of
@ synth_buf to synth_buf+512, then produces 32 output samples: one via the
@ first/last sum8 pairs and 15 pairs of samples in the main loop (written
@ forwards from "out" and backwards from samples2).  The dither word at *dither
@ seeds the accumulator and receives the rounding remainder at the end.
function ff_mpadsp_apply_window_fixed_armv6, export=1
        push            {r2,r4-r11,lr}

        add             r4,  r0,  #4*512        @ synth_buf + 512
        @ Copy the first 32 words so wrap-around reads stay in-bounds.
.rept 4
        ldm             r0!, {r5-r12}
        stm             r4!, {r5-r12}
.endr

        ldr             r4,  [sp, #40]          @ incr
        sub             r0,  r0,  #4*17         @ synth_buf + 16
        ldr             r8,  [r2]               @ sum:low
        add             r2,  r0,  #4*32         @ synth_buf + 48
        rsb             r5,  r4,  r4,  lsl #5   @ 31 * incr
        lsl             r4,  r4,  #1
        asr             r9,  r8,  #31           @ sum:high
        add             r5,  r3,  r5,  lsl #1   @ samples2
        add             r6,  r1,  #4*32         @ w2
        str             r4,  [sp, #40]          @ stash 2*incr for the loop

        @ Sample 0: two 8-tap sums, the second with negated coefficients.
        sum8            r8,  r9,  r1,  r0,  r10, r11, r12, lr
        sum8            r8,  r9,  r1,  r2,  r10, r11, r12, lr, rsb, 32
        round           r10, r8,  r9
        strh_post       r10, r3,  r4

        mov             lr,  #15                @ 15 iterations, 2 samples each
1:
        ldr             r12, [r0, #4]!
        ldr             r11, [r6, #-4]!
        ldr             r10, [r1, #4]!
        @ First half: accumulate the "forward" sample in r8:r9 and start the
        @ "backward" sample in r4:r7 (smull on the first tap, smlal after).
.irpc i, 0246
  .if \i
        ldr             r11, [r6, #4*64*\i]
        ldr             r10, [r1, #4*64*\i]
  .endif
        rsb             r11, r11, #0
        smlal           r8,  r9,  r10, r12
        ldr             r10, [r0, #4*64*(\i+1)]
  .ifeq \i
        smull           r4,  r7,  r11, r12
  .else
        smlal           r4,  r7,  r11, r12
  .endif
        ldr             r11, [r6, #4*64*(\i+1)]
        ldr             r12, [r1, #4*64*(\i+1)]
        rsb             r11, r11, #0
        smlal           r8,  r9,  r12, r10
  .iflt \i-6
        ldr             r12, [r0, #4*64*(\i+2)]
  .else
        ldr             r12, [r2, #-4]!         @ switch to the mirrored buffer
  .endif
        smlal           r4,  r7,  r11, r10
.endr
        @ Second half: same pattern over the +32-word coefficient columns.
.irpc i, 0246
        ldr             r10, [r1, #4*64*\i+4*32]
        rsb             r12, r12, #0
        ldr             r11, [r6, #4*64*\i+4*32]
        smlal           r8,  r9,  r10, r12
        ldr             r10, [r2, #4*64*(\i+1)]
        smlal           r4,  r7,  r11, r12
        ldr             r12, [r1, #4*64*(\i+1)+4*32]
        rsb             r10, r10, #0
        ldr             r11, [r6, #4*64*(\i+1)+4*32]
        smlal           r8,  r9,  r12, r10
  .iflt \i-6
        ldr             r12, [r2, #4*64*(\i+2)]
  .else
        ldr             r12, [sp, #40]          @ reload 2*incr for the stores
  .endif
        smlal           r4,  r7,  r11, r10
.endr
        round           r10, r8,  r9
        adds            r8,  r8,  r4            @ fold backward sum onto remainder
        adc             r9,  r9,  r7
        strh_post       r10, r3,  r12           @ forward sample
        round           r11, r8,  r9
        subs            lr,  lr,  #1
        strh_dpost      r11, r5,  r12           @ backward sample
        bgt             1b

        @ Last sample (offset 33), then write back the dither remainder.
        sum8            r8,  r9,  r1,  r0,  r10, r11, r12, lr, rsb, 33
        pop             {r4}                    @ saved dither pointer (r2)
        round           r10, r8,  r9
        str             r8,  [r4]
        strh            r10, [r3]

        pop             {r4-r11,pc}
endfunc
|
||||
38
externals/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c
vendored
Executable file
38
externals/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/mpegaudiodsp.h"
|
||||
#include "config.h"
|
||||
|
||||
void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
|
||||
int *dither, int16_t *out, ptrdiff_t incr);
|
||||
|
||||
/* Hook up the ARM-optimised MPEG-audio DSP routine when the CPU supports
 * ARMv6; otherwise the context keeps its generic implementation. */
av_cold void ff_mpadsp_init_arm(MPADSPContext *s)
{
    const int flags = av_get_cpu_flags();

    if (have_armv6(flags))
        s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6;
}
|
||||
54
externals/ffmpeg/libavcodec/arm/mpegvideo_arm.c
vendored
Executable file
54
externals/ffmpeg/libavcodec/arm/mpegvideo_arm.c
vendored
Executable file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2002 Michael Niedermayer
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/internal.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/mpegvideo.h"
|
||||
#include "mpegvideo_arm.h"
|
||||
#include "asm-offsets.h"
|
||||
|
||||
#if HAVE_NEON
|
||||
AV_CHECK_OFFSET(MpegEncContext, y_dc_scale, Y_DC_SCALE);
|
||||
AV_CHECK_OFFSET(MpegEncContext, c_dc_scale, C_DC_SCALE);
|
||||
AV_CHECK_OFFSET(MpegEncContext, ac_pred, AC_PRED);
|
||||
AV_CHECK_OFFSET(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
|
||||
AV_CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
|
||||
INTER_SCANTAB_RASTER_END);
|
||||
AV_CHECK_OFFSET(MpegEncContext, h263_aic, H263_AIC);
|
||||
#endif
|
||||
|
||||
void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
|
||||
int n, int qscale);
|
||||
void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
|
||||
int n, int qscale);
|
||||
|
||||
/* Select ARM-optimised mpegvideo routines: ARMv5TE dequantizers first, then
 * override them with the NEON versions when NEON is available. */
av_cold void ff_mpv_common_init_arm(MpegEncContext *s)
{
    const int flags = av_get_cpu_flags();

    if (have_armv5te(flags))
        ff_mpv_common_init_armv5te(s);

    if (have_neon(flags)) {
        s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon;
        s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon;
    }
}
|
||||
26
externals/ffmpeg/libavcodec/arm/mpegvideo_arm.h
vendored
Executable file
26
externals/ffmpeg/libavcodec/arm/mpegvideo_arm.h
vendored
Executable file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_ARM_MPEGVIDEO_ARM_H
|
||||
#define AVCODEC_ARM_MPEGVIDEO_ARM_H
|
||||
|
||||
#include "libavcodec/mpegvideo.h"
|
||||
|
||||
void ff_mpv_common_init_armv5te(MpegEncContext *s);
|
||||
|
||||
#endif /* AVCODEC_ARM_MPEGVIDEO_ARM_H */
|
||||
102
externals/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c
vendored
Executable file
102
externals/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c
vendored
Executable file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Optimization of some functions from mpegvideo.c for armv5te
|
||||
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/avassert.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/mpegvideo.h"
|
||||
#include "mpegvideo_arm.h"
|
||||
|
||||
void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count);
|
||||
|
||||
#ifdef ENABLE_ARM_TESTS
|
||||
/**
|
||||
* H.263 dequantizer supplementary function, it is performance critical and needs to
|
||||
* have optimized implementations for each architecture. Is also used as a reference
|
||||
* implementation in regression tests
|
||||
*/
|
||||
/* Reference H.263 dequantizer: each nonzero coefficient c becomes
 * c * qmul + sign(c) * qadd; zero coefficients are left untouched. */
static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qadd, int count)
{
    for (int i = 0; i < count; i++) {
        const int c = block[i];
        if (c == 0)
            continue;
        block[i] = c < 0 ? c * qmul - qadd
                         : c * qmul + qadd;
    }
}
|
||||
#endif
|
||||
|
||||
/* Intra-block H.263 dequantization using the ARMv5TE assembly core.
 * The DC coefficient is scaled separately (unless AIC is active), so it is
 * computed up front and restored after the bulk dequantization, which also
 * processes element 0. */
static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
                                              int16_t *block, int n, int qscale)
{
    int dc, qadd, ncoeffs;
    const int qmul = qscale << 1;

    av_assert2(s->block_last_index[n] >= 0);

    if (s->h263_aic) {
        qadd = 0;
        dc   = block[0];
    } else {
        /* Blocks 0-3 are luma, the rest chroma. */
        dc   = block[0] * (n < 4 ? s->y_dc_scale : s->c_dc_scale);
        qadd = (qscale - 1) | 1;
    }

    /* With AC prediction all 64 coefficients may be nonzero. */
    ncoeffs = s->ac_pred
        ? 63
        : s->inter_scantable.raster_end[s->block_last_index[n]];

    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, ncoeffs + 1);
    block[0] = dc;
}
|
||||
|
||||
/* Inter-block H.263 dequantization using the ARMv5TE assembly core; unlike
 * the intra path, the DC coefficient needs no special treatment. */
static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
                                              int16_t *block, int n, int qscale)
{
    const int qmul = qscale << 1;
    const int qadd = (qscale - 1) | 1;
    int ncoeffs;

    av_assert2(s->block_last_index[n] >= 0);

    ncoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];

    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, ncoeffs + 1);
}
|
||||
|
||||
/* Install the ARMv5TE H.263 dequantizers into the mpegvideo context. */
av_cold void ff_mpv_common_init_armv5te(MpegEncContext *s)
{
    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
}
|
||||
114
externals/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S
vendored
Executable file
114
externals/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S
vendored
Executable file
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Optimization of some functions from mpegvideo.c for armv5te
|
||||
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
/*
|
||||
* Special optimized version of dct_unquantize_h263_helper_c, it
|
||||
* requires the block to be at least 8 bytes aligned, and may process
|
||||
* more elements than requested. But it is guaranteed to never
|
||||
* process more than 64 elements provided that count argument is <= 64,
|
||||
* so it is safe. This function is optimized for a common distribution
|
||||
* of values for nCoeffs (they are mostly multiple of 8 plus one or
|
||||
* two extra elements). So this function processes data as 8 elements
|
||||
* per loop iteration and contains optional 2 elements processing in
|
||||
* the end.
|
||||
*
|
||||
* Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
|
||||
*/
|
||||
|
||||
@ Dequantize the TOP halfword of \src into \dst (ip must hold 0):
@ compare the signed top half against 0 via "rsbs ip - (src >> 16)", pick
@ +\add or -\add according to the coefficient's sign, and when the
@ coefficient is nonzero compute dst = top16(src) * bottom16(mul) + tmp.
@ Zero coefficients leave \dst unwritten.
.macro dequant_t dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, asr #16
        it              gt
        addgt           \tmp, \add, #0
        it              lt
        rsblt           \tmp, \add, #0
        it              ne
        smlatbne        \dst, \src, \mul, \tmp
.endm
|
||||
|
||||
@ Same as dequant_t but for the BOTTOM halfword of \src: the "lsl #16"
@ moves the low half into the top bits so the flags reflect its sign, and
@ smlabb multiplies the low halves.  Zero coefficients leave \dst unwritten.
.macro dequant_b dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, lsl #16
        it              gt
        addgt           \tmp, \add, #0
        it              lt
        rsblt           \tmp, \add, #0
        it              ne
        smlabbne        \dst, \src, \mul, \tmp
.endm
|
||||
|
||||
@ void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd,
@                                     int count)
@ Dequantizes "count" coefficients in-place, 8 per main-loop iteration
@ (two 32-bit ldrd pairs = 4 halfwords each), with a 2-element tail.
@ May process a few elements beyond "count" but never more than 64 — see
@ the file-level comment above.  Requires 8-byte alignment of block.
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0                  @ dequant_t/_b compare against ip
        subs            r3, r3, #2
        ble             2f                      @ count <= 2: tail only
        ldrd            r4, r5, [r0, #0]
1:
        ldrd            r6, r7, [r0, #8]

        @ First 4 halfwords: tops into r9/lr, bottoms in-place in r4/r5.
        dequant_t       r9, r4, r1, r2, r9
        dequant_t       lr, r5, r1, r2, lr
        dequant_b       r4, r4, r1, r2, r8
        dequant_b       r5, r5, r1, r2, r8

        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        @ Next 4 halfwords.
        dequant_t       r9, r6, r1, r2, r9
        dequant_t       lr, r7, r1, r2, lr
        dequant_b       r6, r6, r1, r2, r8
        dequant_b       r7, r7, r1, r2, r8

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8
        it              gt
        ldrdgt          r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */
        bgt             1b

        adds            r3, r3, #2
        it              le
        pople           {r4-r9,pc}              @ no tail elements remain
2:
        @ Tail: dequantize two coefficients individually.
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]
        mov             r8, r2
        cmp             r9, #0
        it              lt
        rsblt           r8, r2, #0
        it              ne
        smlabbne        r9, r9, r1, r8
        mov             r8, r2
        cmp             lr, #0
        it              lt
        rsblt           r8, r2, #0
        it              ne
        smlabbne        lr, lr, r1, r8
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}
endfunc
|
||||
107
externals/ffmpeg/libavcodec/arm/mpegvideo_neon.S
vendored
Executable file
107
externals/ffmpeg/libavcodec/arm/mpegvideo_neon.S
vendored
Executable file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "asm-offsets.h"
|
||||
|
||||
@ void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
@                                        int n, int qscale)
@ Computes qmul (r0), qadd (r2) and the coefficient count (r3) from the
@ context, then FALLS THROUGH (no return) into ff_dct_unquantize_h263_neon,
@ which is placed immediately below and expects exactly those registers.
function ff_dct_unquantize_h263_inter_neon, export=1
        add             r12, r0,  #BLOCK_LAST_INDEX
        ldr             r12, [r12, r2, lsl #2]  @ s->block_last_index[n]
        add             r0,  r0,  #INTER_SCANTAB_RASTER_END
        ldrb            r12, [r0, r12]          @ raster_end[last_index]
        sub             r2,  r3,  #1
        lsl             r0,  r3,  #1            @ qmul = qscale * 2
        orr             r2,  r2,  #1            @ qadd = (qscale - 1) | 1
        add             r3,  r12, #1            @ count = ncoeffs + 1
endfunc
|
||||
|
||||
@ Core NEON H.263 dequantizer.
@   r0 = qmul, r1 = block pointer (source and destination), r2 = qadd,
@   r3 = coefficient count.
@ Per element: out = c ? c * qmul + (c < 0 ? -qadd : qadd) : 0, processed
@ 16 at a time in the main loop with an 4-element tail.  vbsl selects
@ -qadd/+qadd by sign; vbif keeps the original zero lanes untouched.
function ff_dct_unquantize_h263_neon, export=1
        vdup.16         q15, r0                 @ qmul
        vdup.16         q14, r2                 @ qadd
        vneg.s16        q13, q14                @ -qadd
        cmp             r3,  #4
        mov             r0,  r1
        ble             2f                      @ <= 4 coefficients: tail only
1:
        vld1.16         {q0},  [r0,:128]!
        vclt.s16        q3,  q0,  #0            @ sign mask
        vld1.16         {q8},  [r0,:128]!
        vceq.s16        q1,  q0,  #0            @ zero mask
        vmul.s16        q2,  q0,  q15
        vclt.s16        q11, q8,  #0
        vmul.s16        q10, q8,  q15
        vbsl            q3,  q13, q14           @ +/-qadd by sign
        vbsl            q11, q13, q14
        vadd.s16        q2,  q2,  q3
        vceq.s16        q9,  q8,  #0
        vadd.s16        q10, q10, q11
        vbif            q0,  q2,  q1            @ keep zeros as zeros
        vbif            q8,  q10, q9
        subs            r3,  r3,  #16
        vst1.16         {q0},  [r1,:128]!
        vst1.16         {q8},  [r1,:128]!
        it              le
        bxle            lr
        cmp             r3,  #8
        bgt             1b
2:
        @ Tail: one 4-element (64-bit) vector, same select logic.
        vld1.16         {d0},  [r0,:64]
        vclt.s16        d3,  d0,  #0
        vceq.s16        d1,  d0,  #0
        vmul.s16        d2,  d0,  d30
        vbsl            d3,  d26, d28
        vadd.s16        d2,  d2,  d3
        vbif            d0,  d2,  d1
        vst1.16         {d0}, [r1,:64]
        bx              lr
endfunc
|
||||
|
||||
@ void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
@                                        int n, int qscale)
@ Intra variant: computes the specially-scaled DC value in r4, runs the
@ shared NEON core on the whole block, then stores the saved DC back over
@ element 0 (which the core also processed).
function ff_dct_unquantize_h263_intra_neon, export=1
        push            {r4-r6,lr}
        add             r12, r0,  #BLOCK_LAST_INDEX
        ldr             r6,  [r0, #AC_PRED]
        add             lr,  r0,  #INTER_SCANTAB_RASTER_END
        cmp             r6,  #0
        it              ne
        movne           r12, #63                @ AC prediction: all 64 coeffs
        bne             1f
        ldr             r12, [r12, r2, lsl #2]  @ s->block_last_index[n]
        ldrb            r12, [lr, r12]          @ raster_end[last_index]
1:      ldr             r5,  [r0, #H263_AIC]
        ldrsh           r4,  [r1]               @ raw DC coefficient
        cmp             r5,  #0
        mov             r5,  r1                 @ remember &block[0]
        it              ne
        movne           r2,  #0                 @ AIC: qadd = 0, DC unscaled
        bne             2f
        cmp             r2,  #4                 @ n >= 4: chroma block —
        it              ge                      @ presumably C_DC_SCALE is
        addge           r0,  r0,  #4            @ Y_DC_SCALE + 4; confirm in
        sub             r2,  r3,  #1            @ asm-offsets.h
        ldr             r6,  [r0, #Y_DC_SCALE]
        orr             r2,  r2,  #1            @ qadd = (qscale - 1) | 1
        smulbb          r4,  r4,  r6            @ DC = block[0] * dc_scale
2:      lsl             r0,  r3,  #1            @ qmul = qscale * 2
        add             r3,  r12, #1            @ count = ncoeffs + 1
        bl              X(ff_dct_unquantize_h263_neon)
        vmov.16         d0[0], r4
        vst1.16         {d0[0]}, [r5]           @ restore the DC coefficient
        pop             {r4-r6,pc}
endfunc
|
||||
76
externals/ffmpeg/libavcodec/arm/mpegvideoencdsp_armv6.S
vendored
Executable file
76
externals/ffmpeg/libavcodec/arm/mpegvideoencdsp_armv6.S
vendored
Executable file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ int ff_pix_norm1_armv6(uint8_t *pix, int line_size)
@ Sum of squares of a 16x16 block of bytes.  Each iteration loads 16 pixels
@ as four words, unpacks even/odd bytes with uxtb16 and accumulates the
@ squared pairs with smlad (dual 16x16 multiply-accumulate) into lr.
function ff_pix_norm1_armv6, export=1
        push            {r4-r6, lr}
        mov             r12, #16                @ 16 rows
        mov             lr,  #0                 @ accumulator
1:
        ldm             r0,  {r2-r5}            @ one 16-pixel row
        uxtb16          r6,  r2                 @ bytes 0 and 2
        uxtb16          r2,  r2,  ror #8        @ bytes 1 and 3
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r3
        smlad           lr,  r2,  r2,  lr
        uxtb16          r3,  r3,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r4
        smlad           lr,  r3,  r3,  lr
        uxtb16          r4,  r4,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r5
        smlad           lr,  r4,  r4,  lr
        uxtb16          r5,  r5,  ror #8
        smlad           lr,  r6,  r6,  lr
        subs            r12, r12, #1
        add             r0,  r0,  r1            @ next row
        smlad           lr,  r5,  r5,  lr
        bgt             1b

        mov             r0,  lr
        pop             {r4-r6, pc}
endfunc
|
||||
|
||||
@ int ff_pix_sum_armv6(uint8_t *pix, int line_size)
@ Sum of all bytes in a 16x16 block.  usada8 with a zero operand (lr) adds
@ the four bytes of a word into an accumulator; two accumulators (r2, r3)
@ are used so the adds can be interleaved with the loads.  The final row's
@ last word (r7) is folded in at label 2, the loop's exit path.
function ff_pix_sum_armv6, export=1
        push            {r4-r7, lr}
        mov             r12, #16                @ 16 rows
        mov             r2,  #0                 @ accumulator A
        mov             r3,  #0                 @ accumulator B
        mov             lr,  #0                 @ zero reference for usada8
        ldr             r4,  [r0]
1:
        subs            r12, r12, #1
        ldr             r5,  [r0, #4]
        usada8          r2,  r4,  lr,  r2
        ldr             r6,  [r0, #8]
        usada8          r3,  r5,  lr,  r3
        ldr             r7,  [r0, #12]
        usada8          r2,  r6,  lr,  r2
        beq             2f                      @ last row: finish at 2
        ldr_pre         r4,  r0,  r1            @ first word of the next row
        usada8          r3,  r7,  lr,  r3
        bgt             1b
2:
        usada8          r3,  r7,  lr,  r3       @ fold in the final word
        add             r0,  r2,  r3
        pop             {r4-r7, pc}
endfunc
|
||||
38
externals/ffmpeg/libavcodec/arm/mpegvideoencdsp_init_arm.c
vendored
Executable file
38
externals/ffmpeg/libavcodec/arm/mpegvideoencdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/mpegvideoencdsp.h"
|
||||
|
||||
int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
|
||||
int ff_pix_sum_armv6(uint8_t *pix, int line_size);
|
||||
|
||||
av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
|
||||
AVCodecContext *avctx)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_armv6(cpu_flags)) {
|
||||
c->pix_norm1 = ff_pix_norm1_armv6;
|
||||
c->pix_sum = ff_pix_sum_armv6;
|
||||
}
|
||||
}
|
||||
59
externals/ffmpeg/libavcodec/arm/neon.S
vendored
Executable file
59
externals/ffmpeg/libavcodec/arm/neon.S
vendored
Executable file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
@ Transpose an 8x8 byte matrix held in eight d-registers (one row per
@ register).  Works in three stages of progressively finer vtrn element
@ sizes: 32-bit, then 16-bit, then 8-bit swaps.
.macro  transpose_8x8   r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
.endm
|
||||
|
||||
@ Transpose a 4x4 block of bytes held in the low halves of four
@ d-registers: 16-bit swaps first, then 8-bit swaps.
.macro  transpose_4x4   r0, r1, r2, r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
.endm
|
||||
|
||||
@ Exchange the contents of two groups of four registers pairwise
@ (\r0<->\r4, \r1<->\r5, ...).
.macro  swap4           r0, r1, r2, r3, r4, r5, r6, r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
.endm
|
||||
|
||||
@ Transpose two 4x4 matrices of 16-bit elements held in eight
@ d-registers (rows 0-3 in \r0-\r3, rows 4-7 in \r4-\r7): 32-bit swaps
@ first, then 16-bit swaps within each half.
.macro  transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
.endm
|
||||
99
externals/ffmpeg/libavcodec/arm/neontest.c
vendored
Executable file
99
externals/ffmpeg/libavcodec/arm/neontest.c
vendored
Executable file
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
* check NEON registers for clobbers
|
||||
* Copyright (c) 2013 Martin Storsjo
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavutil/arm/neontest.h"
|
||||
|
||||
/*
 * Each wrap() below re-declares a public libavcodec entry point so that
 * testneonclobbers() can call the real implementation and then check
 * that no callee-saved NEON registers were clobbered.  The wrap() and
 * testneonclobbers() macros come from libavutil/arm/neontest.h; this
 * file only instantiates them for each API function under test.
 */
wrap(avcodec_open2(AVCodecContext *avctx,
                   const AVCodec *codec,
                   AVDictionary **options))
{
    testneonclobbers(avcodec_open2, avctx, codec, options);
}

wrap(avcodec_decode_audio4(AVCodecContext *avctx,
                           AVFrame *frame,
                           int *got_frame_ptr,
                           AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_audio4, avctx, frame,
                     got_frame_ptr, avpkt);
}

wrap(avcodec_decode_video2(AVCodecContext *avctx,
                           AVFrame *picture,
                           int *got_picture_ptr,
                           AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_video2, avctx, picture,
                     got_picture_ptr, avpkt);
}

wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
                              AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_subtitle2, avctx, sub,
                     got_sub_ptr, avpkt);
}

wrap(avcodec_encode_audio2(AVCodecContext *avctx,
                           AVPacket *avpkt,
                           const AVFrame *frame,
                           int *got_packet_ptr))
{
    testneonclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
                     got_packet_ptr);
}

wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
                             uint8_t *buf, int buf_size,
                             const AVSubtitle *sub))
{
    testneonclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
}

wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
                           const AVFrame *frame, int *got_packet_ptr))
{
    testneonclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
}

wrap(avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt))
{
    testneonclobbers(avcodec_send_packet, avctx, avpkt);
}

wrap(avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt))
{
    testneonclobbers(avcodec_receive_packet, avctx, avpkt);
}

wrap(avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame))
{
    testneonclobbers(avcodec_send_frame, avctx, frame);
}

wrap(avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame))
{
    testneonclobbers(avcodec_receive_frame, avctx, frame);
}
|
||||
76
externals/ffmpeg/libavcodec/arm/pixblockdsp_armv6.S
vendored
Executable file
76
externals/ffmpeg/libavcodec/arm/pixblockdsp_armv6.S
vendored
Executable file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels,
@                          ptrdiff_t stride)
@ Widen an 8x8 block of uint8 pixels into a 64-entry int16 block.
@ r0 = block, r1 = pixels, r2 = stride.
@ Per row: uxtb16 splits a word into bytes {0,2}; with ror #8 it yields
@ bytes {1,3}; pkhbt/pkhtb then reassemble them into four 16-bit values
@ in original pixel order.
function ff_get_pixels_armv6, export=1
        pld             [r1, r2]                @ prefetch next row
        push            {r4-r8, lr}
        mov             lr,  #8                 @ 8 rows
1:
        ldrd_post       r4,  r5,  r1,  r2       @ load 8 pixels, advance by stride
        subs            lr,  lr,  #1
        uxtb16          r6,  r4                 @ bytes 0,2 of word 0
        uxtb16          r4,  r4,  ror #8        @ bytes 1,3 of word 0
        uxtb16          r12, r5                 @ bytes 0,2 of word 1
        uxtb16          r8,  r5,  ror #8        @ bytes 1,3 of word 1
        pld             [r1, r2]
        pkhbt           r5,  r6,  r4,  lsl #16  @ halfwords 0,1 in order
        pkhtb           r6,  r4,  r6,  asr #16  @ halfwords 2,3 in order
        pkhbt           r7,  r12, r8,  lsl #16  @ halfwords 4,5
        pkhtb           r12, r8,  r12, asr #16  @ halfwords 6,7
        stm             r0!, {r5,r6,r7,r12}     @ store one int16 row
        bgt             1b

        pop             {r4-r8, pc}
endfunc
|
||||
|
||||
@ void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
@                           const uint8_t *s2, ptrdiff_t stride)
@ Per-pixel difference of two 8x8 uint8 blocks, widened to int16.
@ r0 = block, r1 = s1, r2 = s2, r3 = stride.
@ uxtb16 unpacks bytes to halfword lanes, ssub16 subtracts two lanes at
@ once, and pkhbt/pkhtb restore original pixel order before the store.
function ff_diff_pixels_armv6, export=1
        pld             [r1, r3]
        pld             [r2, r3]
        push            {r4-r9, lr}
        mov             lr,  #8                 @ 8 rows
1:
        ldrd_post       r4,  r5,  r1,  r3       @ 8 pixels of s1, advance
        ldrd_post       r6,  r7,  r2,  r3       @ 8 pixels of s2, advance
        uxtb16          r8,  r4                 @ s1 bytes 0,2
        uxtb16          r4,  r4,  ror #8        @ s1 bytes 1,3
        uxtb16          r9,  r6                 @ s2 bytes 0,2
        uxtb16          r6,  r6,  ror #8        @ s2 bytes 1,3
        pld             [r1, r3]
        ssub16          r9,  r8,  r9            @ s1 - s2, lanes 0,2
        ssub16          r6,  r4,  r6            @ s1 - s2, lanes 1,3
        uxtb16          r8,  r5                 @ s1 bytes 4,6
        uxtb16          r5,  r5,  ror #8        @ s1 bytes 5,7
        pld             [r2, r3]
        pkhbt           r4,  r9,  r6,  lsl #16  @ diffs 0,1 in order
        pkhtb           r6,  r6,  r9,  asr #16  @ diffs 2,3 in order
        uxtb16          r9,  r7                 @ s2 bytes 4,6
        uxtb16          r7,  r7,  ror #8        @ s2 bytes 5,7
        ssub16          r9,  r8,  r9            @ s1 - s2, lanes 4,6
        ssub16          r5,  r5,  r7            @ s1 - s2, lanes 5,7
        subs            lr,  lr,  #1
        pkhbt           r8,  r9,  r5,  lsl #16  @ diffs 4,5
        pkhtb           r9,  r5,  r9,  asr #16  @ diffs 6,7
        stm             r0!, {r4,r6,r8,r9}      @ store one int16 row
        bgt             1b

        pop             {r4-r9, pc}
endfunc
|
||||
61
externals/ffmpeg/libavcodec/arm/pixblockdsp_init_arm.c
vendored
Executable file
61
externals/ffmpeg/libavcodec/arm/pixblockdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/pixblockdsp.h"
|
||||
|
||||
void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t stride);
|
||||
void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
|
||||
const uint8_t *s2, ptrdiff_t stride);
|
||||
|
||||
void ff_get_pixels_neon(int16_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t stride);
|
||||
void ff_get_pixels_unaligned_neon(int16_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t stride);
|
||||
void ff_diff_pixels_neon(int16_t *block, const uint8_t *s1,
|
||||
const uint8_t *s2, ptrdiff_t stride);
|
||||
void ff_diff_pixels_unaligned_neon(int16_t *block, const uint8_t *s1,
|
||||
const uint8_t *s2, ptrdiff_t stride);
|
||||
|
||||
av_cold void ff_pixblockdsp_init_arm(PixblockDSPContext *c,
|
||||
AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_armv6(cpu_flags)) {
|
||||
if (!high_bit_depth)
|
||||
c->get_pixels = ff_get_pixels_armv6;
|
||||
c->diff_pixels = ff_diff_pixels_armv6;
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
if (!high_bit_depth) {
|
||||
c->get_pixels_unaligned = ff_get_pixels_unaligned_neon;
|
||||
c->get_pixels = ff_get_pixels_neon;
|
||||
}
|
||||
c->diff_pixels_unaligned = ff_diff_pixels_unaligned_neon;
|
||||
c->diff_pixels = ff_diff_pixels_neon;
|
||||
}
|
||||
}
|
||||
69
externals/ffmpeg/libavcodec/arm/pixblockdsp_neon.S
vendored
Executable file
69
externals/ffmpeg/libavcodec/arm/pixblockdsp_neon.S
vendored
Executable file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (c) 2020 Martin Storsjo
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ Load one d-register (8 bytes) from \src with post-increment \incr.
@ When \aligned is nonzero, emit the :64 alignment hint (caller must
@ guarantee 8-byte alignment of \src in that case).
.macro  vld1_8          dst, src, incr, aligned
.if \aligned
        vld1.8          {\dst}, [\src, :64], \incr
.else
        vld1.8          {\dst}, [\src], \incr
.endif
.endm
|
||||
|
||||
@ Instantiate ff_get_pixels\suffix\()_neon:
@ void ff_get_pixels*_neon(int16_t *block, const uint8_t *pixels,
@                          ptrdiff_t stride)
@ Widens an 8x8 uint8 block to int16, two rows per loop iteration.
@ r0 = block, r1 = pixels, r2 = stride.
.macro  get_pixels      suffix, aligned
function ff_get_pixels\suffix\()_neon, export=1
        mov             r3,  #8                 @ 8 rows, 2 per iteration
1:
        vld1_8          d0,  r1,  r2,  \aligned
        subs            r3,  r3,  #2
        vld1_8          d2,  r1,  r2,  \aligned
        vmovl.u8        q0,  d0                 @ widen row to int16
        vmovl.u8        q1,  d2
        vst1.16         {q0, q1}, [r0, :128]!   @ block is 16-byte aligned
        bgt             1b

        bx              lr
endfunc
.endm

@ Aligned and unaligned entry points share the body above.
        get_pixels      ,           aligned=1
        get_pixels      _unaligned, aligned=0
|
||||
|
||||
@ Instantiate ff_diff_pixels\suffix\()_neon:
@ void ff_diff_pixels*_neon(int16_t *block, const uint8_t *s1,
@                           const uint8_t *s2, ptrdiff_t stride)
@ int16 per-pixel difference of two 8x8 uint8 blocks, two rows per
@ iteration.  r0 = block, r1 = s1, r2 = s2, r3 = stride.
.macro  diff_pixels     suffix, aligned=0
function ff_diff_pixels\suffix\()_neon, export=1
        mov             r12, #8                 @ 8 rows, 2 per iteration
1:
        vld1_8          d0,  r1,  r3,  \aligned
        vld1_8          d1,  r2,  r3,  \aligned
        subs            r12, r12, #2
        vld1_8          d2,  r1,  r3,  \aligned
        vsubl.u8        q0,  d0,  d1            @ s1 - s2, widened to int16
        vld1_8          d3,  r2,  r3,  \aligned
        vsubl.u8        q1,  d2,  d3
        vst1.16         {q0, q1}, [r0]!         @ no alignment hint on block here
        bgt             1b

        bx              lr
endfunc
.endm

@ Aligned and unaligned entry points share the body above.
        diff_pixels     ,           aligned=1
        diff_pixels     _unaligned, aligned=0
|
||||
33
externals/ffmpeg/libavcodec/arm/rdft_init_arm.c
vendored
Executable file
33
externals/ffmpeg/libavcodec/arm/rdft_init_arm.c
vendored
Executable file
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
|
||||
#include "libavcodec/rdft.h"
|
||||
|
||||
void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
|
||||
|
||||
/**
 * Install the NEON RDFT implementation when the CPU supports it.
 *
 * @param s RDFT context whose rdft_calc pointer is patched in place
 */
av_cold void ff_rdft_init_arm(RDFTContext *s)
{
    if (have_neon(av_get_cpu_flags()))
        s->rdft_calc = ff_rdft_calc_neon;
}
|
||||
155
externals/ffmpeg/libavcodec/arm/rdft_neon.S
vendored
Executable file
155
externals/ffmpeg/libavcodec/arm/rdft_neon.S
vendored
Executable file
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* ARM NEON optimised RDFT
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ void ff_rdft_calc_neon(RDFTContext *s, FFTSample *z)
@ Real DFT via a half-size complex FFT plus a twiddle pass.
@ Forward direction: run ff_fft_permute/ff_fft_calc first, then the
@ twiddle loop below.  Inverse direction: run the twiddle loop first,
@ then tail-call permute + FFT at the end.
@ r4 = context (field offsets: 0 nbits, 4 inverse, 8 sign_convention,
@ 12 tcos, 16 tsin, 20 negative_sin, 24 embedded FFTContext),
@ r5 = data.  The loop walks r0 forward from the start and r1 backward
@ from the end (r8 = -8 post-decrement), combining mirrored pairs.
function ff_rdft_calc_neon, export=1
        push            {r4-r8,lr}

        ldr             r6,  [r0, #4]           @ inverse
        mov             r4,  r0
        mov             r5,  r1

        lsls            r6,  r6,  #31           @ inverse flag -> sign bit
        bne             1f
        @ forward transform: complex FFT before the twiddle pass
        add             r0,  r4,  #24
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #24
        mov             r1,  r5
        bl              X(ff_fft_calc_neon)
1:
        ldr             r12, [r4, #0]           @ nbits
        mov             r2,  #1
        ldr             r8,  [r4, #20]          @ negative_sin
        lsl             r12, r2,  r12           @ n = 1 << nbits
        add             r0,  r5,  #8            @ forward pointer
        lsl             r8,  r8,  #31           @ negative_sin -> sign bit
        add             r1,  r5,  r12, lsl #2
        lsr             r12, r12, #2            @ n/4 pair iterations
        vdup.32         d26, r8                 @ sign mask for tsin
        ldr             r2,  [r4, #12]          @ tcos
        sub             r12, r12, #2            @ loop count (last pair done after loop)
        ldr             r3,  [r4, #16]          @ tsin
        mov             r7,  r0                 @ store pointer, forward
        sub             r1,  r1,  #8            @ backward pointer
        mov             lr,  r1                 @ store pointer, backward
        mov             r8,  #-8                @ backward stride
        vld1.32         {d0},  [r0,:64]!        @ d1[0,1]
        vld1.32         {d1},  [r1,:64], r8     @ d2[0,1]
        vld1.32         {d4},  [r2,:64]!        @ tcos[i]
        vld1.32         {d5},  [r3,:64]!        @ tsin[i]
        vmov.f32        d18, #0.5               @ k1
        vdup.32         d19, r6
        veor            d5,  d26, d5            @ apply negative_sin to tsin
        pld             [r0, #32]
        veor            d19, d18, d19           @ k2
        vmov.i32        d16, #0
        vmov.i32        d17, #1<<31
        pld             [r1, #-32]
        vtrn.32         d16, d17
        pld             [r2, #32]
        vrev64.32       d16, d16                @ d16=1,0 d17=0,1 (lane sign masks)
        pld             [r3, #32]
2:
        @ main twiddle loop: two mirrored element pairs per iteration,
        @ software-pipelined (loads for the next pair overlap the math).
        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vld1.32         {d24}, [r0,:64]!        @ d1[0,1]
        vadd.f32        d0,  d0,  d3            @ d1[0]+d2[0], d1[1]-d2[1]
        vld1.32         {d25}, [r1,:64], r8     @ d2[0,1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
        pld             [r0, #32]
        vmul.f32        q10, q0,  q9            @ ev.re, ev.im, od.im, od.re
        pld             [r1, #-32]
        vadd.f32        d0,  d24, d7            @ d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
        vmul.f32        q11, q0,  q9            @ ev.re, ev.im, od.im, od.re
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @ od.re, od.im
        veor            d6,  d20, d17           @ ev.re,-ev.im
        veor            d2,  d3,  d16           @ -od.re, od.im
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vld1.32         {d4},  [r2,:64]!        @ tcos[i]
        veor            d7,  d23, d16           @ -od.im, od.re
        vld1.32         {d5},  [r3,:64]!        @ tsin[i]
        veor            d24, d22, d17           @ ev.re,-ev.im
        vrev64.32       d3,  d23                @ od.re, od.im
        veor            d5,  d26, d5            @ apply negative_sin
        pld             [r2, #32]
        veor            d2,  d3,  d16           @ -od.re, od.im
        pld             [r3, #32]
        vmla.f32        d22, d3,  d4[0]
        vmla.f32        d22, d7,  d5[0]
        vmla.f32        d24, d2,  d4[0]
        vmla.f32        d24, d23, d5[0]
        vld1.32         {d0},  [r0,:64]!        @ d1[0,1]
        vld1.32         {d1},  [r1,:64], r8     @ d2[0,1]
        vst1.32         {d20}, [r7,:64]!
        vst1.32         {d6},  [lr,:64], r8
        vst1.32         {d22}, [r7,:64]!
        vst1.32         {d24}, [lr,:64], r8
        subs            r12, r12, #2
        bgt             2b

        @ epilogue: final pipelined pair, the middle element, and the
        @ packed DC/Nyquist pair at z[0].
        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vadd.f32        d0,  d0,  d3            @ d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        ldr             r2,  [r4, #8]           @ sign_convention
        vmul.f32        q10, q0,  q9            @ ev.re, ev.im, od.im, od.re
        add             r0,  r0,  #4
        bfc             r2,  #0,  #31           @ keep only the sign bit
        vld1.32         {d0[0]}, [r0,:32]       @ middle element
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @ od.re, od.im
        veor            d6,  d20, d17           @ ev.re,-ev.im
        vld1.32         {d22}, [r5,:64]         @ z[0]
        vdup.32         d1,  r2
        vmov            d23, d22
        veor            d2,  d3,  d16           @ -od.re, od.im
        vtrn.32         d22, d23
        veor            d0,  d0,  d1            @ flip middle element per sign_convention
        veor            d23, d23, d17
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vadd.f32        d22, d22, d23
        vst1.32         {d20}, [r7,:64]
        vst1.32         {d6},  [lr,:64]
        vst1.32         {d0[0]}, [r0,:32]
        vst1.32         {d22}, [r5,:64]

        cmp             r6,  #0                 @ forward transform? then done
        it              eq
        popeq           {r4-r8,pc}

        @ inverse transform: scale z[0] by k1 then permute + FFT;
        @ the FFT call is a tail call (lr restored before the branch).
        vmul.f32        d22, d22, d18
        vst1.32         {d22}, [r5,:64]
        add             r0,  r4,  #24
        mov             r1,  r5
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #24
        mov             r1,  r5
        pop             {r4-r8,lr}
        b               X(ff_fft_calc_neon)
endfunc
|
||||
46
externals/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c
vendored
Executable file
46
externals/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/rv34dsp.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
|
||||
void ff_rv34_inv_transform_noround_neon(int16_t *block);
|
||||
|
||||
void ff_rv34_inv_transform_noround_dc_neon(int16_t *block);
|
||||
|
||||
void ff_rv34_idct_add_neon(uint8_t *dst, ptrdiff_t stride, int16_t *block);
|
||||
void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc);
|
||||
|
||||
/**
 * Install the NEON RV30/40 DSP routines when the CPU supports NEON.
 *
 * @param c RV34 DSP context whose function pointers are patched in place
 */
av_cold void ff_rv34dsp_init_arm(RV34DSPContext *c)
{
    if (!have_neon(av_get_cpu_flags()))
        return;

    c->rv34_inv_transform    = ff_rv34_inv_transform_noround_neon;
    c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
    c->rv34_idct_add         = ff_rv34_idct_add_neon;
    c->rv34_idct_dc_add      = ff_rv34_idct_dc_add_neon;
}
|
||||
156
externals/ffmpeg/libavcodec/arm/rv34dsp_neon.S
vendored
Executable file
156
externals/ffmpeg/libavcodec/arm/rv34dsp_neon.S
vendored
Executable file
@@ -0,0 +1,156 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
@ rv34_inv_transform \r0: two-pass 4x4 inverse transform of the int16
@ block at address \r0, using the RV30/40 constants 13, 7 (13-shift
@ tricks: x*8-x) and 17 (x*16+x).  The vtrn/vswp pair between the
@ passes transposes rows to columns.  Results are left unrounded in
@ q1 (z0+z3), q2 (z1+z2), q3 (z1-z2), q15 (z0-z3); callers shift/round.
.macro  rv34_inv_transform      r0
        vld1.16         {q14-q15}, [\r0,:128]
        vmov.s16        d0,  #13
        vshll.s16       q12, d29, #3
        vshll.s16       q13, d29, #4
        vshll.s16       q9,  d31, #3
        vshll.s16       q1,  d31, #4
        vmull.s16       q10, d28, d0
        vmlal.s16       q10, d30, d0
        vmull.s16       q11, d28, d0
        vmlsl.s16       q11, d30, d0
        vsubw.s16       q12, q12, d29           @ z2 = block[i+4*1]*7
        vaddw.s16       q13, q13, d29           @ z3 = block[i+4*1]*17
        vsubw.s16       q9,  q9,  d31
        vaddw.s16       q1,  q1,  d31
        vadd.s32        q13, q13, q9            @ z3 = 17*block[i+4*1] + 7*block[i+4*3]
        vsub.s32        q12, q12, q1            @ z2 = 7*block[i+4*1] - 17*block[i+4*3]
        vadd.s32        q1,  q10, q13           @ z0 + z3
        vadd.s32        q2,  q11, q12           @ z1 + z2
        vsub.s32        q8,  q10, q13           @ z0 - z3
        vsub.s32        q3,  q11, q12           @ z1 - z2
        @ transpose the intermediate rows for the column pass
        vtrn.32         q1,  q2
        vtrn.32         q3,  q8
        vswp            d3,  d6
        vswp            d5,  d16
        vmov.s32        d0,  #13
        vadd.s32        q10, q1,  q3
        vsub.s32        q11, q1,  q3
        vshl.s32        q12, q2,  #3
        vshl.s32        q9,  q2,  #4
        vmul.s32        q13, q11, d0[0]
        vshl.s32        q11, q8,  #4
        vadd.s32        q9,  q9,  q2
        vshl.s32        q15, q8,  #3
        vsub.s32        q12, q12, q2
        vadd.s32        q11, q11, q8
        vmul.s32        q14, q10, d0[0]
        vsub.s32        q8,  q15, q8
        vsub.s32        q12, q12, q11
        vadd.s32        q9,  q9,  q8
        vadd.s32        q2,  q13, q12           @ z1 + z2
        vadd.s32        q1,  q14, q9            @ z0 + z3
        vsub.s32        q3,  q13, q12           @ z1 - z2
        vsub.s32        q15, q14, q9            @ z0 - z3
.endm
|
||||
|
||||
/* void rv34_idct_add_c(uint8_t *dst, int stride, int16_t *block) */
@ Inverse transform the 4x4 block, round-shift by 10, add to the dst
@ pixels with unsigned saturation, and zero the coefficient block.
@ r0 = dst, r1 = stride, r2 = block.
function ff_rv34_idct_add_neon, export=1
        mov             r3,  r0                 @ keep dst for the store pass
        rv34_inv_transform      r2
        vmov.i16        q12, #0                 @ used to clear the block below
        vrshrn.s32      d16, q1,  #10           @ (z0 + z3) >> 10
        vrshrn.s32      d17, q2,  #10           @ (z1 + z2) >> 10
        vrshrn.s32      d18, q3,  #10           @ (z1 - z2) >> 10
        vrshrn.s32      d19, q15, #10           @ (z0 - z3) >> 10
        vld1.32         {d28[]},  [r0,:32], r1  @ dst rows 0-3, 4 bytes each
        vld1.32         {d29[]},  [r0,:32], r1
        vtrn.32         q8,  q9
        vld1.32         {d28[1]}, [r0,:32], r1
        vld1.32         {d29[1]}, [r0,:32], r1
        vst1.16         {q12}, [r2,:128]!       @ memset(block,    0, 16)
        vst1.16         {q12}, [r2,:128]        @ memset(block+16, 0, 16)
        vtrn.16         d16, d17                @ put results back into row order
        vtrn.32         d28, d29
        vtrn.16         d18, d19
        vaddw.u8        q0,  q8,  d28           @ add dst pixels, widened
        vaddw.u8        q1,  q9,  d29
        vqmovun.s16     d28, q0                 @ saturate back to uint8
        vqmovun.s16     d29, q1
        vst1.32         {d28[0]}, [r3,:32], r1
        vst1.32         {d28[1]}, [r3,:32], r1
        vst1.32         {d29[0]}, [r3,:32], r1
        vst1.32         {d29[1]}, [r3,:32], r1
        bx              lr
endfunc
|
||||
|
||||
/* void rv34_inv_transform_noround_neon(int16_t *block); */
@ In-place inverse transform without rounding: each result is
@ multiplied by 3 (x*2 + x) and shifted right by 11, then stored back
@ interleaved (vst4) so the block returns to row-major order.
function ff_rv34_inv_transform_noround_neon, export=1
        rv34_inv_transform      r0
        vshl.s32        q11, q2,  #1            @ *2 ...
        vshl.s32        q10, q1,  #1
        vshl.s32        q12, q3,  #1
        vshl.s32        q13, q15, #1
        vadd.s32        q11, q11, q2            @ ... +1 = *3
        vadd.s32        q10, q10, q1
        vadd.s32        q12, q12, q3
        vadd.s32        q13, q13, q15
        vshrn.s32       d0,  q10, #11           @ (z0 + z3)*3 >> 11
        vshrn.s32       d1,  q11, #11           @ (z1 + z2)*3 >> 11
        vshrn.s32       d2,  q12, #11           @ (z1 - z2)*3 >> 11
        vshrn.s32       d3,  q13, #11           @ (z0 - z3)*3 >> 11
        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]!
        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]!
        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]!
        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]!
        bx              lr
endfunc
|
||||
|
||||
/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */
@ DC-only inverse transform: every output pixel gets the same
@ (dc*169 + 0x200) >> 10 added, saturated to uint8.
@ r0 = dst, r1 = stride, r2 = dc.
function ff_rv34_idct_dc_add_neon, export=1
        mov             r3,  r0                 @ keep dst for the store pass
        vld1.32         {d28[]},  [r0,:32], r1  @ dst rows, 4 bytes each
        vld1.32         {d29[]},  [r0,:32], r1
        vdup.16         d0,  r2
        vmov.s16        d1,  #169               @ 13 * 13
        vld1.32         {d28[1]}, [r0,:32], r1
        vmull.s16       q1,  d0,  d1            @ dc * 13 * 13
        vld1.32         {d29[1]}, [r0,:32], r1
        vrshrn.s32      d0,  q1,  #10           @ (dc * 13 * 13 + 0x200) >> 10
        vmov            d1,  d0
        vaddw.u8        q2,  q0,  d28           @ add DC to pixels, widened
        vaddw.u8        q3,  q0,  d29
        vqmovun.s16     d28, q2                 @ saturate back to uint8
        vqmovun.s16     d29, q3
        vst1.32         {d28[0]}, [r3,:32], r1
        vst1.32         {d29[0]}, [r3,:32], r1
        vst1.32         {d28[1]}, [r3,:32], r1
        vst1.32         {d29[1]}, [r3,:32], r1
        bx              lr
endfunc
|
||||
|
||||
/* void rv34_inv_transform_dc_noround_c(int16_t *block) */
@ DC-only no-round inverse transform: fill all 16 coefficients with
@ (block[0] * 507) >> 11, where 507 = 13^2 * 3 (built as 251 | 256).
function ff_rv34_inv_transform_noround_dc_neon, export=1
        vld1.16         {d28[]}, [r0,:16]       @ block[0]
        vmov.i16        d4,  #251
        vorr.s16        d4,  #256               @ 13^2 * 3
        vmull.s16       q3,  d28, d4
        vshrn.s32       d0,  q3,  #11
        vmov.i16        d1,  d0                 @ replicate to all lanes of q0
        vst1.64         {q0}, [r0,:128]!        @ fill the whole 4x4 block
        vst1.64         {q0}, [r0,:128]!
        bx              lr
endfunc
|
||||
150
externals/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c
vendored
Executable file
150
externals/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/rv34dsp.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
|
||||
/*
 * Macro pyramid that declares the NEON RV40 quarter-pel motion
 * compensation prototypes ff_{put,avg}_rv40_qpel{16,8}_mc<x><y>_neon
 * for every x/y subpel position 0..3 (put/avg x 16/8 widths x 16
 * positions).  The definitions live in the assembly files.
 */
#define DECL_QPEL3(type, w, pos) \
    void ff_ ## type ## _rv40_qpel ## w ## _mc ## pos ## _neon(uint8_t *dst, \
                                                               const uint8_t *src, \
                                                               ptrdiff_t stride)

/* put and avg variants for one width/position */
#define DECL_QPEL2(w, pos) \
    DECL_QPEL3(put, w, pos); \
    DECL_QPEL3(avg, w, pos)

/* 16x16 and 8x8 variants for one subpel position */
#define DECL_QPEL_XY(x, y) \
    DECL_QPEL2(16, x ## y); \
    DECL_QPEL2(8, x ## y)

/* all four horizontal positions for one vertical position */
#define DECL_QPEL_Y(y) \
    DECL_QPEL_XY(0, y); \
    DECL_QPEL_XY(1, y); \
    DECL_QPEL_XY(2, y); \
    DECL_QPEL_XY(3, y); \

DECL_QPEL_Y(0);
DECL_QPEL_Y(1);
DECL_QPEL_Y(2);
DECL_QPEL_Y(3);
||||
|
||||
void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
|
||||
void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
|
||||
|
||||
void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
|
||||
void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
|
||||
|
||||
void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
|
||||
void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
|
||||
|
||||
int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
|
||||
int beta, int beta2, int edge,
|
||||
int *p1, int *q1);
|
||||
int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
|
||||
int beta, int beta2, int edge,
|
||||
int *p1, int *q1);
|
||||
|
||||
void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
|
||||
int filter_q1, int alpha, int beta,
|
||||
int lim_p0q0, int lim_q1, int lim_p1);
|
||||
void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
|
||||
int filter_q1, int alpha, int beta,
|
||||
int lim_p0q0, int lim_q1, int lim_p1);
|
||||
|
||||
/**
 * Install the NEON implementations into the RV34/RV40 DSP context.
 *
 * put/avg_pixels_tab is indexed [size][pos]: size 0 = 16x16, size 1 = 8x8;
 * pos = x + 4*y for subpel offsets x,y in 0..3.  Entries 0, 2 and 8 are
 * not set by this function (they keep whatever was installed before).
 */
static av_cold void rv40dsp_init_neon(RV34DSPContext *c)
{
    c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
    c->put_pixels_tab[0][ 3] = ff_put_rv40_qpel16_mc30_neon;
    c->put_pixels_tab[0][ 4] = ff_put_rv40_qpel16_mc01_neon;
    c->put_pixels_tab[0][ 5] = ff_put_rv40_qpel16_mc11_neon;
    c->put_pixels_tab[0][ 6] = ff_put_rv40_qpel16_mc21_neon;
    c->put_pixels_tab[0][ 7] = ff_put_rv40_qpel16_mc31_neon;
    c->put_pixels_tab[0][ 9] = ff_put_rv40_qpel16_mc12_neon;
    c->put_pixels_tab[0][10] = ff_put_rv40_qpel16_mc22_neon;
    c->put_pixels_tab[0][11] = ff_put_rv40_qpel16_mc32_neon;
    c->put_pixels_tab[0][12] = ff_put_rv40_qpel16_mc03_neon;
    c->put_pixels_tab[0][13] = ff_put_rv40_qpel16_mc13_neon;
    c->put_pixels_tab[0][14] = ff_put_rv40_qpel16_mc23_neon;
    c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_neon;
    c->avg_pixels_tab[0][ 1] = ff_avg_rv40_qpel16_mc10_neon;
    c->avg_pixels_tab[0][ 3] = ff_avg_rv40_qpel16_mc30_neon;
    c->avg_pixels_tab[0][ 4] = ff_avg_rv40_qpel16_mc01_neon;
    c->avg_pixels_tab[0][ 5] = ff_avg_rv40_qpel16_mc11_neon;
    c->avg_pixels_tab[0][ 6] = ff_avg_rv40_qpel16_mc21_neon;
    c->avg_pixels_tab[0][ 7] = ff_avg_rv40_qpel16_mc31_neon;
    c->avg_pixels_tab[0][ 9] = ff_avg_rv40_qpel16_mc12_neon;
    c->avg_pixels_tab[0][10] = ff_avg_rv40_qpel16_mc22_neon;
    c->avg_pixels_tab[0][11] = ff_avg_rv40_qpel16_mc32_neon;
    c->avg_pixels_tab[0][12] = ff_avg_rv40_qpel16_mc03_neon;
    c->avg_pixels_tab[0][13] = ff_avg_rv40_qpel16_mc13_neon;
    c->avg_pixels_tab[0][14] = ff_avg_rv40_qpel16_mc23_neon;
    c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_neon;
    c->put_pixels_tab[1][ 1] = ff_put_rv40_qpel8_mc10_neon;
    c->put_pixels_tab[1][ 3] = ff_put_rv40_qpel8_mc30_neon;
    c->put_pixels_tab[1][ 4] = ff_put_rv40_qpel8_mc01_neon;
    c->put_pixels_tab[1][ 5] = ff_put_rv40_qpel8_mc11_neon;
    c->put_pixels_tab[1][ 6] = ff_put_rv40_qpel8_mc21_neon;
    c->put_pixels_tab[1][ 7] = ff_put_rv40_qpel8_mc31_neon;
    c->put_pixels_tab[1][ 9] = ff_put_rv40_qpel8_mc12_neon;
    c->put_pixels_tab[1][10] = ff_put_rv40_qpel8_mc22_neon;
    c->put_pixels_tab[1][11] = ff_put_rv40_qpel8_mc32_neon;
    c->put_pixels_tab[1][12] = ff_put_rv40_qpel8_mc03_neon;
    c->put_pixels_tab[1][13] = ff_put_rv40_qpel8_mc13_neon;
    c->put_pixels_tab[1][14] = ff_put_rv40_qpel8_mc23_neon;
    c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_neon;
    c->avg_pixels_tab[1][ 1] = ff_avg_rv40_qpel8_mc10_neon;
    c->avg_pixels_tab[1][ 3] = ff_avg_rv40_qpel8_mc30_neon;
    c->avg_pixels_tab[1][ 4] = ff_avg_rv40_qpel8_mc01_neon;
    c->avg_pixels_tab[1][ 5] = ff_avg_rv40_qpel8_mc11_neon;
    c->avg_pixels_tab[1][ 6] = ff_avg_rv40_qpel8_mc21_neon;
    c->avg_pixels_tab[1][ 7] = ff_avg_rv40_qpel8_mc31_neon;
    c->avg_pixels_tab[1][ 9] = ff_avg_rv40_qpel8_mc12_neon;
    c->avg_pixels_tab[1][10] = ff_avg_rv40_qpel8_mc22_neon;
    c->avg_pixels_tab[1][11] = ff_avg_rv40_qpel8_mc32_neon;
    c->avg_pixels_tab[1][12] = ff_avg_rv40_qpel8_mc03_neon;
    c->avg_pixels_tab[1][13] = ff_avg_rv40_qpel8_mc13_neon;
    c->avg_pixels_tab[1][14] = ff_avg_rv40_qpel8_mc23_neon;
    c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_neon;

    /* chroma MC: [0] = 8x8, [1] = 4x4 */
    c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
    c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
    c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
    c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;

    /* weighted bi-prediction: [0][0] = 16 wide, [0][1] = 8 wide */
    c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
    c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;

    /* deblocking: [0] = horizontal edge, [1] = vertical edge */
    c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
    c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
    c->rv40_weak_loop_filter[0]     = ff_rv40_h_weak_loop_filter_neon;
    c->rv40_weak_loop_filter[1]     = ff_rv40_v_weak_loop_filter_neon;
}
|
||||
|
||||
/**
 * Entry point for ARM-specific RV40 DSP initialisation.
 * Only NEON implementations exist in this file, so nothing is
 * installed unless the runtime CPU reports NEON support.
 */
av_cold void ff_rv40dsp_init_arm(RV34DSPContext *c)
{
    if (have_neon(av_get_cpu_flags()))
        rv40dsp_init_neon(c);
}
|
||||
==> new file (920 lines): externals/ffmpeg/libavcodec/arm/rv40dsp_neon.S — vendored, executable — @@ -0,0 +1,920 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
@ One row of the RV40 6-tap quarter-pel low-pass filter.
@ \r0:\r1 hold 16 consecutive source bytes starting at src[-2];
@ \rc1/\rc2 are the two centre coefficients (replicated in a d-reg).
@ Per output pixel:
@   clip_u8((src[-2] + src[3] - 5*(src[-1] + src[2])
@            + rc1*src[0] + rc2*src[1] + rnd) >> shift)
@ The 8 result bytes replace \r0.  Clobbers d24-d28, q8, q9, q12.
.macro  qpel_lowpass    r0,  r1,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26           @ src[-1] + src[2]
        vaddl.u8        q8,  \r0, d24           @ src[-2] + src[3]
        vext.8          d27, \r0, \r1, #2       @ src[ 0]
        vshl.s16        q12, q9,  #2            @ 4*(src[-1] + src[2])
        vsub.s16        q8,  q8,  q9
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vsub.s16        q8,  q8,  q12           @ ... - 5*(src[-1]+src[2])
        vmlal.u8        q8,  d27, \rc1          @ + rc1*src[0]
        vmlal.u8        q8,  d28, \rc2          @ + rc2*src[1]
        vqrshrun.s16    \r0, q8,  #\shift       @ round, shift, saturate
.endm
|
||||
|
||||
@ Same filter as qpel_lowpass, applied to two independent rows at once:
@ \r0:\r1 is row A (result in \r0), \r2:\r3 is row B (result in \r2).
@ \r1 and the high half of row A are reused as scratch for row B.
@ Clobbers d22, d24-d29, q8-q13.
.macro  qpel_lowpass_x2 r0,  r1,  r2,  r3,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ A: src[-1]
        vext.8          d26, \r0, \r1, #4       @ A: src[ 2]
        vext.8          d24, \r0, \r1, #5       @ A: src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d29, \r0, \r1, #2       @ A: src[ 0]
        vext.8          d28, \r0, \r1, #3       @ A: src[ 1]
        vshl.s16        q10, q9,  #2
        vext.8          \r1, \r2, \r3, #1       @ B: src[-1]
        vsub.s16        q8,  q8,  q9
        vext.8          d22, \r2, \r3, #4       @ B: src[ 2]
        vext.8          \r0, \r2, \r3, #5       @ B: src[ 3]
        vaddl.u8        q13, \r1, d22
        vaddl.u8        q12, \r2, \r0
        vsub.s16        q8,  q8,  q10
        vshl.s16        q9,  q13, #2
        vsub.s16        q12, q12, q13
        vmlal.u8        q8,  d29, \rc1
        vmlal.u8        q8,  d28, \rc2
        vsub.s16        q12, q12, q9
        vext.8          d26, \r2, \r3, #2       @ B: src[ 0]
        vext.8          d27, \r2, \r3, #3       @ B: src[ 1]
        vmlal.u8        q12, d26, \rc1
        vmlal.u8        q12, d27, \rc2
        vqrshrun.s16    \r0, q8,  #\shift       @ row A result
        vqrshrun.s16    \r2, q12, #\shift       @ row B result
.endm
|
||||
|
||||
@ Instantiate put_rv40_qpel8_h_lp_packed_s<shift>_neon: horizontally
@ filter r3+1 rows of 8 pixels from r1 (row stride r2) into the packed,
@ 8-byte-per-row scratch buffer at r12 (post-incremented).  Centre
@ coefficients are expected in d0/d1.  Used as the first pass of the
@ 2-D subpel cases; the extra rows feed the later vertical pass.
.macro  rv40_qpel8_h    shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        vst1.8          {d6},     [r12,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        @ one final (odd) row
        vld1.8          {q2},     [r1]
        qpel_lowpass    d4,  d5,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        bx              lr
endfunc
.endm
|
||||
|
||||
@ Instantiate \type\()_rv40_qpel8_v_lp_packed_s<shift>_neon: vertical
@ 6-tap filter over the packed scratch buffer written by the _h_lp_packed
@ helper.  Loads 13 consecutive 8-byte rows from r1, transposes so the
@ vertical filter runs as a horizontal one, filters, transposes back and
@ stores 8 rows to r0 with stride r2 (for avg, rounding-averaged with the
@ existing destination).  d15/d30/d31 are never loaded: after the second
@ transpose they hold rows 13..15, which the 13-row filter never reads.
.macro  rv40_qpel8_v    shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
        vld1.64         {d2},     [r1,:64]!
        vld1.64         {d3},     [r1,:64]!
        vld1.64         {d4},     [r1,:64]!
        vld1.64         {d5},     [r1,:64]!
        vld1.64         {d6},     [r1,:64]!
        vld1.64         {d7},     [r1,:64]!
        vld1.64         {d8},     [r1,:64]!
        vld1.64         {d9},     [r1,:64]!
        vld1.64         {d10},    [r1,:64]!
        vld1.64         {d11},    [r1,:64]!
        vld1.64         {d12},    [r1,:64]!
        vld1.64         {d13},    [r1,:64]!
        vld1.64         {d14},    [r1,:64]!
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  \shift
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  \shift
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  \shift
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  \shift
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
.ifc \type,avg
        @ average with the 8 existing destination rows
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst pointer
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
.endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc
.endm
|
||||
|
||||
rv40_qpel8_h 5
|
||||
rv40_qpel8_h 6
|
||||
|
||||
.macro rv40_qpel type
|
||||
@ \type\()_rv40_qpel8_h_lowpass_neon: horizontal-only subpel case.
@ Filters r3 rows of 8 pixels from r1 straight into dst r0 (both with
@ stride r2); coefficients in d0/d1, shift fixed at 6.  For avg, each
@ filtered row is rounding-averaged with the existing dst row (read
@ through r12, which shadows r0).
function \type\()_rv40_qpel8_h_lowpass_neon
.ifc \type,avg
        mov             r12, r0
.endif
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  6
.ifc \type,avg
        vld1.8          {d3},     [r12,:64], r2
        vld1.8          {d16},    [r12,:64], r2
        vrhadd.u8       d4,  d4,  d3
        vrhadd.u8       d6,  d6,  d16
.endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d6},     [r0,:64], r2
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc
|
||||
|
||||
@ \type\()_rv40_qpel8_v_lowpass_neon: vertical-only subpel case.
@ Loads 13 rows of 8 pixels from r1 (stride r2, last row without
@ post-increment), transposes so the vertical filter runs horizontally,
@ filters with shift 6, transposes back and writes 8 rows to r0 with
@ stride r2 (avg variant rounding-averages with the existing dst first).
@ As in rv40_qpel8_v, d15/d30/d31 hold don't-care rows 13..15.
function \type\()_rv40_qpel8_v_lowpass_neon
        vld1.64         {d2},     [r1], r2
        vld1.64         {d3},     [r1], r2
        vld1.64         {d4},     [r1], r2
        vld1.64         {d5},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vld1.64         {d7},     [r1], r2
        vld1.64         {d8},     [r1], r2
        vld1.64         {d9},     [r1], r2
        vld1.64         {d10},    [r1], r2
        vld1.64         {d11},    [r1], r2
        vld1.64         {d12},    [r1], r2
        vld1.64         {d13},    [r1], r2
        vld1.64         {d14},    [r1]
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  6
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  6
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  6
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  6
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
.ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst pointer
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
.endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc
|
||||
|
||||
rv40_qpel8_v 5, \type
|
||||
rv40_qpel8_v 6, \type
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc10_neon, export=1
|
||||
sub r1, r1, #2
|
||||
mov r3, #8
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
b \type\()_rv40_qpel8_h_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc30_neon, export=1
|
||||
sub r1, r1, #2
|
||||
mov r3, #8
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
b \type\()_rv40_qpel8_h_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc01_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub r1, r1, r2, lsl #1
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
bl \type\()_rv40_qpel8_v_lowpass_neon
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc11_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc21_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
vmov.i8 d0, #52
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc31_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
vswp d0, d1
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc12_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
vmov.i8 d0, #20
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc22_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc32_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
vmov.i8 d1, #20
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc03_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub r1, r1, r2, lsl #1
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
bl \type\()_rv40_qpel8_v_lowpass_neon
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc33_neon, export=1
|
||||
mov r3, #8
|
||||
b X(ff_\type\()_pixels8_xy2_neon)
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc13_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
vswp d0, d1
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel8_mc23_neon, export=1
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #14*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, #12
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
vmov.i8 d1, #52
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
add sp, sp, #14*8
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc10_neon, export=1
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
.L\type\()_rv40_qpel16_h:
|
||||
push {r1, lr}
|
||||
sub r1, r1, #2
|
||||
mov r3, #16
|
||||
bl \type\()_rv40_qpel8_h_lowpass_neon
|
||||
pop {r1, lr}
|
||||
sub r0, r0, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
add r1, r1, #6
|
||||
mov r3, #16
|
||||
b \type\()_rv40_qpel8_h_lowpass_neon
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc30_neon, export=1
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
b .L\type\()_rv40_qpel16_h
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc01_neon, export=1
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
.L\type\()_rv40_qpel16_v:
|
||||
sub r1, r1, r2, lsl #1
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
bl \type\()_rv40_qpel8_v_lowpass_neon
|
||||
sub r1, r1, r2, lsl #2
|
||||
bl \type\()_rv40_qpel8_v_lowpass_neon
|
||||
ldr r1, [sp, #64]
|
||||
sub r0, r0, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
add r1, r1, #8
|
||||
bl \type\()_rv40_qpel8_v_lowpass_neon
|
||||
sub r1, r1, r2, lsl #2
|
||||
bl \type\()_rv40_qpel8_v_lowpass_neon
|
||||
vpop {d8-d15}
|
||||
pop {r1, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc11_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
.L\type\()_rv40_qpel16_v_s6:
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
sub r1, r1, #40
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
sub r0, r0, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
sub r1, r1, #40
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
|
||||
add sp, sp, #44*8
|
||||
vpop {d8-d15}
|
||||
pop {r1, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc21_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
vmov.i8 d0, #52
|
||||
b .L\type\()_rv40_qpel16_v_s6
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc31_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
vswp d0, d1
|
||||
b .L\type\()_rv40_qpel16_v_s6
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc12_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
vmov.i8 d0, #20
|
||||
.L\type\()_rv40_qpel16_v_s5:
|
||||
add r1, sp, #7
|
||||
bic r1, r1, #7
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
|
||||
sub r1, r1, #40
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
|
||||
sub r0, r0, r2, lsl #4
|
||||
add r0, r0, #8
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
|
||||
sub r1, r1, #40
|
||||
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
|
||||
add sp, sp, #44*8
|
||||
vpop {d8-d15}
|
||||
pop {r1, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc22_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
b .L\type\()_rv40_qpel16_v_s5
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc32_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
vmov.i8 d1, #20
|
||||
b .L\type\()_rv40_qpel16_v_s5
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc03_neon, export=1
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #52
|
||||
b .L\type\()_rv40_qpel16_v
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc13_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #52
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s6_neon
|
||||
vswp d0, d1
|
||||
b .L\type\()_rv40_qpel16_v_s6
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc23_neon, export=1
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
push {r1, lr}
|
||||
vpush {d8-d15}
|
||||
sub sp, sp, #44*8
|
||||
add r12, sp, #7
|
||||
bic r12, r12, #7
|
||||
mov r3, #20
|
||||
vmov.i8 d0, #20
|
||||
vmov.i8 d1, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
ldr r1, [sp, #416]
|
||||
add r1, r1, #8
|
||||
mov r3, #20
|
||||
bl put_rv40_qpel8_h_lp_packed_s5_neon
|
||||
vmov.i8 d1, #52
|
||||
b .L\type\()_rv40_qpel16_v_s6
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_rv40_qpel16_mc33_neon, export=1
|
||||
mov r3, #16
|
||||
b X(ff_\type\()_pixels16_xy2_neon)
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
rv40_qpel put
|
||||
rv40_qpel avg
|
||||
|
||||
@ Core of the RV40 weighted-prediction functions.  Inputs: 16 pixels of
@ src1 in q1 (d2/d3), 16 pixels of src2 in q2 (d4/d5); d0[0] = w1,
@ d0[2] = w2 (packed by the callers with "vmov d0, r3, r12").
@ Computes, matching the C reference:
@   dst = (((w2*src1) >> 9) + ((w1*src2) >> 9) + 0x10) >> 5
@ Result is left in q1.  Clobbers q2, q3, q8-q15.
.macro  rv40_weight
        vmovl.u8        q8,  d2
        vmovl.u8        q9,  d3
        vmovl.u8        q10, d4
        vmovl.u8        q11, d5
        vmull.u16       q2,  d16, d0[2]         @ w2 * src1
        vmull.u16       q3,  d17, d0[2]
        vmull.u16       q8,  d18, d0[2]
        vmull.u16       q9,  d19, d0[2]
        vmull.u16       q12, d20, d0[0]         @ w1 * src2
        vmull.u16       q13, d21, d0[0]
        vmull.u16       q14, d22, d0[0]
        vmull.u16       q15, d23, d0[0]
        vshrn.i32       d4,  q2,  #9            @ (w2*src1) >> 9
        vshrn.i32       d5,  q3,  #9
        vshrn.i32       d6,  q8,  #9
        vshrn.i32       d7,  q9,  #9
        vshrn.i32       d16, q12, #9            @ (w1*src2) >> 9
        vshrn.i32       d17, q13, #9
        vshrn.i32       d18, q14, #9
        vshrn.i32       d19, q15, #9
        vadd.u16        q2,  q2,  q8
        vadd.u16        q3,  q3,  q9
        vrshrn.i16      d2,  q2,  #5            @ (+16) >> 5, narrowed to u8
        vrshrn.i16      d3,  q3,  #5
.endm
|
||||
|
||||
/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1,
 *                                  uint8_t *src2, int w1, int w2,
 *                                  ptrdiff_t stride)
 * 16x16 weighted prediction; w2 comes from [sp], stride from [sp, #4].
 * Processes one 16-pixel row per iteration. */
function ff_rv40_weight_func_16_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12           @ d0[0] = w1, d0[2] = w2
        ldr             r12, [sp, #4]           @ r12 = stride
        mov             r3,  #16                @ row counter
1:
        vld1.8          {q1},     [r1,:128], r12
        vld1.8          {q2},     [r2,:128], r12
        rv40_weight
        vst1.8          {q1},     [r0,:128], r12
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1,
 *                                 uint8_t *src2, int w1, int w2,
 *                                 ptrdiff_t stride)
 * 8x8 weighted prediction; two 8-pixel rows are packed into one q-reg
 * so the same rv40_weight kernel processes two rows per iteration. */
function ff_rv40_weight_func_8_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12           @ d0[0] = w1, d0[2] = w2
        ldr             r12, [sp, #4]           @ r12 = stride
        mov             r3,  #8                 @ row counter
1:
        vld1.8          {d2},     [r1,:64], r12
        vld1.8          {d3},     [r1,:64], r12
        vld1.8          {d4},     [r2,:64], r12
        vld1.8          {d5},     [r2,:64], r12
        rv40_weight
        vst1.8          {d2},     [r0,:64], r12
        vst1.8          {d3},     [r0,:64], r12
        subs            r3,  r3,  #2
        bne             1b
        bx              lr
endfunc
|
||||
|
||||
@ int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
@                                         int beta, int beta2, int edge,
@                                         int *p1, int *q1)
@ Strength decision across a horizontal edge.  r0=src, r1=stride,
@ r2=beta, r3=beta2; edge is at [sp], p1/q1 result pointers at [sp,#4].
function ff_rv40_h_loop_filter_strength_neon
        pkhbt           r2,  r3,  r2,  lsl #18  @ pack {beta2, beta << 2} halfwords

        @ fast path: if the 4 pixels of row 0 equal row -1, both
        @ strengths are 0 and the comparisons can be skipped
        ldr             r3,  [r0]
        ldr_dpre        r12, r0,  r1
        teq             r3,  r12
        beq             1f

        sub             r0,  r0,  r1,  lsl #1   @ point at row -3

        vld1.32         {d4[]},   [r0,:32], r1  @ -3
        vld1.32         {d0[]},   [r0,:32], r1  @ -2
        vld1.32         {d4[1]},  [r0,:32], r1  @ -1
        vld1.32         {d5[]},   [r0,:32], r1  @  0
        vld1.32         {d1[]},   [r0,:32], r1  @  1
        vld1.32         {d5[0]},  [r0,:32], r1  @  2

        vpaddl.u8       q8,  q0                 @ -2, -2, -2, -2, 1, 1, 1, 1
        vpaddl.u8       q9,  q2                 @ -3, -3, -1, -1, 2, 2, 0, 0
        vdup.32         d30, r2                 @ beta2, beta << 2
        vpadd.u16       d16, d16, d17           @ -2, -2, 1, 1
        vpadd.u16       d18, d18, d19           @ -3, -1, 2, 0
        vabd.u16        d16, d18, d16           @ row-sum differences
        vclt.u16        d16, d16, d30           @ compare against thresholds

        ldrd            r2,  r3,  [sp, #4]      @ r2 = p1, r3 = q1
        vmovl.u16       q12, d16
        vtrn.16         d16, d17
        vshr.u32        q12, q12, #15           @ lane masks -> 0/1 flags
        ldr             r0,  [sp]               @ r0 = edge
        vst1.32         {d24[1]}, [r2,:32]      @ *p1
        vst1.32         {d25[1]}, [r3,:32]      @ *q1

        cmp             r0,  #0                 @ !edge: return value unused
        it              eq
        bxeq            lr

        @ combine the per-side comparisons into the return value
        vand            d18, d16, d17
        vtrn.32         d18, d19
        vand            d18, d18, d19
        vmov.u16        r0,  d18[0]
        bx              lr
1:
        ldrd            r2,  r3,  [sp, #4]
        mov             r0,  #0
        str             r0,  [r2]               @ *p1 = 0
        str             r0,  [r3]               @ *q1 = 0
        bx              lr
endfunc
|
||||
|
||||
@ int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
@                                         int beta, int beta2, int edge,
@                                         int *p1, int *q1)
@ Strength decision across a vertical edge; same contract as the
@ horizontal variant but columns -3..2 are gathered from 4 rows.
function ff_rv40_v_loop_filter_strength_neon
        sub             r0,  r0,  #3            @ point at column -3
        pkhbt           r2,  r3,  r2,  lsl #18  @ pack {beta2, beta << 2}

        vld1.8          {d0},     [r0], r1
        vld1.8          {d1},     [r0], r1
        vld1.8          {d2},     [r0], r1
        vld1.8          {d3},     [r0], r1

        vaddl.u8        q0,  d0,  d1            @ sum rows pairwise
        vaddl.u8        q1,  d2,  d3
        vdup.32         q15, r2
        vadd.u16        q0,  q0,  q1            @ column sums -3, -2, -1, 0, 1, 2
        vext.16         q1,  q0,  q0,  #1       @ shifted:     -2, -1, 0, 1, 2
        vabd.u16        q0,  q1,  q0            @ neighbour differences
        vclt.u16        q0,  q0,  q15           @ compare against thresholds

        ldrd            r2,  r3,  [sp, #4]      @ r2 = p1, r3 = q1
        vmovl.u16       q1,  d0
        vext.16         d1,  d0,  d1,  #3
        vshr.u32        q1,  q1,  #15           @ lane masks -> 0/1 flags
        ldr             r0,  [sp]               @ r0 = edge
        vst1.32         {d2[1]},  [r2,:32]      @ *p1
        vst1.32         {d3[1]},  [r3,:32]      @ *q1

        cmp             r0,  #0                 @ !edge: return value unused
        it              eq
        bxeq            lr

        @ combine the per-side comparisons into the return value
        vand            d0,  d0,  d1
        vtrn.16         d0,  d1
        vand            d0,  d0,  d1
        vmov.u16        r0,  d0[0]
        bx              lr
endfunc
|
||||
|
||||
@ Core of the RV40 "weak" deblocking filter for one 4-pixel edge.
@ Entry state (set up by the h/v wrappers): d4/d5 hold the packed pixel
@ rows {-3,-1} / {0,2} and d0/d1 hold {-2} / {1}; r2 = filter_p1,
@ r3 = filter_q1, remaining C arguments (alpha, beta, lim_p0q0, lim_q1,
@ lim_p1) are on the stack.  On exit d4 holds the filtered {-1, 0}
@ pixels and d5 the filtered {-2, 1} pixels.
.macro  rv40_weak_loop_filter
        vdup.16         d30, r2                 @ filter_p1
        vdup.16         d31, r3                 @ filter_q1
        ldrd            r2,  r3,  [sp]
        vdup.16         d28, r2                 @ alpha
        vdup.16         d29, r3                 @ beta
        ldr             r12, [sp, #8]
        vdup.16         d25, r12                @ lim_p0q0
        ldrd            r2,  r3,  [sp, #12]     @ r2 = lim_q1, r3 = lim_p1
        vsubl.u8        q9,  d5,  d4            @ x, t
        vabdl.u8        q8,  d5,  d4            @ x, abs(t)
        vneg.s16        q15, q15                @ 0/1 flags -> 0/0xffff masks
        vceq.i16        d16, d19, #0            @ !t
        vshl.s16        d19, d19, #2            @ t << 2
        vmul.u16        d18, d17, d28           @ alpha * abs(t)
        vand            d24, d30, d31           @ filter_p1 & filter_q1
        vsubl.u8        q1,  d0,  d4            @ p1p2, p1p0
        vsubl.u8        q3,  d1,  d5            @ q1q2, q1q0
        vmov.i16        d22, #3
        vshr.u16        d18, d18, #7
        vadd.i16        d22, d22, d24           @ 3 - (filter_p1 & filter_q1)
        vsubl.u8        q10, d0,  d1            @ src[-2] - src[1]
        vcle.u16        d18, d18, d22
        vand            d20, d20, d24
        vneg.s16        d23, d25                @ -lim_p0q0
        vadd.s16        d19, d19, d20
        vbic            d16, d18, d16           @ t && u <= 3 - (fp1 & fq1)
        vtrn.32         d4,  d5                 @ -3, 2, -1, 0
        vrshr.s16       d19, d19, #3
        vmov            d28, d29                @ beta
        vswp            d3,  d6                 @ q1q2, p1p0
        vmin.s16        d19, d19, d25           @ clamp diff to +/- lim_p0q0
        vand            d30, d30, d16
        vand            d31, d31, d16
        vadd.s16        q10, q1,  q3            @ p1p2 + p1p0, q1q2 + q1q0
        vmax.s16        d19, d19, d23           @ diff
        vabs.s16        q1,  q1                 @ abs(p1p2), abs(q1q2)
        vand            d18, d19, d16           @ diff, gated by filter mask
        vcle.u16        q1,  q1,  q14
        vneg.s16        d19, d18                @ -diff
        vdup.16         d26, r3                 @ lim_p1
        vaddw.u8        q2,  q9,  d5            @ src[-1]+diff, src[0]-diff
        vhsub.s16       q11, q10, q9
        vand            q1,  q1,  q15
        vqmovun.s16     d4,  q2                 @ filtered -1, 0
        vand            q9,  q11, q1
        vdup.16         d27, r2                 @ lim_q1
        vneg.s16        q9,  q9
        vneg.s16        q14, q13                @ -lim_p1, -lim_q1
        vmin.s16        q9,  q9,  q13           @ clamp outer-pixel delta
        vtrn.32         d0,  d1                 @ -2, 1, -2, 1
        vmax.s16        q9,  q9,  q14
        vaddw.u8        q3,  q9,  d0
        vqmovun.s16     d5,  q3                 @ filtered -2, 1
.endm
|
||||
|
||||
@ void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, ...)
@ Horizontal-edge wrapper: gathers rows -3..2 into the register layout
@ rv40_weak_loop_filter expects, runs the filter and writes the four
@ modified rows (-2..1) back.
function ff_rv40_h_weak_loop_filter_neon
        sub             r0,  r0,  r1,  lsl #1
        sub             r0,  r0,  r1            @ point at row -3

        vld1.32         {d4[]},   [r0,:32], r1  @ row -3
        vld1.32         {d0[]},   [r0,:32], r1  @ row -2
        vld1.32         {d4[1]},  [r0,:32], r1  @ row -1
        vld1.32         {d5[]},   [r0,:32], r1  @ row  0
        vld1.32         {d1[]},   [r0,:32], r1  @ row  1
        vld1.32         {d5[0]},  [r0,:32]      @ row  2

        sub             r0,  r0,  r1,  lsl #2   @ rewind to row -2

        rv40_weak_loop_filter

        vst1.32         {d5[0]},  [r0,:32], r1  @ row -2
        vst1.32         {d4[0]},  [r0,:32], r1  @ row -1
        vst1.32         {d4[1]},  [r0,:32], r1  @ row  0
        vst1.32         {d5[1]},  [r0,:32], r1  @ row  1

        bx              lr
endfunc
|
||||
|
||||
@ void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, ...)
@ Vertical-edge wrapper: loads 4 rows of columns -3..4, transposes them
@ into the register layout rv40_weak_loop_filter expects, filters, and
@ scatters the modified columns -2..1 back with vst4 lane stores.
function ff_rv40_v_weak_loop_filter_neon
        sub             r12, r0,  #3            @ load pointer: column -3
        sub             r0,  r0,  #2            @ store pointer: column -2

        vld1.8          {d4},     [r12], r1
        vld1.8          {d5},     [r12], r1
        vld1.8          {d2},     [r12], r1
        vld1.8          {d3},     [r12], r1

        @ transpose rows -> columns
        vtrn.16         q2,  q1
        vtrn.8          d4,  d5
        vtrn.8          d2,  d3

        vrev64.32       d5,  d5
        vtrn.32         q2,  q1
        vdup.32         d0,  d3[0]
        vdup.32         d1,  d2[0]

        rv40_weak_loop_filter

        vtrn.32         q2,  q3
        vswp            d4,  d5

        @ interleaved store writes columns -2..1 of each of the 4 rows
        vst4.8          {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
        vst4.8          {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
        vst4.8          {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
        vst4.8          {d4[3],d5[3],d6[3],d7[3]}, [r0], r1

        bx              lr
endfunc
|
||||
==> new file (245 lines): externals/ffmpeg/libavcodec/arm/sbcdsp_armv6.S — vendored, executable — @@ -0,0 +1,245 @@
|
||||
/*
|
||||
* Bluetooth low-complexity, subband codec (SBC)
|
||||
*
|
||||
* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
|
||||
* Copyright (C) 2008-2010 Nokia Corporation
|
||||
* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
|
||||
* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
|
||||
* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* SBC ARMv6 optimizations. The instructions are scheduled for ARM11 pipeline.
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ One 4-subband SBC analysis step, ARMv6 SIMD (SMLAD/SMUAD dual 16x16 MAC).
@ Computes the windowed partial sums t1[0..3] from 'in' and 'consts', then
@ multiplies them by the cosine table that follows the window coefficients
@ in 'consts' and stores four 32-bit results to 'out'.
@ NOTE(review): the exact layout of 'consts' (window coeffs then cos table
@ starting at byte offset 80) is implied by the offsets below — confirm
@ against the table generator in libavcodec/sbcdsp_data.c.
function ff_sbc_analyze_4_armv6, export=1
    @ r0 = in, r1 = out, r2 = consts
    push            {r1, r3-r7, lr}
    push            {r8-r12, r14}
    ldrd            r4,  r5,  [r0, #0]          @ in[0..3]
    ldrd            r6,  r7,  [r2, #0]          @ consts[0..3]
    ldrd            r8,  r9,  [r0, #16]
    ldrd            r10, r11, [r2, #16]
    mov             r14, #0x8000                @ rounding bias (1 << 15)
    smlad           r3,  r4,  r6,  r14
    smlad           r12, r5,  r7,  r14
    ldrd            r4,  r5,  [r0, #32]
    ldrd            r6,  r7,  [r2, #32]
    smlad           r3,  r8,  r10, r3
    smlad           r12, r9,  r11, r12
    ldrd            r8,  r9,  [r0, #48]
    ldrd            r10, r11, [r2, #48]
    smlad           r3,  r4,  r6,  r3
    smlad           r12, r5,  r7,  r12
    ldrd            r4,  r5,  [r0, #64]
    ldrd            r6,  r7,  [r2, #64]
    smlad           r3,  r8,  r10, r3
    smlad           r12, r9,  r11, r12
    ldrd            r8,  r9,  [r0, #8]
    ldrd            r10, r11, [r2, #8]
    smlad           r3,  r4,  r6,  r3           @ t1[0] is done
    smlad           r12, r5,  r7,  r12          @ t1[1] is done
    ldrd            r4,  r5,  [r0, #24]
    ldrd            r6,  r7,  [r2, #24]
    pkhtb           r3,  r12, r3, asr #16       @ combine t1[0] and t1[1]
    smlad           r12, r8,  r10, r14
    smlad           r14, r9,  r11, r14          @ r14 reused: bias -> accumulator
    ldrd            r8,  r9,  [r0, #40]
    ldrd            r10, r11, [r2, #40]
    smlad           r12, r4,  r6,  r12
    smlad           r14, r5,  r7,  r14
    ldrd            r4,  r5,  [r0, #56]
    ldrd            r6,  r7,  [r2, #56]
    smlad           r12, r8,  r10, r12
    smlad           r14, r9,  r11, r14
    ldrd            r8,  r9,  [r0, #72]
    ldrd            r10, r11, [r2, #72]
    smlad           r12, r4,  r6,  r12
    smlad           r14, r5,  r7,  r14
    ldrd            r4,  r5,  [r2, #80]         @ start loading cos table
    smlad           r12, r8,  r10, r12          @ t1[2] is done
    smlad           r14, r9,  r11, r14          @ t1[3] is done
    ldrd            r6,  r7,  [r2, #88]
    ldrd            r8,  r9,  [r2, #96]
    ldrd            r10, r11, [r2, #104]        @ cos table fully loaded
    pkhtb           r12, r14, r12, asr #16      @ combine t1[2] and t1[3]
    @ 4x4 matrix multiply of packed t1 pairs by the cos table
    smuad           r4,  r3,  r4
    smuad           r5,  r3,  r5
    smlad           r4,  r12, r8,  r4
    smlad           r5,  r12, r9,  r5
    smuad           r6,  r3,  r6
    smuad           r7,  r3,  r7
    smlad           r6,  r12, r10, r6
    smlad           r7,  r12, r11, r7
    pop             {r8-r12, r14}
    stmia           r1, {r4, r5, r6, r7}        @ out[0..3]
    pop             {r1, r3-r7, pc}
endfunc
|
||||
|
||||
@ One 8-subband SBC analysis step, ARMv6 SIMD.
@ Same scheme as ff_sbc_analyze_4_armv6 but with eight partial sums
@ t1[0..7]; t1[4:5] and t1[6:7] are spilled to the stack (two 'str ... #-4!'
@ pushes) and reloaded as t2[4:5]/t2[6:7] via 'pop {r0, r14}' before the
@ final cosine-table multiply, which emits eight 32-bit outputs.
function ff_sbc_analyze_8_armv6, export=1
    @ r0 = in, r1 = out, r2 = consts
    push            {r1, r3-r7, lr}
    push            {r8-r12, r14}
    ldrd            r4,  r5,  [r0, #24]
    ldrd            r6,  r7,  [r2, #24]
    ldrd            r8,  r9,  [r0, #56]
    ldrd            r10, r11, [r2, #56]
    mov             r14, #0x8000                @ rounding bias (1 << 15)
    smlad           r3,  r4,  r6,  r14
    smlad           r12, r5,  r7,  r14
    ldrd            r4,  r5,  [r0, #88]
    ldrd            r6,  r7,  [r2, #88]
    smlad           r3,  r8,  r10, r3
    smlad           r12, r9,  r11, r12
    ldrd            r8,  r9,  [r0, #120]
    ldrd            r10, r11, [r2, #120]
    smlad           r3,  r4,  r6,  r3
    smlad           r12, r5,  r7,  r12
    ldrd            r4,  r5,  [r0, #152]
    ldrd            r6,  r7,  [r2, #152]
    smlad           r3,  r8,  r10, r3
    smlad           r12, r9,  r11, r12
    ldrd            r8,  r9,  [r0, #16]
    ldrd            r10, r11, [r2, #16]
    smlad           r3,  r4,  r6,  r3           @ t1[6] is done
    smlad           r12, r5,  r7,  r12          @ t1[7] is done
    ldrd            r4,  r5,  [r0, #48]
    ldrd            r6,  r7,  [r2, #48]
    pkhtb           r3,  r12, r3, asr #16       @ combine t1[6] and t1[7]
    str             r3, [sp, #-4]!              @ save to stack
    smlad           r3,  r8,  r10, r14
    smlad           r12, r9,  r11, r14
    ldrd            r8,  r9,  [r0, #80]
    ldrd            r10, r11, [r2, #80]
    smlad           r3,  r4,  r6,  r3
    smlad           r12, r5,  r7,  r12
    ldrd            r4,  r5,  [r0, #112]
    ldrd            r6,  r7,  [r2, #112]
    smlad           r3,  r8,  r10, r3
    smlad           r12, r9,  r11, r12
    ldrd            r8,  r9,  [r0, #144]
    ldrd            r10, r11, [r2, #144]
    smlad           r3,  r4,  r6,  r3
    smlad           r12, r5,  r7,  r12
    ldrd            r4,  r5,  [r0, #0]
    ldrd            r6,  r7,  [r2, #0]
    smlad           r3,  r8,  r10, r3           @ t1[4] is done
    smlad           r12, r9,  r11, r12          @ t1[5] is done
    ldrd            r8,  r9,  [r0, #32]
    ldrd            r10, r11, [r2, #32]
    pkhtb           r3,  r12, r3, asr #16       @ combine t1[4] and t1[5]
    str             r3, [sp, #-4]!              @ save to stack
    smlad           r3,  r4,  r6,  r14
    smlad           r12, r5,  r7,  r14
    ldrd            r4,  r5,  [r0, #64]
    ldrd            r6,  r7,  [r2, #64]
    smlad           r3,  r8,  r10, r3
    smlad           r12, r9,  r11, r12
    ldrd            r8,  r9,  [r0, #96]
    ldrd            r10, r11, [r2, #96]
    smlad           r3,  r4,  r6,  r3
    smlad           r12, r5,  r7,  r12
    ldrd            r4,  r5,  [r0, #128]
    ldrd            r6,  r7,  [r2, #128]
    smlad           r3,  r8,  r10, r3
    smlad           r12, r9,  r11, r12
    ldrd            r8,  r9,  [r0, #8]
    ldrd            r10, r11, [r2, #8]
    smlad           r3,  r4,  r6,  r3           @ t1[0] is done
    smlad           r12, r5,  r7,  r12          @ t1[1] is done
    ldrd            r4,  r5,  [r0, #40]
    ldrd            r6,  r7,  [r2, #40]
    pkhtb           r3,  r12, r3, asr #16       @ combine t1[0] and t1[1]
    smlad           r12, r8,  r10, r14
    smlad           r14, r9,  r11, r14          @ r14 reused: bias -> accumulator
    ldrd            r8,  r9,  [r0, #72]
    ldrd            r10, r11, [r2, #72]
    smlad           r12, r4,  r6,  r12
    smlad           r14, r5,  r7,  r14
    ldrd            r4,  r5,  [r0, #104]
    ldrd            r6,  r7,  [r2, #104]
    smlad           r12, r8,  r10, r12
    smlad           r14, r9,  r11, r14
    ldrd            r8,  r9,  [r0, #136]
    ldrd            r10, r11, [r2, #136]!       @ writeback: r2 += 136 for the cos table below
    smlad           r12, r4,  r6,  r12
    smlad           r14, r5,  r7,  r14
    ldrd            r4,  r5,  [r2, #(160 - 136 + 0)]
    smlad           r12, r8,  r10, r12          @ t1[2] is done
    smlad           r14, r9,  r11, r14          @ t1[3] is done
    ldrd            r6,  r7,  [r2, #(160 - 136 + 8)]
    smuad           r4,  r3,  r4
    smuad           r5,  r3,  r5
    pkhtb           r12, r14, r12, asr #16      @ combine t1[2] and t1[3]
    @ r3  = t2[0:1]
    @ r12 = t2[2:3]
    pop             {r0, r14}                   @ t2[4:5], t2[6:7]
    ldrd            r8,  r9,  [r2, #(160 - 136 + 32)]
    smuad           r6,  r3,  r6
    smuad           r7,  r3,  r7
    ldrd            r10, r11, [r2, #(160 - 136 + 40)]
    smlad           r4,  r12, r8,  r4
    smlad           r5,  r12, r9,  r5
    ldrd            r8,  r9,  [r2, #(160 - 136 + 64)]
    smlad           r6,  r12, r10, r6
    smlad           r7,  r12, r11, r7
    ldrd            r10, r11, [r2, #(160 - 136 + 72)]
    smlad           r4,  r0,  r8,  r4
    smlad           r5,  r0,  r9,  r5
    ldrd            r8,  r9,  [r2, #(160 - 136 + 96)]
    smlad           r6,  r0,  r10, r6
    smlad           r7,  r0,  r11, r7
    ldrd            r10, r11, [r2, #(160 - 136 + 104)]
    smlad           r4,  r14, r8,  r4
    smlad           r5,  r14, r9,  r5
    ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 0)]
    smlad           r6,  r14, r10, r6
    smlad           r7,  r14, r11, r7
    ldrd            r10, r11, [r2, #(160 - 136 + 16 + 8)]
    stmia           r1!, {r4, r5}               @ out[0..1]
    smuad           r4,  r3,  r8
    smuad           r5,  r3,  r9
    ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 32)]
    stmia           r1!, {r6, r7}               @ out[2..3]
    smuad           r6,  r3,  r10
    smuad           r7,  r3,  r11
    ldrd            r10, r11, [r2, #(160 - 136 + 16 + 40)]
    smlad           r4,  r12, r8,  r4
    smlad           r5,  r12, r9,  r5
    ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 64)]
    smlad           r6,  r12, r10, r6
    smlad           r7,  r12, r11, r7
    ldrd            r10, r11, [r2, #(160 - 136 + 16 + 72)]
    smlad           r4,  r0,  r8,  r4
    smlad           r5,  r0,  r9,  r5
    ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 96)]
    smlad           r6,  r0,  r10, r6
    smlad           r7,  r0,  r11, r7
    ldrd            r10, r11, [r2, #(160 - 136 + 16 + 104)]
    smlad           r4,  r14, r8,  r4
    smlad           r5,  r14, r9,  r5
    smlad           r6,  r14, r10, r6
    smlad           r7,  r14, r11, r7
    pop             {r8-r12, r14}
    stmia           r1!, {r4, r5, r6, r7}       @ out[4..7]
    pop             {r1, r3-r7, pc}
endfunc
|
||||
105
externals/ffmpeg/libavcodec/arm/sbcdsp_init_arm.c
vendored
Executable file
105
externals/ffmpeg/libavcodec/arm/sbcdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Bluetooth low-complexity, subband codec (SBC)
|
||||
*
|
||||
* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
|
||||
* Copyright (C) 2008-2010 Nokia Corporation
|
||||
* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
|
||||
* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
|
||||
* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* SBC ARMv6 optimization for some basic "building bricks"
|
||||
*/
|
||||
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/sbcdsp.h"
|
||||
|
||||
/* Prototypes for the hand-written assembly implementations in
 * sbcdsp_armv6.S and sbcdsp_neon.S; signatures must match the
 * corresponding SBCDSPContext function-pointer fields in sbcdsp.h. */
void ff_sbc_analyze_4_armv6(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_armv6(const int16_t *in, int32_t *out, const int16_t *consts);

void ff_sbc_analyze_4_neon(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_neon(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_calc_scalefactors_neon(int32_t sb_sample_f[16][2][8],
                                   uint32_t scale_factor[2][8],
                                   int blocks, int channels, int subbands);
/* Returns the 'joint' subband bitmask computed while updating samples. */
int ff_sbc_calc_scalefactors_j_neon(int32_t sb_sample_f[16][2][8],
                                    uint32_t scale_factor[2][8],
                                    int blocks, int subbands);
/* Return value is the new ring-buffer position within X. */
int ff_sbc_enc_process_input_4s_neon(int position, const uint8_t *pcm,
                                     int16_t X[2][SBC_X_BUFFER_SIZE],
                                     int nsamples, int nchannels);
int ff_sbc_enc_process_input_8s_neon(int position, const uint8_t *pcm,
                                     int16_t X[2][SBC_X_BUFFER_SIZE],
                                     int nsamples, int nchannels);
|
||||
|
||||
/* Per-subband bit weights used by the NEON joint-stereo decision
 * (loaded by ff_sbc_calc_scalefactors_j_neon via movrelx). */
DECLARE_ALIGNED(SBC_ALIGN, int32_t, ff_sbcdsp_joint_bits_mask)[8] = {
    8, 4, 2, 1, 128, 64, 32, 16
};

/* Build one 8-entry VTBL byte-permutation row selecting 16-bit samples
 * a, b, c, d; the byte order within each sample is swapped on
 * big-endian hosts so loads come out host-endian. */
#if HAVE_BIGENDIAN
#define PERM(a, b, c, d) { \
        (a * 2) + 1, (a * 2) + 0, \
        (b * 2) + 1, (b * 2) + 0, \
        (c * 2) + 1, (c * 2) + 0, \
        (d * 2) + 1, (d * 2) + 0 \
    }
#else
#define PERM(a, b, c, d) { \
        (a * 2) + 0, (a * 2) + 1, \
        (b * 2) + 0, (b * 2) + 1, \
        (c * 2) + 0, (c * 2) + 1, \
        (d * 2) + 0, (d * 2) + 1 \
    }
#endif

/* Input reordering tables consumed by ff_sbc_enc_process_input_{4s,8s}_neon
 * (VTBL indices); the orderings match what the analysis filters expect. */
DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_4)[2][8] = {
    PERM(7, 3, 6, 4),
    PERM(0, 2, 1, 5)
};

DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_8)[4][8] = {
    PERM(15, 7, 14, 8),
    PERM(13, 9, 12, 10),
    PERM(11, 3, 6, 0),
    PERM( 5, 1, 4, 2)
};
|
||||
|
||||
/**
 * Install ARM-optimized SBC DSP routines into the context.
 *
 * CPU features are queried once; the NEON block runs after the ARMv6
 * block, so on cores with both feature sets the NEON analyze hooks
 * overwrite the ARMv6 ones. The NEON input-reordering routines are
 * only usable when s->increment != 1 (the condition the original code
 * checks before installing them).
 */
av_cold void ff_sbcdsp_init_arm(SBCDSPContext *s)
{
    int flags = av_get_cpu_flags();

    if (have_armv6(flags)) {
        s->sbc_analyze_4 = ff_sbc_analyze_4_armv6;
        s->sbc_analyze_8 = ff_sbc_analyze_8_armv6;
    }

    if (have_neon(flags)) {
        s->sbc_calc_scalefactors   = ff_sbc_calc_scalefactors_neon;
        s->sbc_calc_scalefactors_j = ff_sbc_calc_scalefactors_j_neon;
        s->sbc_analyze_4           = ff_sbc_analyze_4_neon;
        s->sbc_analyze_8           = ff_sbc_analyze_8_neon;

        if (s->increment != 1) {
            s->sbc_enc_process_input_4s = ff_sbc_enc_process_input_4s_neon;
            s->sbc_enc_process_input_8s = ff_sbc_enc_process_input_8s_neon;
        }
    }
}
|
||||
714
externals/ffmpeg/libavcodec/arm/sbcdsp_neon.S
vendored
Executable file
714
externals/ffmpeg/libavcodec/arm/sbcdsp_neon.S
vendored
Executable file
@@ -0,0 +1,714 @@
|
||||
/*
|
||||
* Bluetooth low-complexity, subband codec (SBC)
|
||||
*
|
||||
* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
|
||||
* Copyright (C) 2008-2010 Nokia Corporation
|
||||
* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
|
||||
* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
|
||||
* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* SBC ARM NEON optimizations
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
#define SBC_PROTO_FIXED_SCALE 16
|
||||
|
||||
@ One 4-subband SBC analysis step, NEON.
@ Stage 1: five 8-sample MAC passes over 'in' (r0) against the window
@ coefficients in 'consts' (r2), narrowed with rounding by
@ SBC_PROTO_FIXED_SCALE. Stage 2: multiply the packed partial sums by
@ the cosine table that follows in 'consts' and store 4x int32 to 'out' (r1).
function ff_sbc_analyze_4_neon, export=1
    /* TODO: merge even and odd cases (or even merge all four calls to this
     * function) in order to have only aligned reads from 'in' array
     * and reduce number of load instructions */
    vld1.16         {d4, d5}, [r0, :64]!
    vld1.16         {d8, d9}, [r2, :128]!

    vmull.s16       q0, d4, d8
    vld1.16         {d6, d7}, [r0, :64]!
    vmull.s16       q1, d5, d9
    vld1.16         {d10, d11}, [r2, :128]!

    vmlal.s16       q0, d6, d10
    vld1.16         {d4, d5}, [r0, :64]!
    vmlal.s16       q1, d7, d11
    vld1.16         {d8, d9}, [r2, :128]!

    vmlal.s16       q0, d4, d8
    vld1.16         {d6, d7}, [r0, :64]!
    vmlal.s16       q1, d5, d9
    vld1.16         {d10, d11}, [r2, :128]!

    vmlal.s16       q0, d6, d10
    vld1.16         {d4, d5}, [r0, :64]!
    vmlal.s16       q1, d7, d11
    vld1.16         {d8, d9}, [r2, :128]!

    vmlal.s16       q0, d4, d8
    vmlal.s16       q1, d5, d9

    @ horizontal pairwise reduction of the four accumulators
    vpadd.s32       d0, d0, d1
    vpadd.s32       d1, d2, d3

    vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE

    @ cos table (remaining part of 'consts')
    vld1.16         {d2, d3, d4, d5}, [r2, :128]!

    vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
    vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

    vmull.s16       q3, d2, d0
    vmull.s16       q4, d3, d0
    vmlal.s16       q3, d4, d1
    vmlal.s16       q4, d5, d1

    vpadd.s32       d0, d6, d7 /* TODO: can be eliminated */
    vpadd.s32       d1, d8, d9 /* TODO: can be eliminated */

    vst1.32         {d0, d1}, [r1, :128]

    bx              lr
endfunc
|
||||
|
||||
@ One 8-subband SBC analysis step, NEON.
@ Same two-stage structure as ff_sbc_analyze_4_neon, widened to 16-sample
@ rows: five MAC passes accumulate into q6-q9, results are reduced,
@ rounded/narrowed by SBC_PROTO_FIXED_SCALE, then multiplied by the
@ cosine table and stored as 8x int32 to 'out' (r1).
function ff_sbc_analyze_8_neon, export=1
    /* TODO: merge even and odd cases (or even merge all four calls to this
     * function) in order to have only aligned reads from 'in' array
     * and reduce number of load instructions */
    vld1.16         {d4, d5}, [r0, :64]!
    vld1.16         {d8, d9}, [r2, :128]!

    vmull.s16       q6, d4, d8
    vld1.16         {d6, d7}, [r0, :64]!
    vmull.s16       q7, d5, d9
    vld1.16         {d10, d11}, [r2, :128]!
    vmull.s16       q8, d6, d10
    vld1.16         {d4, d5}, [r0, :64]!
    vmull.s16       q9, d7, d11
    vld1.16         {d8, d9}, [r2, :128]!

    vmlal.s16       q6, d4, d8
    vld1.16         {d6, d7}, [r0, :64]!
    vmlal.s16       q7, d5, d9
    vld1.16         {d10, d11}, [r2, :128]!
    vmlal.s16       q8, d6, d10
    vld1.16         {d4, d5}, [r0, :64]!
    vmlal.s16       q9, d7, d11
    vld1.16         {d8, d9}, [r2, :128]!

    vmlal.s16       q6, d4, d8
    vld1.16         {d6, d7}, [r0, :64]!
    vmlal.s16       q7, d5, d9
    vld1.16         {d10, d11}, [r2, :128]!
    vmlal.s16       q8, d6, d10
    vld1.16         {d4, d5}, [r0, :64]!
    vmlal.s16       q9, d7, d11
    vld1.16         {d8, d9}, [r2, :128]!

    vmlal.s16       q6, d4, d8
    vld1.16         {d6, d7}, [r0, :64]!
    vmlal.s16       q7, d5, d9
    vld1.16         {d10, d11}, [r2, :128]!
    vmlal.s16       q8, d6, d10
    vld1.16         {d4, d5}, [r0, :64]!
    vmlal.s16       q9, d7, d11
    vld1.16         {d8, d9}, [r2, :128]!

    vmlal.s16       q6, d4, d8
    vld1.16         {d6, d7}, [r0, :64]!
    vmlal.s16       q7, d5, d9
    vld1.16         {d10, d11}, [r2, :128]!

    vmlal.s16       q8, d6, d10
    vmlal.s16       q9, d7, d11

    @ reduce the eight accumulators pairwise into q0/q1
    vpadd.s32       d0, d12, d13
    vpadd.s32       d1, d14, d15
    vpadd.s32       d2, d16, d17
    vpadd.s32       d3, d18, d19

    vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
    vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
    vmovn.s32       d0, q0
    vmovn.s32       d1, q1

    vdup.i32        d3, d1[1]  /* TODO: can be eliminated */
    vdup.i32        d2, d1[0]  /* TODO: can be eliminated */
    vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
    vdup.i32        d0, d0[0]  /* TODO: can be eliminated */

    @ multiply packed partial sums by the cos table (rest of 'consts')
    vld1.16         {d4, d5}, [r2, :128]!
    vmull.s16       q6, d4, d0
    vld1.16         {d6, d7}, [r2, :128]!
    vmull.s16       q7, d5, d0
    vmull.s16       q8, d6, d0
    vmull.s16       q9, d7, d0

    vld1.16         {d4, d5}, [r2, :128]!
    vmlal.s16       q6, d4, d1
    vld1.16         {d6, d7}, [r2, :128]!
    vmlal.s16       q7, d5, d1
    vmlal.s16       q8, d6, d1
    vmlal.s16       q9, d7, d1

    vld1.16         {d4, d5}, [r2, :128]!
    vmlal.s16       q6, d4, d2
    vld1.16         {d6, d7}, [r2, :128]!
    vmlal.s16       q7, d5, d2
    vmlal.s16       q8, d6, d2
    vmlal.s16       q9, d7, d2

    vld1.16         {d4, d5}, [r2, :128]!
    vmlal.s16       q6, d4, d3
    vld1.16         {d6, d7}, [r2, :128]!
    vmlal.s16       q7, d5, d3
    vmlal.s16       q8, d6, d3
    vmlal.s16       q9, d7, d3

    vpadd.s32       d0, d12, d13 /* TODO: can be eliminated */
    vpadd.s32       d1, d14, d15 /* TODO: can be eliminated */
    vpadd.s32       d2, d16, d17 /* TODO: can be eliminated */
    vpadd.s32       d3, d18, d19 /* TODO: can be eliminated */

    vst1.32         {d0, d1, d2, d3}, [r1, :128]

    bx              lr
endfunc
|
||||
|
||||
@ Compute SBC scale factors: for each channel and each group of 4 subbands,
@ take the max |sample| over all blocks and derive the scale factor as
@ (31 - SCALE_OUT_BITS) - clz(max - 1), writing 4 factors per iteration.
@ NOTE(review): the inner loop consumes 4 blocks per pass ('subs r12, #4'),
@ so it assumes 'blocks' is a multiple of 4 — matches the C caller's usage,
@ confirm against sbcdsp.c before reusing elsewhere.
function ff_sbc_calc_scalefactors_neon, export=1
    @ parameters
    @ r0 = sb_sample_f
    @ r1 = scale_factor
    @ r2 = blocks
    @ r3 = channels
    @ r4 = subbands
    @ local variables
    @ r5 = in_loop_1
    @ r6 = in
    @ r7 = out_loop_1
    @ r8 = out
    @ r9 = ch
    @ r10 = sb
    @ r11 = inc
    @ r12 = blk

    push            {r1-r2, r4-r12}
    ldr             r4, [sp, #44]           @ 5th arg (subbands) from caller stack
    mov             r11, #64                @ row stride: 2 ch * 8 sb * 4 bytes

    mov             r9, #0
1:                                          @ per-channel loop
    add             r5, r0, r9, lsl#5
    add             r7, r1, r9, lsl#5

    mov             r10, #0
2:                                          @ per-4-subbands loop
    add             r6, r5, r10, lsl#2
    add             r8, r7, r10, lsl#2
    mov             r12, r2

    vmov.s32        q0, #0
    vmov.s32        q1, #0x8000             @ 1 << SCALE_OUT_BITS
    vmov.s32        q14, #1
    vmov.s32        q15, #16                @ 31 - SCALE_OUT_BITS
    vadd.s32        q1, q1, q14             @ floor value (1 << SCALE_OUT_BITS) + 1
3:                                          @ per-block loop, 4 blocks/iteration
    vld1.32         {d16, d17}, [r6, :128], r11
    vabs.s32        q8, q8
    vld1.32         {d18, d19}, [r6, :128], r11
    vabs.s32        q9, q9
    vld1.32         {d20, d21}, [r6, :128], r11
    vabs.s32        q10, q10
    vld1.32         {d22, d23}, [r6, :128], r11
    vabs.s32        q11, q11
    vmax.s32        q0, q0, q8
    vmax.s32        q1, q1, q9
    vmax.s32        q0, q0, q10
    vmax.s32        q1, q1, q11
    subs            r12, r12, #4
    bgt             3b
    vmax.s32        q0, q0, q1
    vsub.s32        q0, q0, q14
    vclz.s32        q0, q0
    vsub.s32        q0, q15, q0             @ scale factor = 16 - clz(max - 1)
    vst1.32         {d0, d1}, [r8, :128]

    add             r10, r10, #4
    cmp             r10, r4
    blt             2b

    add             r9, r9, #1
    cmp             r9, r3
    blt             1b

    pop             {r1-r2, r4-r12}
    bx              lr
endfunc
|
||||
|
||||
/*
 * Shared scale-factor pass for the joint-stereo function below: computes
 * per-subband maxima both for the plain L/R samples and for the candidate
 * mid/side (half-sum/half-difference) samples in one sweep over all blocks.
 *
 * constants: q13 = (31 - SCALE_OUT_BITS)
 *            q14 = 1
 * input:     q0  - ((1 << SCALE_OUT_BITS) + 1)
 *            r5  - samples for channel 0
 *            r6  - samples for channel 1
 * output:    q0, q1 - scale factors without joint stereo
 *            q2, q3 - scale factors with joint stereo
 *            q15 - joint stereo selection mask
 */
.macro calc_scalefactors
    vmov.s32        q1, q0
    vmov.s32        q2, q0
    vmov.s32        q3, q0
    mov             r3, r2                  @ blk = blocks
1:
    vld1.32         {d18, d19}, [r6, :128], r11
    vbic.s32        q11, q9, q14            @ clear LSB so mid/side round-trips
    vld1.32         {d16, d17}, [r5, :128], r11
    vhadd.s32       q10, q8, q11            @ mid  = (L + R') / 2
    vhsub.s32       q11, q8, q11            @ side = (L - R') / 2
    vabs.s32        q8, q8
    vabs.s32        q9, q9
    vabs.s32        q10, q10
    vabs.s32        q11, q11
    vmax.s32        q0, q0, q8
    vmax.s32        q1, q1, q9
    vmax.s32        q2, q2, q10
    vmax.s32        q3, q3, q11
    subs            r3, r3, #1
    bgt             1b
    @ factor = (31 - SCALE_OUT_BITS) - clz(max - 1), for all four sets
    vsub.s32        q0, q0, q14
    vsub.s32        q1, q1, q14
    vsub.s32        q2, q2, q14
    vsub.s32        q3, q3, q14
    vclz.s32        q0, q0
    vclz.s32        q1, q1
    vclz.s32        q2, q2
    vclz.s32        q3, q3
    vsub.s32        q0, q13, q0
    vsub.s32        q1, q13, q1
    vsub.s32        q2, q13, q2
    vsub.s32        q3, q13, q3
.endm
|
||||
|
||||
/*
 * Rewrite the sample buffers in place, replacing L/R with mid/side in the
 * subbands selected by the q15 mask (VBIF keeps the original value where
 * the mask is clear). The loop is software-pipelined: it works on the two
 * rows loaded at the end of calc_scalefactors' sweep and walks backwards,
 * processing two blocks per iteration plus a prologue/epilogue pair.
 *
 * constants: q14 = 1
 * input:  q15 - joint stereo selection mask
 *         r5  - value set by calc_scalefactors macro
 *         r6  - value set by calc_scalefactors macro
 */
.macro update_joint_stereo_samples
    sub             r8, r6, r11
    sub             r7, r5, r11
    sub             r6, r6, r11, asl #1
    sub             r5, r5, r11, asl #1
    vld1.32         {d18, d19}, [r6, :128]
    vbic.s32        q11, q9, q14
    vld1.32         {d16, d17}, [r5, :128]
    vld1.32         {d2, d3}, [r8, :128]
    vbic.s32        q3, q1, q14
    vld1.32         {d0, d1}, [r7, :128]
    vhsub.s32       q10, q8, q11
    vhadd.s32       q11, q8, q11
    vhsub.s32       q2, q0, q3
    vhadd.s32       q3, q0, q3
    vbif.s32        q10, q9, q15
    vbif.s32        d22, d16, d30
    sub             r11, r10, r11, asl #1   @ negative stride: walk buffers backwards
    sub             r3, r2, #2              @ two blocks handled outside the loop
2:
    vbif.s32        d23, d17, d31
    vst1.32         {d20, d21}, [r6, :128], r11
    vbif.s32        d4, d2, d30
    vld1.32         {d18, d19}, [r6, :128]
    vbif.s32        d5, d3, d31
    vst1.32         {d22, d23}, [r5, :128], r11
    vbif.s32        d6, d0, d30
    vld1.32         {d16, d17}, [r5, :128]
    vbif.s32        d7, d1, d31
    vst1.32         {d4, d5}, [r8, :128], r11
    vbic.s32        q11, q9, q14
    vld1.32         {d2, d3}, [r8, :128]
    vst1.32         {d6, d7}, [r7, :128], r11
    vbic.s32        q3, q1, q14
    vld1.32         {d0, d1}, [r7, :128]
    vhsub.s32       q10, q8, q11
    vhadd.s32       q11, q8, q11
    vhsub.s32       q2, q0, q3
    vhadd.s32       q3, q0, q3
    vbif.s32        q10, q9, q15
    vbif.s32        d22, d16, d30
    subs            r3, r3, #2
    bgt             2b
    sub             r11, r10, r11, asr #1   @ restore positive stride
    @ epilogue: flush the last pipelined pair
    vbif.s32        d23, d17, d31
    vst1.32         {d20, d21}, [r6, :128]
    vbif.s32        q2, q1, q15
    vst1.32         {d22, d23}, [r5, :128]
    vbif.s32        q3, q0, q15
    vst1.32         {d4, d5}, [r8, :128]
    vst1.32         {d6, d7}, [r7, :128]
.endm
|
||||
|
||||
@ Joint-stereo scale-factor computation: for each group of 4 subbands,
@ decide per subband whether mid/side coding costs fewer bits (sum of the
@ two scale factors), build the 'joint' bitmask from
@ ff_sbcdsp_joint_bits_mask, store the winning scale factors, and rewrite
@ the samples via update_joint_stereo_samples. The last subband is always
@ forced to plain stereo (d31[1] cleared). Returns the joint mask in r0.
@ The 8-subband case runs the 4-subband machinery twice (high half first,
@ partial mask parked in d24).
function ff_sbc_calc_scalefactors_j_neon, export=1
    @ parameters
    @ r0 = in = sb_sample_f
    @ r1 = out = scale_factor
    @ r2 = blocks
    @ r3 = subbands
    @ local variables
    @ r4 = consts = ff_sbcdsp_joint_bits_mask
    @ r5 = in0
    @ r6 = in1
    @ r7 = out0
    @ r8 = out1
    @ r10 = zero
    @ r11 = inc
    @ return r0 = joint

    push            {r3-r11}
    movrelx         r4, X(ff_sbcdsp_joint_bits_mask)
    mov             r10, #0
    mov             r11, #64

    vmov.s32        q14, #1
    vmov.s32        q13, #16                @ 31 - SCALE_OUT_BITS

    cmp             r3, #4
    bne             8f

4:  @ 4 subbands
    add             r5, r0, #0
    add             r6, r0, #32
    add             r7, r1, #0
    add             r8, r1, #32
    vmov.s32        q0, #0x8000             @ 1 << SCALE_OUT_BITS
    vadd.s32        q0, q0, q14

    calc_scalefactors

    @ check whether to use joint stereo for subbands 0, 1, 2
    vadd.s32        q15, q0, q1
    vadd.s32        q9, q2, q3
    vmov.s32        d31[1], r10             @ last subband -> no joint
    vld1.32         {d16, d17}, [r4, :128]!
    vcgt.s32        q15, q15, q9

    @ calculate and save to memory 'joint' variable
    @ update and save scale factors to memory
    vand.s32        q8, q8, q15
    vbit.s32        q0, q2, q15
    vpadd.s32       d16, d16, d17
    vbit.s32        q1, q3, q15
    vpadd.s32       d16, d16, d16
    vst1.32         {d0, d1}, [r7, :128]
    vst1.32         {d2, d3}, [r8, :128]
    vmov.32         r0, d16[0]

    update_joint_stereo_samples
    b               9f

8:  @ 8 subbands
    add             r5, r0, #16
    add             r6, r0, #48
    add             r7, r1, #16
    add             r8, r1, #48
    vmov.s32        q0, #0x8000             @ 1 << SCALE_OUT_BITS
    vadd.s32        q0, q0, q14

    calc_scalefactors

    @ check whether to use joint stereo for subbands 4, 5, 6
    vadd.s32        q15, q0, q1
    vadd.s32        q9, q2, q3
    vmov.s32        d31[1], r10             @ last subband -> no joint
    vld1.32         {d16, d17}, [r4, :128]!
    vcgt.s32        q15, q15, q9

    @ calculate part of 'joint' variable and save it to d24
    @ update and save scale factors to memory
    vand.s32        q8, q8, q15
    vbit.s32        q0, q2, q15
    vpadd.s32       d16, d16, d17
    vbit.s32        q1, q3, q15
    vst1.32         {d0, d1}, [r7, :128]
    vst1.32         {d2, d3}, [r8, :128]
    vpadd.s32       d24, d16, d16

    update_joint_stereo_samples

    add             r5, r0, #0
    add             r6, r0, #32
    add             r7, r1, #0
    add             r8, r1, #32
    vmov.s32        q0, #0x8000             @ 1 << SCALE_OUT_BITS
    vadd.s32        q0, q0, q14

    calc_scalefactors

    @ check whether to use joint stereo for subbands 0, 1, 2, 3
    vadd.s32        q15, q0, q1
    vadd.s32        q9, q2, q3
    vld1.32         {d16, d17}, [r4, :128]!
    vcgt.s32        q15, q15, q9

    @ combine last part of 'joint' with d24 and save to memory
    @ update and save scale factors to memory
    vand.s32        q8, q8, q15
    vbit.s32        q0, q2, q15
    vpadd.s32       d16, d16, d17
    vbit.s32        q1, q3, q15
    vpadd.s32       d16, d16, d16
    vst1.32         {d0, d1}, [r7, :128]
    vadd.s32        d16, d16, d24
    vst1.32         {d2, d3}, [r8, :128]
    vmov.32         r0, d16[0]

    update_joint_stereo_samples
9:
    pop             {r3-r11}
    bx              lr
endfunc
|
||||
|
||||
@ Deinterleave and reorder incoming PCM into the encoder's X ring buffer
@ (4-subband layout), using the ff_sbc_input_perm_4 VTBL table. When the
@ write position would underflow, the newest 40 samples per channel are
@ first copied to the top of the buffer (wraparound handling). Three code
@ paths: stereo with odd 'pcm' address (byte loads + VUZP), stereo with
@ even address (VLD2 deinterleave), and mono. Returns the new position in
@ r0 (decremented by 8 per iteration inside the loops).
function ff_sbc_enc_process_input_4s_neon, export=1
    @ parameters
    @ r0 = position
    @ r1 = pcm
    @ r2 = X
    @ r3 = nsamples
    @ r4 = nchannels
    @ local variables
    @ r5 = ff_sbc_input_perm_4
    @ r6 = src / x
    @ r7 = dst / y

    push            {r1, r3-r7}
    ldr             r4, [sp, #24]           @ 5th arg (nchannels) from caller stack
    movrelx         r5, X(ff_sbc_input_perm_4)

    @ handle X buffer wraparound
    cmp             r0, r3
    bge             1f                      @ if (position < nsamples)
    add             r7, r2, #576            @ &X[0][SBC_X_BUFFER_SIZE - 40]
    add             r6, r2, r0, lsl#1       @ &X[0][position]
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0}, [r6, :64]!
    vst1.16         {d0}, [r7, :64]!
    cmp             r4, #1
    ble             2f                      @ if (nchannels > 1)
    add             r7, r2, #1232           @ &X[1][SBC_X_BUFFER_SIZE - 40]
    add             r6, r2, #656
    add             r6, r6, r0, lsl#1       @ &X[1][position]
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0}, [r6, :64]!
    vst1.16         {d0}, [r7, :64]!
2:
    mov             r0, #288                @ SBC_X_BUFFER_SIZE - 40
1:

    add             r6, r2, r0, lsl#1       @ &X[0][position]
    add             r7, r6, #656            @ &X[1][position]

    cmp             r4, #1
    ble             8f                      @ if (nchannels > 1)
    tst             r1, #1
    beq             7f                      @ if (pcm & 1)
    @ poor 'pcm' alignment
    vld1.8          {d0, d1}, [r5, :128]
1:
    sub             r6, r6, #16
    sub             r7, r7, #16
    sub             r0, r0, #8
    vld1.8          {d4, d5}, [r1]!
    vuzp.16         d4, d5                  @ split interleaved L/R by hand
    vld1.8          {d20, d21}, [r1]!
    vuzp.16         d20, d21
    vswp            d5, d20
    vtbl.8          d16, {d4, d5}, d0
    vtbl.8          d17, {d4, d5}, d1
    vtbl.8          d18, {d20, d21}, d0
    vtbl.8          d19, {d20, d21}, d1
    vst1.16         {d16, d17}, [r6, :128]
    vst1.16         {d18, d19}, [r7, :128]
    subs            r3, r3, #8
    bgt             1b
    b               9f
7:
    @ proper 'pcm' alignment
    vld1.8          {d0, d1}, [r5, :128]
1:
    sub             r6, r6, #16
    sub             r7, r7, #16
    sub             r0, r0, #8
    vld2.16         {d4, d5}, [r1]!         @ hardware L/R deinterleave
    vld2.16         {d20, d21}, [r1]!
    vswp            d5, d20
    vtbl.8          d16, {d4, d5}, d0
    vtbl.8          d17, {d4, d5}, d1
    vtbl.8          d18, {d20, d21}, d0
    vtbl.8          d19, {d20, d21}, d1
    vst1.16         {d16, d17}, [r6, :128]
    vst1.16         {d18, d19}, [r7, :128]
    subs            r3, r3, #8
    bgt             1b
    b               9f
8:
    @ mono
    vld1.8          {d0, d1}, [r5, :128]
1:
    sub             r6, r6, #16
    sub             r0, r0, #8
    vld1.8          {d4, d5}, [r1]!
    vtbl.8          d16, {d4, d5}, d0
    vtbl.8          d17, {d4, d5}, d1
    vst1.16         {d16, d17}, [r6, :128]
    subs            r3, r3, #8
    bgt             1b
9:
    pop             {r1, r3-r7}
    bx              lr
endfunc
|
||||
|
||||
@ 8-subband variant of ff_sbc_enc_process_input_4s_neon: same three paths
@ (misaligned stereo, aligned stereo, mono) with 16 samples consumed per
@ iteration, a 72-sample wraparound copy, and the ff_sbc_input_perm_8
@ VTBL table. Returns the new ring-buffer position in r0.
function ff_sbc_enc_process_input_8s_neon, export=1
    @ parameters
    @ r0 = position
    @ r1 = pcm
    @ r2 = X
    @ r3 = nsamples
    @ r4 = nchannels
    @ local variables
    @ r5 = ff_sbc_input_perm_8
    @ r6 = src
    @ r7 = dst

    push            {r1, r3-r7}
    ldr             r4, [sp, #24]           @ 5th arg (nchannels) from caller stack
    movrelx         r5, X(ff_sbc_input_perm_8)

    @ handle X buffer wraparound
    cmp             r0, r3
    bge             1f                      @ if (position < nsamples)
    add             r7, r2, #512            @ &X[0][SBC_X_BUFFER_SIZE - 72]
    add             r6, r2, r0, lsl#1       @ &X[0][position]
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1}, [r6, :128]!
    vst1.16         {d0, d1}, [r7, :128]!
    cmp             r4, #1
    ble             2f                      @ if (nchannels > 1)
    add             r7, r2, #1168           @ &X[1][SBC_X_BUFFER_SIZE - 72]
    add             r6, r2, #656
    add             r6, r6, r0, lsl#1       @ &X[1][position]
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1, d2, d3}, [r6, :128]!
    vst1.16         {d0, d1, d2, d3}, [r7, :128]!
    vld1.16         {d0, d1}, [r6, :128]!
    vst1.16         {d0, d1}, [r7, :128]!
2:
    mov             r0, #256                @ SBC_X_BUFFER_SIZE - 72
1:

    add             r6, r2, r0, lsl#1       @ &X[0][position]
    add             r7, r6, #656            @ &X[1][position]

    cmp             r4, #1
    ble             8f                      @ if (nchannels > 1)
    tst             r1, #1
    beq             7f                      @ if (pcm & 1)
    @ poor 'pcm' alignment
    vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
    sub             r6, r6, #32
    sub             r7, r7, #32
    sub             r0, r0, #16
    vld1.8          {d4, d5, d6, d7}, [r1]!
    vuzp.16         q2, q3                  @ split interleaved L/R by hand
    vld1.8          {d20, d21, d22, d23}, [r1]!
    vuzp.16         q10, q11
    vswp            q3, q10
    vtbl.8          d16, {d4, d5, d6, d7}, d0
    vtbl.8          d17, {d4, d5, d6, d7}, d1
    vtbl.8          d18, {d4, d5, d6, d7}, d2
    vtbl.8          d19, {d4, d5, d6, d7}, d3
    vst1.16         {d16, d17, d18, d19}, [r6, :128]
    vtbl.8          d16, {d20, d21, d22, d23}, d0
    vtbl.8          d17, {d20, d21, d22, d23}, d1
    vtbl.8          d18, {d20, d21, d22, d23}, d2
    vtbl.8          d19, {d20, d21, d22, d23}, d3
    vst1.16         {d16, d17, d18, d19}, [r7, :128]
    subs            r3, r3, #16
    bgt             1b
    b               9f
7:
    @ proper 'pcm' alignment
    vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
    sub             r6, r6, #32
    sub             r7, r7, #32
    sub             r0, r0, #16
    vld2.16         {d4, d5, d6, d7}, [r1]! @ hardware L/R deinterleave
    vld2.16         {d20, d21, d22, d23}, [r1]!
    vswp            q3, q10
    vtbl.8          d16, {d4, d5, d6, d7}, d0
    vtbl.8          d17, {d4, d5, d6, d7}, d1
    vtbl.8          d18, {d4, d5, d6, d7}, d2
    vtbl.8          d19, {d4, d5, d6, d7}, d3
    vst1.16         {d16, d17, d18, d19}, [r6, :128]
    vtbl.8          d16, {d20, d21, d22, d23}, d0
    vtbl.8          d17, {d20, d21, d22, d23}, d1
    vtbl.8          d18, {d20, d21, d22, d23}, d2
    vtbl.8          d19, {d20, d21, d22, d23}, d3
    vst1.16         {d16, d17, d18, d19}, [r7, :128]
    subs            r3, r3, #16
    bgt             1b
    b               9f
8:
    @ mono
    vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
    sub             r6, r6, #32
    sub             r0, r0, #16
    vld1.8          {d4, d5, d6, d7}, [r1]!
    vtbl.8          d16, {d4, d5, d6, d7}, d0
    vtbl.8          d17, {d4, d5, d6, d7}, d1
    vtbl.8          d18, {d4, d5, d6, d7}, d2
    vtbl.8          d19, {d4, d5, d6, d7}, d3
    vst1.16         {d16, d17, d18, d19}, [r6, :128]
    subs            r3, r3, #16
    bgt             1b
9:
    pop             {r1, r3-r7}
    bx              lr
endfunc
|
||||
73
externals/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c
vendored
Executable file
73
externals/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c
vendored
Executable file
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/sbrdsp.h"
|
||||
|
||||
void ff_sbr_sum64x5_neon(float *z);
|
||||
float ff_sbr_sum_square_neon(float (*x)[2], int n);
|
||||
void ff_sbr_neg_odd_64_neon(float *x);
|
||||
void ff_sbr_qmf_pre_shuffle_neon(float *z);
|
||||
void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
|
||||
void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
|
||||
void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
|
||||
void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
|
||||
const float *g_filt, int m_max, intptr_t ixh);
|
||||
void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
|
||||
const float alpha0[2], const float alpha1[2],
|
||||
float bw, int start, int end);
|
||||
void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
|
||||
|
||||
void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
|
||||
av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
s->sum64x5 = ff_sbr_sum64x5_neon;
|
||||
s->sum_square = ff_sbr_sum_square_neon;
|
||||
s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
|
||||
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
|
||||
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
|
||||
s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
|
||||
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
|
||||
s->hf_g_filt = ff_sbr_hf_g_filt_neon;
|
||||
s->hf_gen = ff_sbr_hf_gen_neon;
|
||||
s->autocorrelate = ff_sbr_autocorrelate_neon;
|
||||
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
|
||||
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
|
||||
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
|
||||
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user