.github
CMakeModules
dist
externals
Vulkan-Headers
cmake-modules
cubeb
discord-rpc
dynarmic
ffmpeg
compat
doc
ffbuild
fftools
libavcodec
libavdevice
libavfilter
libavformat
libavresample
libavutil
aarch64
arm
Makefile
asm.S
bswap.h
cpu.c
cpu.h
float_dsp_arm.h
float_dsp_init_arm.c
float_dsp_init_neon.c
float_dsp_init_vfp.c
float_dsp_neon.S
float_dsp_vfp.S
intmath.h
intreadwrite.h
neontest.h
timer.h
avr32
bfin
mips
ppc
sh4
tests
tomi
x86
.gitignore
Makefile
adler32.c
adler32.h
aes.c
aes.h
aes_ctr.c
aes_ctr.h
aes_internal.h
attributes.h
audio_fifo.c
audio_fifo.h
avassert.h
avsscanf.c
avstring.c
avstring.h
avutil.h
avutilres.rc
base64.c
base64.h
blowfish.c
blowfish.h
bprint.c
bprint.h
bswap.h
buffer.c
buffer.h
buffer_internal.h
camellia.c
camellia.h
cast5.c
cast5.h
channel_layout.c
channel_layout.h
color_utils.c
color_utils.h
colorspace.h
common.h
cpu.c
cpu.h
cpu_internal.h
crc.c
crc.h
cuda_check.h
des.c
des.h
dict.c
dict.h
display.c
display.h
dovi_meta.c
dovi_meta.h
downmix_info.c
downmix_info.h
dynarray.h
encryption_info.c
encryption_info.h
error.c
error.h
eval.c
eval.h
ffmath.h
fifo.c
fifo.h
file.c
file.h
file_open.c
fixed_dsp.c
fixed_dsp.h
float_dsp.c
float_dsp.h
frame.c
frame.h
hash.c
hash.h
hdr_dynamic_metadata.c
hdr_dynamic_metadata.h
hmac.c
hmac.h
hwcontext.c
hwcontext.h
hwcontext_cuda.c
hwcontext_cuda.h
hwcontext_cuda_internal.h
hwcontext_d3d11va.c
hwcontext_d3d11va.h
hwcontext_drm.c
hwcontext_drm.h
hwcontext_dxva2.c
hwcontext_dxva2.h
hwcontext_internal.h
hwcontext_mediacodec.c
hwcontext_mediacodec.h
hwcontext_opencl.c
hwcontext_opencl.h
hwcontext_qsv.c
hwcontext_qsv.h
hwcontext_vaapi.c
hwcontext_vaapi.h
hwcontext_vdpau.c
hwcontext_vdpau.h
hwcontext_videotoolbox.c
hwcontext_videotoolbox.h
hwcontext_vulkan.c
hwcontext_vulkan.h
imgutils.c
imgutils.h
imgutils_internal.h
integer.c
integer.h
internal.h
intfloat.h
intmath.c
intmath.h
intreadwrite.h
lfg.c
lfg.h
libavutil.v
libm.h
lls.c
lls.h
log.c
log.h
log2_tab.c
lzo.c
lzo.h
macros.h
mastering_display_metadata.c
mastering_display_metadata.h
mathematics.c
mathematics.h
md5.c
md5.h
mem.c
mem.h
mem_internal.h
motion_vector.h
murmur3.c
murmur3.h
opt.c
opt.h
parseutils.c
parseutils.h
pca.c
pca.h
pixdesc.c
pixdesc.h
pixelutils.c
pixelutils.h
pixfmt.h
qsort.h
random_seed.c
random_seed.h
rational.c
rational.h
rc4.c
rc4.h
replaygain.h
reverse.c
reverse.h
ripemd.c
ripemd.h
samplefmt.c
samplefmt.h
sha.c
sha.h
sha512.c
sha512.h
slicethread.c
slicethread.h
softfloat.h
softfloat_ieee754.h
softfloat_tables.h
spherical.c
spherical.h
stereo3d.c
stereo3d.h
tablegen.h
tea.c
tea.h
thread.h
threadmessage.c
threadmessage.h
time.c
time.h
time_internal.h
timecode.c
timecode.h
timer.h
timestamp.h
tree.c
tree.h
twofish.c
twofish.h
tx.c
tx.h
tx_double.c
tx_float.c
tx_int32.c
tx_priv.h
tx_template.c
utils.c
version.h
video_enc_params.c
video_enc_params.h
wchar_filename.h
xga_font_data.c
xga_font_data.h
xtea.c
xtea.h
libpostproc
libswresample
libswscale
presets
tests
tools
.gitattributes
.gitignore
.mailmap
.travis.yml
CONTRIBUTING.md
COPYING.GPLv2
COPYING.GPLv3
COPYING.LGPLv2.1
COPYING.LGPLv3
CREDITS
Changelog
INSTALL.md
LICENSE.md
MAINTAINERS
Makefile
README.md
RELEASE
RELEASE_NOTES
configure
find-modules
getopt
glad
httplib
inih
libressl
libusb
libzip
mbedtls
microprofile
opus
sirit
soundtouch
xbyak
CMakeLists.txt
patches
src
CMakeLists.txt
LICENSE
README.md
license.txt
272 lines
9.5 KiB
ArmAsm
272 lines
9.5 KiB
ArmAsm
![]() |
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
||
|
|
||
|
#include "config.h"
|
||
|
#include "asm.S"
|
||
|
|
||
|
/*
 * ff_vector_fmul_neon
 *
 * Element-wise single-precision multiply: the floats read through r1 and
 * r2 are multiplied lane by lane and the products written through r0.
 *
 * Registers on entry, as consumed by the code below:
 *   r0  output pointer        (post-incremented stores)
 *   r1  first input pointer   (post-incremented loads)
 *   r2  second input pointer  (post-incremented loads)
 *   r3  number of floats
 *
 * Every load and store carries the :128 qualifier, so all three pointers
 * must be 16-byte aligned.  The first eight elements are handled
 * unconditionally, the rest in steps of 16 plus one optional step of 8,
 * so the count has to be a positive multiple of 8.
 *
 * Modifies r0-r3, r12, q0-q3, q8-q11 and the condition flags.
 * NOTE(review): presumably the NEON backend for the vector_fmul entry of
 * float_dsp.h -- confirm the C prototype there.
 */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ r3 = elements left after the first 8
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ exactly 8 elements: store and leave
        bics            r12, r3,  #15           @ r12 = share of the rest done 16 at a time
        beq             2f                      @ fewer than 16 left: one tail block of 8
1:      subs            r12, r12, #16           @ flags survive until the bne at the bottom
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {q8-q9},  [r0,:128]!    @ products computed one step earlier
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {q10-q11},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15           @ anything left after the 16-wide loop?
        beq             3f
2:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vst1.32         {q8},     [r0,:128]!    @ pending result, first half
        vmul.f32        q8,  q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vst1.32         {q9},     [r0,:128]!    @ pending result, second half
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {q8-q9},  [r0,:128]!    @ flush the final 8 products
        bx              lr
endfunc
|
||
|
|
||
|
/*
 * ff_vector_fmac_scalar_neon
 *
 * Multiply-accumulate by a scalar: every float at r0 has (float at r1) * s
 * added to it in place, with s broadcast to all lanes of q15.
 *
 * The scalar and the element count arrive in different registers depending
 * on which of the VFP / NOVFP prefixed lines is active.  Both prefixes are
 * macros from asm.S (not shown here); presumably they select between the
 * hard-float and soft-float argument passing conventions -- confirm there.
 *   VFP:    s in d0[0], count in r2, r3 serves as "acc"
 *   NOVFP:  s in r2,    count in r3, r2 is reused as "acc" after the vdup
 *
 *   r0   write pointer for the updated values
 *   acc  read pointer over the same buffer (copy of r0)
 *   r1   input pointer
 *
 * All accesses use the :128 qualifier, so r0 and r1 must be 16-byte
 * aligned.  Blocks of 16 floats go through the pipelined loop at 1:, the
 * remainder through the 4-wide loop at 3:.  That tail loop tests its
 * counter only after the body, so once entered it always processes at
 * least four floats.
 *
 * Modifies r0-r3, r12, q0-q3, q8-q11, q15 and the condition flags.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15           @ r12 = count rounded down to 16
        mov             acc, r0                 @ no flag update: beq still sees the bics result
        beq             3f                      @ fewer than 16: tail loop only
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last block of 16: skip the preload
        vld1.32         {q0},     [r1,:128]!    @ preload for the next iteration
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ leftover after the 16-wide loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .unreq          acc                     @ release the alias so it does not leak past endfunc
endfunc
|
||
|
|
||
|
/*
 * ff_vector_fmul_scalar_neon
 *
 * Scale a float array: each float read through r1 is multiplied by a
 * scalar (broadcast into q8) and written through r0.
 *
 * The scalar and the element count arrive in different registers depending
 * on which of the VFP / NOVFP prefixed lines is active.  Both prefixes are
 * macros from asm.S (not shown here); presumably they select between the
 * hard-float and soft-float argument passing conventions -- confirm there.
 *   VFP:    scalar in d0[0], count in r2
 *   NOVFP:  scalar in r2,    count in r3
 *
 * All accesses use the :128 qualifier, so r0 and r1 must be 16-byte
 * aligned.  Blocks of 16 floats go through the pipelined loop at 1:, the
 * remainder through the 4-wide loop at 3:, which tests its counter only
 * after the body and therefore always handles at least four floats once
 * entered.
 *
 * Modifies r0, r1, the count register, r12, q0-q3, q8 and the flags.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           @ r12 = count rounded down to 16
        beq             3f                      @ fewer than 16: tail loop only
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},     [r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},     [r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},     [r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last block of 16: skip the preload
        vld1.32         {q0},     [r1,:128]!    @ preload for the next iteration
        vst1.32         {q2},     [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vst1.32         {q3},     [r0,:128]!
        b               1b
2:      vst1.32         {q2},     [r0,:128]!
        vst1.32         {q3},     [r0,:128]!
        ands            len, len, #15           @ leftover after the 16-wide loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
|
||
|
|
||
|
/*
 * ff_vector_fmul_window_neon
 *
 * Windowed overlap of two float inputs.  With len taken from the first
 * stack argument, the routine walks six pointers, four floats per step:
 *   r1   first input,  ascending
 *   r2   second input, descending from its last four floats (len floats)
 *   r3   window,       ascending
 *   r4   window,       descending from the last four of 2*len floats
 *   r0   output,       ascending
 *   r12  output,       descending from the last four of 2*len floats
 *
 * The ascending output receives q10 (d20,d21), built with vmul/vmls; the
 * descending output receives q11 (d22,d23), built with vmul/vmla and then
 * lane-reversed (vrev64 + vswp) before the store.
 *
 * All vector accesses use the :128 qualifier, so the pointers must be
 * 16-byte aligned.  The loop leaves only when the counter reaches exactly
 * zero, so len must be a positive multiple of 4.
 *
 * r4, r5 and lr are saved on entry and restored on exit (pop into pc
 * returns).  Modifies r0-r3, r12, q0-q3, q9-q12 and the flags.
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]          @ first stack argument, above the 3 saved words
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2   @ r2  += 4*len - 16 bytes in total
        add             r4,  r3,  r5,  lsl #3   @ r4   = r3 + 8*len - 16
        add             r12, r0,  r5,  lsl #3   @ r12  = r0 + 8*len - 16
        mov             r5,  #-16               @ post-index step for the descending pointers
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4            @ flags are consumed by the beq below
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                      @ last group: finish without preloading
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5 @ next r2 data parked in q9
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!    @ next r3 data parked in q12
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[r12,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[r12,:128], r5
        pop             {r4,r5,pc}
endfunc
|
||
|
|
||
|
/*
 * ff_vector_fmul_add_neon
 *
 * Fused element-wise multiply and add on float arrays: the floats read
 * through r1 and r2 are multiplied, the float read through r3 is added,
 * and the result is written through r0.
 *
 *   r0    output pointer
 *   r1    first factor
 *   r2    second factor
 *   r3    addend
 *   [sp]  element count (first stack argument, loaded into r12)
 *
 * All accesses use the :128 qualifier, so the four pointers must be
 * 16-byte aligned.  Eight floats are produced per iteration and the loop
 * leaves only when the counter reaches exactly zero, so the count must be
 * a positive multiple of 8.
 *
 * Modifies r0-r3, r12, q0-q3, q8-q13 and the flags.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]               @ preload hints for the three inputs
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f                      @ last group: store and return
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!    @ sums from the top of this iteration
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc
|
||
|
|
||
|
/*
 * ff_vector_fmul_reverse_neon
 *
 * Element-wise multiply with the second input traversed backwards:
 * output[i] = in1[i] * in2[count - 1 - i].
 *
 *   r0  output pointer, ascending
 *   r1  first input,    ascending
 *   r2  second input;   moved to its last eight floats, then stepped down
 *   r3  element count
 *
 * The vrev64 instructions swap the two floats inside each d register;
 * pairing d0 with d7, d1 with d6, d2 with d5 and d3 with d4 completes the
 * reversal of each group of eight.
 *
 * All vector accesses use the :128 qualifier, so the pointers must be
 * 16-byte aligned.  The loop leaves only when the counter reaches exactly
 * zero, so the count must be a positive multiple of 8.
 *
 * Modifies r0-r3, r12, q0-q3, q8, q9 and the flags.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2   @ one past the end of the second input
        sub             r2,  r2,  #32           @ back up to its last eight floats
        mov             r12, #-32               @ post-index step for the backwards walk
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f                      @ last group: store and return
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
|
||
|
|
||
|
/*
 * ff_butterflies_float_neon
 *
 * In-place butterfly on two float arrays.  For each element, with a read
 * through r0 and b read through r1:
 *   the location at r0 receives a + b
 *   the location at r1 receives a - b
 *
 *   r0  first array  (read, then overwritten with the sums)
 *   r1  second array (read, then overwritten with the differences)
 *   r2  element count
 *
 * Accesses use the :128 qualifier, so both pointers must be 16-byte
 * aligned.  Four floats are handled per iteration and the counter is
 * tested after the body, so at least four floats are always processed.
 *
 * Modifies r0-r2, q0-q2 and the flags.
 */
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},     [r0,:128]     @ no writeback: the stores advance the pointers
        vld1.32         {q1},     [r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},     [r1,:128]!
        vst1.32         {q1},     [r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
|
||
|
|
||
|
/*
 * ff_scalarproduct_float_neon
 *
 * Dot product of two float arrays.
 *
 *   r0  first input pointer
 *   r1  second input pointer
 *   r2  element count
 *
 * Four partial sums are kept in q2 and reduced to a single value at the
 * end; both lanes of d0 then hold the result.  The NOVFP prefixed line
 * additionally copies it to r0.  NOVFP is a macro from asm.S (not shown
 * here); presumably it is active when floats are returned in core
 * registers -- confirm there.
 *
 * Accesses use the :128 qualifier, so both pointers must be 16-byte
 * aligned.  Four floats are handled per iteration and the counter is
 * tested after the body, so at least four floats are always read.
 *
 * Modifies r0-r2, q0-q2 and the flags.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0               @ clear the four partial sums
1:      vld1.32         {q0},     [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ fold four partial sums into two
        vpadd.f32       d0,  d0,  d0            @ and two into one
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc
|