early-access version 1432

This commit is contained in:
pineappleEA
2021-02-09 04:25:58 +01:00
parent de64eab4b4
commit 3d5a9d908a
7336 changed files with 1773492 additions and 111 deletions

8
externals/ffmpeg/libswscale/arm/Makefile vendored Executable file
View File

@@ -0,0 +1,8 @@
# Objects appended to the generic ARM object list.
# The last entry of each list must NOT end in a backslash: a dangling
# continuation splices the following line into the variable value.
OBJS += arm/swscale.o \
        arm/swscale_unscaled.o

# Objects appended to the NEON-specific object list.
NEON-OBJS += arm/rgb2yuv_neon_32.o
NEON-OBJS += arm/rgb2yuv_neon_16.o
NEON-OBJS += arm/hscale.o \
             arm/output.o \
             arm/yuv2rgb_neon.o

70
externals/ffmpeg/libswscale/arm/hscale.S vendored Executable file
View File

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
* Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
@                             const uint8_t *src, const int16_t *filter,
@                             const int32_t *filterPos, int filterSize)
@
@ Horizontal FIR scaling of 8-bit source samples into 16-bit outputs.
@ For each output i:  dst[i] = sat16((sum_j src[filterPos[i] + j] * filter[i * filterSize + j]) >> 7)
@
@ In:    r0 = c (never read, reused as source cursor), r1 = dst, r2 = dstW, r3 = src
@        stack = filter, filterPos, filterSize (read at sp+104.. after the
@        40-byte core push and the 64-byte NEON push)
@ Clobb: r0-r2, q0-q3, q8-q11, flags (r4-r12 and q4-q7 are saved and restored)
@
@ Two outputs are produced per outer iteration and eight taps are consumed per
@ inner iteration, so filterSize is expected to be a multiple of 8 (the C init
@ code checks this before installing the function).
@ NOTE(review): an odd dstW stores one extra int16 past dstW; confirm the
@ destination buffer is padded accordingly.
function ff_hscale_8_to_15_neon, export=1
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]          @ filter
    ldr                 r5, [sp, #108]          @ filterPos
    ldr                 r6, [sp, #112]          @ filterSize
    add                 r10, r4, r6, lsl #1     @ filter2 = filter + filterSize * 2 (taps of the second output)
1:  ldr                 r8, [r5], #4            @ filterPos[0]
    ldr                 r9, [r5], #4            @ filterPos[1]
    vmov.s32            q4, #0                  @ accumulator for output 0
    vmov.s32            q5, #0                  @ accumulator for output 1
    mov                 r7, r6                  @ tmpfilterSize = filterSize
    mov                 r0, r3                  @ srcp
2:  add                 r11, r0, r8             @ srcp + filterPos[0]
    add                 r12, r0, r9             @ srcp + filterPos[1]
    vld1.8              d0, [r11]               @ srcp[filterPos[0] + {0..7}]
    vld1.8              d2, [r12]               @ srcp[filterPos[1] + {0..7}]
    vld1.16             {q2}, [r4]!             @ load 8x16-bit filter values (output 0)
    vld1.16             {q3}, [r10]!            @ load 8x16-bit filter values (output 1)
    vmovl.u8            q0, d0                  @ unpack src values to 16-bit
    vmovl.u8            q1, d2                  @ unpack src values to 16-bit
    vmull.s16           q8, d0, d4              @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
    vmull.s16           q9, d1, d5              @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
    vmull.s16           q10, d2, d6             @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
    vmull.s16           q11, d3, d7             @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
    vpadd.s32           d16, d16, d17           @ pairwise add: 8x32-bit products -> 4x32-bit (part 1)
    vpadd.s32           d17, d18, d19           @ pairwise add: 8x32-bit products -> 4x32-bit (part 2)
    vpadd.s32           d20, d20, d21           @ pairwise add: 8x32-bit products -> 4x32-bit (part 1)
    vpadd.s32           d21, d22, d23           @ pairwise add: 8x32-bit products -> 4x32-bit (part 2)
    vadd.s32            q4, q8                  @ update accumulator of output 0
    vadd.s32            q5, q10                 @ update accumulator of output 1
    add                 r0, #8                  @ srcp += 8
    subs                r7, #8                  @ tmpfilterSize -= 8
    bgt                 2b                      @ loop until tmpfilterSize is consumed
    mov                 r4, r10                 @ filter = filter2 (r10 now points at the taps of the next output)
    add                 r10, r10, r6, lsl #1    @ filter2 += filterSize * 2
    vpadd.s32           d8, d8, d9              @ 4 partial sums of output 0 -> 2
    vpadd.s32           d9, d10, d11            @ 4 partial sums of output 1 -> 2
    vpadd.s32           d8, d8, d9              @ d8 = {sum of output 0, sum of output 1}
    vqshrn.s32          d8, q4, #7              @ shift right by 7 and saturate to 16-bit
    vst1.32             {d8[0]},[r1]!           @ store the two 16-bit results
    subs                r2, #2                  @ dstW -= 2
    bgt                 1b                      @ loop until end of line
    vpop                {q4-q7}
    pop                 {r4-r12, pc}            @ restore and return in one step; loading pc from the
                                                @ stack interworks, unlike mov pc, lr in Thumb state
endfunc

78
externals/ffmpeg/libswscale/arm/output.S vendored Executable file
View File

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
* Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
@                           const int16_t **src, uint8_t *dest, int dstW,
@                           const uint8_t *dither, int offset)
@
@ Vertical FIR filter producing 8-bit output, 8 pixels per outer iteration:
@   dest[i] = clip_u8((dither[(i + offset) & 7] << 12 + sum_j src[j][i] * filter[j]) >> 19)
@
@ In:    r0 = filter, r1 = filterSize, r2 = src, r3 = dest
@        stack = dstW, dither, offset (read at sp+104.. after the 40-byte core
@        push and the 64-byte NEON push)
@ Clobb: r3, q0-q3, q8-q12, q15, flags (r4-r12 and q4-q7 are saved and restored)
@
@ The inner loop consumes two filter taps (and two source line pointers) per
@ iteration.
@ NOTE(review): an odd filterSize therefore reads one tap and one line pointer
@ past the end; confirm callers only pass even sizes or padded arrays.
function ff_yuv2planeX_8_neon, export=1
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]          @ dstW
    ldr                 r5, [sp, #108]          @ dither
    ldr                 r6, [sp, #112]          @ offset
    vld1.8              {d0}, [r5]              @ load 8x8-bit dither values
    cmp                 r6, #0                  @ check offsetting which can be 0 or 3 only
    beq                 1f
    vext.u8             d0, d0, d0, #3          @ honor offsetting (rotate dither bytes by 3)
1:  vmovl.u8            q0, d0                  @ extend dither to 16-bit
    vshll.u16           q1, d0, #12             @ extend dither to 32-bit with left shift by 12 (part 1)
    vshll.u16           q2, d1, #12             @ extend dither to 32-bit with left shift by 12 (part 2)
    mov                 r7, #0                  @ i = 0
2:  vmov.u8             q3, q1                  @ initialize accumulator with dithering values (part 1)
    vmov.u8             q4, q2                  @ initialize accumulator with dithering values (part 2)
    mov                 r8, r1                  @ tmpFilterSize = filterSize
    mov                 r9, r2                  @ srcp
    mov                 r10, r0                 @ filterp
3:  ldr                 r11, [r9], #4           @ get pointer @ src[j]
    ldr                 r12, [r9], #4           @ get pointer @ src[j+1]
    add                 r11, r11, r7, lsl #1    @ &src[j][i]
    add                 r12, r12, r7, lsl #1    @ &src[j+1][i]
    vld1.16             {q5}, [r11]             @ read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
    vld1.16             {q6}, [r12]             @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
    ldr                 r11, [r10], #4          @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
    vmov.16             q7, q5                  @ copy src[j  ] row for the following in-place zip
    vmov.16             q8, q6                  @ copy src[j+1] row for the following in-place zip
    vzip.16             q7, q8                  @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
    vdup.32             q15, r11                @ X,Y,X,Y,X,Y,X,Y
    vmull.s16           q9, d14, d30            @ A*X,I*Y,B*X,J*Y
    vmull.s16           q10, d15, d31           @ C*X,K*Y,D*X,L*Y
    vmull.s16           q11, d16, d30           @ E*X,M*Y,F*X,N*Y
    vmull.s16           q12, d17, d31           @ G*X,O*Y,H*X,P*Y
    vpadd.s32           d10, d18, d19           @ A*X+I*Y,B*X+J*Y
    vpadd.s32           d11, d20, d21           @ C*X+K*Y,D*X+L*Y
    vpadd.s32           d12, d22, d23           @ E*X+M*Y,F*X+N*Y
    vpadd.s32           d13, d24, d25           @ G*X+O*Y,H*X+P*Y
    vadd.s32            q3, q5                  @ update val accumulator (part 1)
    vadd.s32            q4, q6                  @ update val accumulator (part 2)
    subs                r8, #2                  @ tmpFilterSize -= 2
    bgt                 3b                      @ loop until filterSize is consumed
    vshr.s32            q3, q3, #19             @ val>>19 (part 1)
    vshr.s32            q4, q4, #19             @ val>>19 (part 2)
    vqmovun.s32         d6, q3                  @ clip16(val>>19) (part 1)
    vqmovun.s32         d7, q4                  @ clip16(val>>19) (part 2)
    vqmovn.u16          d6, q3                  @ merge part 1 and part 2, saturating to 8-bit
    vst1.8              {d6}, [r3]!             @ write destination
    add                 r7, #8                  @ i += 8
    subs                r4, r4, #8              @ dstW -= 8
    bgt                 2b                      @ loop until width is consumed
    vpop                {q4-q7}
    pop                 {r4-r12, pc}            @ restore and return in one step; loading pc from the
                                                @ stack interworks, unlike mov pc, lr in Thumb state
endfunc

View File

@@ -0,0 +1,83 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_AS_DN_DIRECTIVE
#include "rgb2yuv_neon_common.S"
/*
 * Register names used by the 16-bit precision kernel.
 * alias / alias_qw come from rgb2yuv_neon_common.S; alias_qw additionally
 * creates NAME_l / NAME_h for the two D halves of the Q register.
 */
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
/* one colour component of 16 pixels widened to 16 bits (low / high 8 pixels) */
alias n16x16_l, q11
alias n16x16_h, q12
/* 16-bit luma accumulators for 16 pixels (low / high 8 pixels) */
alias y16x16_l, q13
alias y16x16_h, q14
/* final 8-bit luma for 16 pixels */
alias_qw y8x16, q15
/*
 * init src: load the nine int32 coefficients at src and narrow them to 16 bits.
 * The two vld3 loads de-interleave by three, so table entries 0,3,6 land in q13,
 * entries 1,4,7 in q14 and entries 2,5,8 in q15. After narrowing, lanes 0..2 of
 * CO_R / CO_G / CO_B hold the Y, U and V weights of that colour component
 * (see the CO_xY / CO_xU / CO_xV lane aliases in rgb2yuv_neon_common.S).
 * src is post-incremented by the first load. Also sets the luma and chroma biases.
 */
.macro init src
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
/* rounding shift right by 7 while narrowing int32 -> int16 */
vrshrn.i32 CO_R, q13, #7
vrshrn.i32 CO_G, q14, #7
vrshrn.i32 CO_B, q15, #7
/* every byte lane: +16 for luma, +128 for chroma */
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
/*
 * compute_y_16x1_step action, s8x16, coeff
 * Widen the 16 bytes of one colour component (s8x16) to 16 bits and apply
 * action (vmul to start the sum, vmla to accumulate) with the scalar coeff
 * into the two 16-bit luma accumulators.
 * clobber: n16x16_l, n16x16_h
 */
.macro compute_y_16x1_step action, s8x16, coeff
vmovl.u8 n16x16_l, \s8x16\()_l
vmovl.u8 n16x16_h, \s8x16\()_h
\action y16x16_l, n16x16_l, \coeff
\action y16x16_h, n16x16_h, \coeff
.endm
/*
 * compute_y_16x1
 * in:  r8x16, g8x16, b8x16
 * out: y8x16 = ((R * CO_RY + G * CO_GY + B * CO_BY) rounded >> 8) + BIAS_Y
 * The sum is accumulated in 16-bit lanes.
 * clobber: n16x16_l/h, y16x16_l/h
 */
.macro compute_y_16x1
compute_y_16x1_step vmul, r8x16, CO_RY
compute_y_16x1_step vmla, g8x16, CO_GY
compute_y_16x1_step vmla, b8x16, CO_BY
/* rounding narrow 16 -> 8 bits */
vrshrn.i16 y8x16_l, y16x16_l, #8
vrshrn.i16 y8x16_h, y16x16_h, #8
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
/* 16-bit chroma accumulator; shares q15 with y8x16, which has been stored by then */
alias c16x8, q15
/* the two 8-byte chroma outputs live in the halves of q10 */
alias_qw c8x8x2, q10
/*
 * compute_chroma_8x1 c, C   (c = u or v, C = U or V)
 * in:  r16x8, g16x8, b16x8 (downsampled components)
 * out: c8x8 = ((R * CO_RC + G * CO_GC + B * CO_BC) rounded >> 8) + BIAS_C
 * clobber: c16x8
 */
.macro compute_chroma_8x1 c, C
vmul c16x8, r16x8, CO_R\C
vmla c16x8, g16x8, CO_G\C
vmla c16x8, b16x8, CO_B\C
vrshrn.i16 \c\()8x8, c16x8, #8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
/* Instantiate the exported function rgbx_to_nv12_neon_16 from the shared loop template. */
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
#endif

View File

@@ -0,0 +1,122 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_AS_DN_DIRECTIVE
#include "rgb2yuv_neon_common.S"
/*
 * Register names used by the 32-bit precision kernel.
 * alias / alias_qw come from rgb2yuv_neon_common.S; qN_l / qN_h name the two
 * D halves of qN.
 */
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
/* odd-indexed pixels of one colour component, widened to 16 bits */
alias n16x16_o, q11
alias n16x16_ol, q11_l
alias n16x16_oh, q11_h
/* 32-bit luma accumulators: e = even-indexed pixels, o = odd-indexed pixels */
alias y32x16_el, q12
alias y32x16_eh, q13
alias y32x16_ol, q14
alias y32x16_oh, q15
/* narrowed 16-bit luma; these overlay q12/q13, i.e. the even accumulators */
alias y16x16_e, q12
alias y16x16_el, q12_l
alias y16x16_eh, q12_h
alias y16x16_o, q13
alias y16x16_ol, q13_l
alias y16x16_oh, q13_h
/* final 8-bit luma for 16 pixels (same register as y16x16_e) */
alias y8x16, y16x16_e
/*
 * init src: load the nine int32 coefficients at src and narrow them to 16 bits
 * without any shift. The two vld3 loads de-interleave by three, so table
 * entries 0,3,6 land in q13, entries 1,4,7 in q14 and entries 2,5,8 in q15;
 * lanes 0..2 of CO_R / CO_G / CO_B then hold the Y, U and V weights of that
 * colour component. src is post-incremented by the first load.
 * Also sets the luma and chroma biases.
 */
.macro init src
// load s32x3x3, narrow to s16x3x3
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vmovn.i32 CO_R, q13
vmovn.i32 CO_G, q14
vmovn.i32 CO_B, q15
/* every byte lane: +16 for luma, +128 for chroma */
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
/*
 * compute_y_16x1_step action, s8x16, coeff
 * Split the 16 bytes of s8x16 into even- and odd-indexed pixels, each
 * zero-extended to 16 bits, by byte-transposing against a zeroed register:
 * afterwards s8x16 holds the even pixels and n16x16_o the odd ones.
 * Then apply action (vmull to start, vmlal to accumulate) with the scalar
 * coeff into the four 32-bit accumulators.
 * clobber: s8x16 (overwritten in place), n16x16_o
 */
.macro compute_y_16x1_step action, s8x16, coeff
vmov.u8 n16x16_o, #0
vtrn.u8 \s8x16, n16x16_o
\action y32x16_el, \s8x16\()_l, \coeff
\action y32x16_eh, \s8x16\()_h, \coeff
\action y32x16_ol, n16x16_ol, \coeff
\action y32x16_oh, n16x16_oh, \coeff
.endm
/*
 * Compute 16 luma bytes with 32-bit intermediate precision:
 *   y = ((R * CO_RY + G * CO_GY + B * CO_BY) rounded >> 15) + BIAS_Y
 * in: r8x16, g8x16, b8x16
 * out: y8x16
 * clobber: q11-q15, r8x16, g8x16, b8x16
 */
.macro compute_y_16x1
compute_y_16x1_step vmull, r8x16, CO_RY
compute_y_16x1_step vmlal, g8x16, CO_GY
compute_y_16x1_step vmlal, b8x16, CO_BY
/* rounding narrow 32 -> 16 bits; the destinations overlay q12/q13 */
vrshrn.i32 y16x16_el, y32x16_el, #15
vrshrn.i32 y16x16_eh, y32x16_eh, #15
vrshrn.i32 y16x16_ol, y32x16_ol, #15
vrshrn.i32 y16x16_oh, y32x16_oh, #15
/* byte-transpose to merge the even and odd results back into pixel order */
vtrn.8 y16x16_e, y16x16_o
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
/* 32-bit chroma accumulators for 8 downsampled pixels (low / high 4) */
alias c32x8_l, q14
alias c32x8_h, q15
/* chroma narrowed to 16 bits */
alias_qw c16x8, q13
/* the two 8-byte chroma outputs live in the halves of q10 */
alias_qw c8x8x2, q10
/*
 * compute_chroma_8x1_step action, s16x8, coeff
 * Apply action (vmull to start, vmlal to accumulate) between the eight 16-bit
 * samples of s16x8 and the scalar coeff, into the 32-bit chroma accumulators.
 */
.macro compute_chroma_8x1_step action, s16x8, coeff
\action c32x8_l, \s16x8\()_l, \coeff
\action c32x8_h, \s16x8\()_h, \coeff
.endm
/*
 * compute_chroma_8x1 c, C   (c = u or v, C = U or V)
 *   c = ((R * CO_RC + G * CO_GC + B * CO_BC) rounded >> 15) + BIAS_C
 * in: r16x8, g16x8, b16x8
 * out: c8x8
 * clobber: q14-q15, and c16x8 (q13) as the intermediate 16-bit result
 */
.macro compute_chroma_8x1 c, C
compute_chroma_8x1_step vmull, r16x8, CO_R\C
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
/* rounding narrow 32 -> 16 bits, then plain narrow 16 -> 8 bits */
vrshrn.i32 c16x8_l, c32x8_l, #15
vrshrn.i32 c16x8_h, c32x8_h, #15
vmovn.i16 \c\()8x8, c16x8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
/* Instantiate the exported function rgbx_to_nv12_neon_32 from the shared loop template. */
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
#endif

View File

@@ -0,0 +1,291 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/*
 * alias name, tgt, set=1
 * set != 0: make name a register alias of tgt (.req).
 * set == 0: remove the alias name (.unreq); tgt is ignored in that case.
 */
.macro alias name, tgt, set=1
.if \set != 0
\name .req \tgt
.else
.unreq \name
.endif
.endm
/*
 * alias_dw_all qw, dw_l, dw_h
 * Recursively define qN_l -> d(2N) and qN_h -> d(2N+1) for N = qw .. 15.
 * .altmacro is needed so that %(expr) is evaluated to a number before it is
 * pasted into the register names of the recursive call.
 */
.altmacro
.macro alias_dw_all qw, dw_l, dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
.if \qw < 15
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm
/* create q0_l/q0_h .. q15_l/q15_h */
alias_dw_all 0, 0, 1
.noaltmacro
/*
 * alias_qw name, qw, set=1
 * Alias (or, with set=0, un-alias) name to the Q register qw, together with
 * name_l and name_h for its low and high D halves.
 */
.macro alias_qw name, qw, set=1
alias \name\(), \qw, \set
alias \name\()_l, \qw\()_l, \set
alias \name\()_h, \qw\()_h, \set
.endm
/*
 * prologue: save r4-r12 and lr (10 words = 40 bytes) and q4-q7 (64 bytes).
 * load_arg relies on this exact 104-byte frame.
 */
.macro prologue
push {r4-r12, lr}
vpush {q4-q7}
.endm
/*
 * epilogue: undo prologue and return; the saved lr is popped straight into pc.
 */
.macro epilogue
vpop {q4-q7}
pop {r4-r12, pc}
.endm
/*
 * load_arg reg, ix
 * Load stack-passed argument number ix (0-based; the first four arguments are
 * in r0-r3, so ix >= 4) into reg. Only valid after prologue:
 * 10 * 4 + 4 * 16 is the size of the register save area it pushes.
 */
.macro load_arg reg, ix
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm
/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
 *               int width, int height,
 *               int y_stride, int c_stride, int src_stride,
 *               int32_t coeff_table[9]);
 */
/*
 * alias_loop_420sp set=1
 * Define (set=1) or remove (set=0) the symbolic register names used by
 * loop_420sp. Several names deliberately share a register because their
 * lifetimes do not overlap in loop_420sp:
 *   header      shares r3  with width       (width is not read after header is set)
 *   c_padding   shares r6  with c_stride
 *   coeff_table shares r12 with y1          (the table is consumed by init first)
 */
.macro alias_loop_420sp set=1
alias src, r0, \set
alias src0, src, \set
alias y, r1, \set
alias y0, y, \set
alias chroma, r2, \set
alias width, r3, \set
alias header, width, \set
alias height, r4, \set
alias y_stride, r5, \set
alias c_stride, r6, \set
alias c_padding, c_stride, \set
alias src_stride, r7, \set
alias y0_end, r8, \set
alias src_padding,r9, \set
alias y_padding, r10, \set
alias src1, r11, \set
alias y1, r12, \set
alias coeff_table,r12, \set
.endm
/*
 * loop_420sp s_fmt, d_fmt, init, kernel, precision
 * Emit the exported function <s_fmt>_to_<d_fmt>_neon_<precision>, which walks
 * the image two rows at a time and invokes kernel on 16-pixel-wide column
 * groups of both rows.
 *
 * Widths that are not a multiple of 16 are handled by a first kernel call
 * that still processes 16 pixels but advances the pointers by only
 * header = width & 15, so the following full-width calls overlap it.
 * NOTE(review): this requires width >= 16; the C caller checks srcW >= 16.
 * Rows are consumed in pairs (height -= 2).
 */
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
prologue
alias_loop_420sp
/* stack-passed arguments 4..8 */
load_arg height, 4
load_arg y_stride, 5
load_arg c_stride, 6
load_arg src_stride, 7
load_arg coeff_table, 8
/* consume coeff_table now: its register is reused as y1 below */
\init coeff_table
/* paddings = bytes to skip from the end of one processed row to the next */
sub y_padding, y_stride, width
sub c_padding, c_stride, width
sub src_padding, src_stride, width, LSL #2
add y0_end, y0, width
/* header overwrites width (same register); width is not needed afterwards */
and header, width, #15
/* second row of the pair */
add y1, y0, y_stride
add src1, src0, src_stride
0:
/* per row pair: optional partial step for the unaligned head */
cmp header, #0
beq 1f
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
1:
/* full 16-pixel steps until the end of the first luma row is reached */
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
cmp y0, y0_end
blt 1b
2:
/* advance to the next row pair: the new row 0 follows the old row 1 */
add y0, y1, y_padding
add y0_end, y1, y_stride
add chroma, chroma, c_padding
add src0, src1, src_padding
add y1, y0, y_stride
add src1, src0, src_stride
subs height, height, #2
bgt 0b
epilogue
alias_loop_420sp 0
endfunc
.endm
/*
 * downsample: start the 2x2 chroma downsample with the first row.
 * Adds horizontally adjacent byte pairs of each 16-pixel component into
 * eight 16-bit sums.
 */
.macro downsample
vpaddl.u8 r16x8, r8x16
vpaddl.u8 g16x8, g8x16
vpaddl.u8 b16x8, b8x16
.endm
/*
 * accumulate and right shift by 2
 * Finish the 2x2 downsample with the second row: add its horizontal pair sums
 * onto the sums left by downsample, then divide by 4 with rounding.
 */
.macro downsample_ars2
vpadal.u8 r16x8, r8x16
vpadal.u8 g16x8, g8x16
vpadal.u8 b16x8, b8x16
vrshr.u16 r16x8, r16x8, #2
vrshr.u16 g16x8, g16x8, #2
vrshr.u16 b16x8, b16x8, #2
.endm
/*
 * store_y8_16x1 dst, count
 * Store the 16 luma bytes of y8x16 at dst.
 * Without count, dst advances by the 16 bytes written; with count (a register),
 * 16 bytes are still written but dst advances by count bytes only.
 */
.macro store_y8_16x1 dst, count
.ifc "\count",""
vstmia \dst!, {y8x16}
.else
vstmia \dst, {y8x16}
add \dst, \dst, \count
.endif
.endm
/*
 * store_chroma_nv12_8x1 dst, count
 * Store 8 interleaved chroma pairs in U,V order (16 bytes) at dst.
 * Without count, dst advances by the bytes written; with count (a register),
 * dst is post-incremented by count instead.
 */
.macro store_chroma_nv12_8x1 dst, count
.ifc "\count",""
vst2.i8 {u8x8, v8x8}, [\dst]!
.else
vst2.i8 {u8x8, v8x8}, [\dst], \count
.endif
.endm
/*
 * store_chroma_nv21_8x1 dst, count
 * Same as store_chroma_nv12_8x1 but with the pairs written in V,U order.
 */
.macro store_chroma_nv21_8x1 dst, count
.ifc "\count",""
vst2.i8 {v8x8, u8x8}, [\dst]!
.else
vst2.i8 {v8x8, u8x8}, [\dst], \count
.endif
.endm
/*
 * load_8888_16x1 a, b, c, d, src, count
 * Load 16 pixels of 4 bytes each (64 bytes) from src and de-interleave them
 * into the four component registers a8x16, b8x16, c8x16, d8x16, where
 * a..d name the components in memory order.
 * Without count, src advances by the 64 bytes read. With count (a register),
 * the same 64 bytes are read but src ends up advanced by count * 4 bytes:
 * the first load has already added 32, which is subtracted again.
 */
.macro load_8888_16x1 a, b, c, d, src, count
.ifc "\count",""
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
sub \src, \src, #32
add \src, \src, \count, LSL #2
.endif
.endm
/* load_rgbx_16x1 src, count: load 16 pixels stored in R,G,B,X byte order. */
.macro load_rgbx_16x1 src, count
load_8888_16x1 r, g, b, x, \src, \count
.endm
/* load_bgrx_16x1 src, count: load 16 pixels stored in B,G,R,X byte order. */
.macro load_bgrx_16x1 src, count
load_8888_16x1 b, g, r, x, \src, \count
.endm
/*
 * alias_src_rgbx / alias_src_bgrx set=1
 * Bind (set=1) or release (set=0) the component register names so that
 * q3..q6 follow the byte order of the source format.
 */
.macro alias_src_rgbx set=1
alias_src_8888 r, g, b, x, \set
.endm
.macro alias_src_bgrx set=1
alias_src_8888 b, g, r, x, \set
.endm
/*
 * alias_dst_nv12 / alias_dst_nv21 set=1
 * Bind (set=1) or release (set=0) u8x8 and v8x8 to the two halves of c8x8x2.
 * nv12 puts U in the low half, nv21 puts V in the low half.
 */
.macro alias_dst_nv12 set=1
alias u8x8, c8x8x2_l, \set
alias v8x8, c8x8x2_h, \set
.endm
.macro alias_dst_nv21 set=1
alias v8x8, c8x8x2_l, \set
alias u8x8, c8x8x2_h, \set
.endm
// common aliases
/*
 * Coefficient registers filled by the init macro of each precision variant.
 * CO_R / CO_G / CO_B each hold the weights of one colour component; the
 * .dn names select the 16-bit lane used as the scalar for the Y, U and V sums.
 */
alias CO_R d0
CO_RY .dn d0.s16[0]
CO_RU .dn d0.s16[1]
CO_RV .dn d0.s16[2]
alias CO_G d1
CO_GY .dn d1.s16[0]
CO_GU .dn d1.s16[1]
CO_GV .dn d1.s16[2]
alias CO_B d2
CO_BY .dn d2.s16[0]
CO_BU .dn d2.s16[1]
CO_BV .dn d2.s16[2]
/* output biases; U and V share one register because they use the same value */
alias BIAS_U, d3
alias BIAS_V, BIAS_U
alias BIAS_Y, q2
/* q3-q6 R8G8B8X8 x16 */
/*
 * alias_src_8888 a, b, c, d, set
 * Bind (set != 0) or release (set == 0) a8x16..d8x16, with their _l/_h halves,
 * to q3..q6 in that order; a..d are the component letters in memory order.
 */
.macro alias_src_8888 a, b, c, d, set
alias_qw \a\()8x16, q3, \set
alias_qw \b\()8x16, q4, \set
alias_qw \c\()8x16, q5, \set
alias_qw \d\()8x16, q6, \set
.endm
/*
 * kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
 * Convert a 16x2 pixel tile: 16 luma bytes for each of the two rows and one
 * row of 8 chroma pairs computed from the 2x2-averaged RGB.
 * rgb0/rgb1 and y0/y1 are the source and luma pointers of the two rows.
 * count is optional and is forwarded to the load/store macros, where it
 * replaces the default pointer advance.
 * The format-dependent register aliases are created on entry and removed on
 * exit, so the macro can be expanded for different formats in one file.
 * Relies on compute_y_16x1 and compute_chroma_8x1 from the including file.
 */
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
alias_src_\rgb_fmt
alias_dst_\yuv_fmt
/* first row: luma + start of the chroma downsample */
load_\rgb_fmt\()_16x1 \rgb0, \count
downsample
compute_y_16x1
store_y8_16x1 \y0, \count
/* second row: luma + end of the chroma downsample */
load_\rgb_fmt\()_16x1 \rgb1, \count
downsample_ars2
compute_y_16x1
store_y8_16x1 \y1, \count
/* chroma from the averaged components */
compute_chroma_8x1 u, U
compute_chroma_8x1 v, V
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
alias_dst_\yuv_fmt 0
alias_src_\rgb_fmt 0
.endm

47
externals/ffmpeg/libswscale/arm/swscale.c vendored Executable file
View File

@@ -0,0 +1,47 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"
/* NEON horizontal scaler for 8-bit input; implemented in arm/hscale.S. */
void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
/* NEON vertical filter producing 8-bit output; implemented in arm/output.S. */
void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
/**
 * Install the ARM NEON scaler routines into the context when the CPU
 * supports NEON and the conversion parameters fit what they handle.
 *
 * The horizontal scaler is installed only for 8-bit input, an output depth
 * of at most 14 bits, and luma/chroma filter sizes that are multiples of 8.
 * The vertical filter is installed for 8-bit output.
 *
 * NOTE(review): the assembly of ff_yuv2planeX_8_neon consumes filter taps in
 * pairs; no filter-size check is made here - confirm odd sizes are safe.
 */
av_cold void ff_sws_init_swscale_arm(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();
    int hscale_usable;

    if (!have_neon(cpu_flags))
        return;

    hscale_usable = c->srcBpc == 8 && c->dstBpc <= 14 &&
                    !(c->hLumFilterSize % 8) &&
                    !(c->hChrFilterSize % 8);
    if (hscale_usable) {
        c->hcScale = ff_hscale_8_to_15_neon;
        c->hyScale = ff_hscale_8_to_15_neon;
    }

    if (c->dstBpc == 8)
        c->yuv2planeX = ff_yuv2planeX_8_neon;
}

View File

@@ -0,0 +1,186 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"
#if HAVE_AS_DN_DIRECTIVE
/*
 * RGBX -> NV12 converters generated by the loop_420sp template in
 * rgb2yuv_neon_32.S / rgb2yuv_neon_16.S. The suffix is the width of the
 * intermediate arithmetic (32-bit or 16-bit accumulators).
 * coeff_tbl holds the nine RGB-to-YUV coefficients.
 */
extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
/**
 * Slice callback converting packed RGBX to NV12 with the 32-bit precision
 * NEON kernel. Always returns 0.
 *
 * NOTE(review): src[0] is advanced by srcSliceY rows here, whereas the
 * YUV/NV -> RGB wrappers in this file pass src[] unmodified and only offset
 * dst. Confirm which convention the caller uses for slice source pointers.
 */
static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
                                        int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[]) {
    const uint8_t *rgb_in = src[0] + srcSliceY * srcStride[0];
    uint8_t *luma_out     = dst[0] + srcSliceY * dstStride[0];
    /* chroma is vertically subsampled by two */
    uint8_t *chroma_out   = dst[1] + (srcSliceY / 2) * dstStride[1];

    rgbx_to_nv12_neon_32(rgb_in, luma_out, chroma_out,
                         context->srcW, srcSliceH,
                         dstStride[0], dstStride[1], srcStride[0],
                         context->input_rgb2yuv_table);
    return 0;
}
/**
 * Slice callback converting packed RGBX to NV12 with the 16-bit precision
 * NEON kernel. Always returns 0.
 *
 * NOTE(review): src[0] is advanced by srcSliceY rows here, whereas the
 * YUV/NV -> RGB wrappers in this file pass src[] unmodified and only offset
 * dst. Confirm which convention the caller uses for slice source pointers.
 */
static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
                                        int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[]) {
    const uint8_t *rgb_in = src[0] + srcSliceY * srcStride[0];
    uint8_t *luma_out     = dst[0] + srcSliceY * dstStride[0];
    /* chroma is vertically subsampled by two */
    uint8_t *chroma_out   = dst[1] + (srcSliceY / 2) * dstStride[1];

    rgbx_to_nv12_neon_16(rgb_in, luma_out, chroma_out,
                         context->srcW, srcSliceH,
                         dstStride[0], dstStride[1], srcStride[0],
                         context->input_rgb2yuv_table);
    return 0;
}
/*
 * Initializer list for the 4-entry coefficient table handed to the NEON
 * YUV -> RGB routines, in the lane order the assembly reads it:
 * v2r, u2g, v2g, u2b. Expects a SwsContext pointer named c in scope.
 * The last line intentionally has no continuation backslash so that the
 * definition cannot absorb whatever line follows it.
 */
#define YUV_TO_RGB_TABLE                                                                    \
        c->yuv2rgb_v2r_coeff,                                                               \
        c->yuv2rgb_u2g_coeff,                                                               \
        c->yuv2rgb_v2g_coeff,                                                               \
        c->yuv2rgb_u2b_coeff,
/*
 * Declare the NEON routine ff_<ifmt>_to_<ofmt>_neon for a three-plane YUV
 * input and define the matching slice callback <ifmt>_to_<ofmt>_neon_wrapper.
 * The wrapper builds the coefficient table on the stack, offsets only the
 * destination by srcSliceY rows, passes y_offset pre-shifted right by 6, and
 * always returns 0.
 * The closing brace intentionally has no continuation backslash so that the
 * definition cannot absorb whatever line follows it.
 */
#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt)                                           \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
                                 uint8_t *dst, int linesize,                                \
                                 const uint8_t *srcY, int linesizeY,                        \
                                 const uint8_t *srcU, int linesizeU,                        \
                                 const uint8_t *srcV, int linesizeV,                        \
                                 const int16_t *table,                                      \
                                 int y_offset,                                              \
                                 int y_coeff);                                              \
                                                                                            \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
                                           int srcStride[], int srcSliceY, int srcSliceH,   \
                                           uint8_t *dst[], int dstStride[]) {               \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
                                                                                            \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                        \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0],           \
                                 src[0], srcStride[0],                                      \
                                 src[1], srcStride[1],                                      \
                                 src[2], srcStride[2],                                      \
                                 yuv2rgb_table,                                             \
                                 c->yuv2rgb_y_offset >> 6,                                  \
                                 c->yuv2rgb_y_coeff);                                       \
                                                                                            \
    return 0;                                                                               \
}
/*
 * Declare the routine and wrapper for all four packed RGB output orders of
 * one planar YUV input format. The last line intentionally has no
 * continuation backslash: otherwise the first use below would become part of
 * the macro body instead of being expanded.
 */
#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)                                             \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb)                                                   \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)                                                   \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)                                                   \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)

DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
/*
 * Declare the NEON routine ff_<ifmt>_to_<ofmt>_neon for a semi-planar input
 * (one luma plane plus one interleaved chroma plane) and define the matching
 * slice callback <ifmt>_to_<ofmt>_neon_wrapper.
 * The wrapper builds the coefficient table on the stack, offsets only the
 * destination by srcSliceY rows, passes y_offset pre-shifted right by 6, and
 * always returns 0.
 * The closing brace intentionally has no continuation backslash so that the
 * definition cannot absorb whatever line follows it.
 */
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt)                                            \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
                                 uint8_t *dst, int linesize,                                \
                                 const uint8_t *srcY, int linesizeY,                        \
                                 const uint8_t *srcC, int linesizeC,                        \
                                 const int16_t *table,                                      \
                                 int y_offset,                                              \
                                 int y_coeff);                                              \
                                                                                            \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
                                           int srcStride[], int srcSliceY, int srcSliceH,   \
                                           uint8_t *dst[], int dstStride[]) {               \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
                                                                                            \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                        \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0],           \
                                 src[0], srcStride[0], src[1], srcStride[1],                \
                                 yuv2rgb_table,                                             \
                                 c->yuv2rgb_y_offset >> 6,                                  \
                                 c->yuv2rgb_y_coeff);                                       \
                                                                                            \
    return 0;                                                                               \
}
/*
 * Declare the routine and wrapper for all four packed RGB output orders of
 * one semi-planar input format. The last line intentionally has no
 * continuation backslash: otherwise the first use below would become part of
 * the macro body instead of being expanded.
 */
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)                                               \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)                                                     \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)                                                     \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)                                                     \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)

DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
/* We need a 16 pixel width alignment. This constraint can easily be removed
 * for input reading but for the output which is 4-bytes per pixel (RGBA) the
 * assembly might be writing as much as 4*15=60 extra bytes at the end of the
 * line, which won't fit the 32-bytes buffer alignment. */
/* Install <ifmt>_to_<ofmt>_neon_wrapper as c->swscale when the context's
 * source/destination formats match, the height is even, the width is a
 * multiple of 16 and accurate rounding was not requested. Expects a
 * SwsContext pointer named c in scope. */
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do { \
if (c->srcFormat == AV_PIX_FMT_##IFMT \
&& c->dstFormat == AV_PIX_FMT_##OFMT \
&& !(c->srcH & 1) \
&& !(c->srcW & 15) \
&& !accurate_rnd) { \
c->swscale = ifmt##_to_##ofmt##_neon_wrapper; \
} \
} while (0)
/* Try SET_FF_NVX_TO_RGBX_FUNC for each of the four packed RGB output orders
 * of one input format. Despite the NVX name it is also used for the planar
 * YUV formats, whose wrappers follow the same naming scheme. */
#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do { \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \
} while (0)
/**
 * Pick a NEON unscaled converter for the context, if one applies.
 *
 * RGBA -> NV12 is available for widths of at least 16 pixels; the 32-bit
 * precision kernel is used when SWS_ACCURATE_RND is set, the 16-bit one
 * otherwise. The NV12/NV21/YUV420P/YUV422P -> packed RGB converters are
 * installed by the SET_FF_NVX_TO_ALL_RGBX_FUNC macro under its own
 * conditions. c->swscale is left untouched when nothing matches.
 */
static void get_unscaled_swscale_neon(SwsContext *c) {
    int accurate_rnd = c->flags & SWS_ACCURATE_RND;
    int rgba_to_nv12 = c->srcFormat == AV_PIX_FMT_RGBA &&
                       c->dstFormat == AV_PIX_FMT_NV12;

    if (rgba_to_nv12 && c->srcW >= 16) {
        if (accurate_rnd)
            c->swscale = rgbx_to_nv12_neon_32_wrapper;
        else
            c->swscale = rgbx_to_nv12_neon_16_wrapper;
    }

    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
}
/**
 * ARM entry point for selecting an unscaled converter: defers to the NEON
 * selection when the CPU reports NEON support, otherwise does nothing.
 */
void ff_get_unscaled_swscale_arm(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (!have_neon(cpu_flags))
        return;

    get_unscaled_swscale_neon(c);
}
#else
/* Fallback when HAVE_AS_DN_DIRECTIVE is not set: no NEON converters are
 * built in this configuration, so the context is left unchanged. */
void ff_get_unscaled_swscale_arm(SwsContext *c)
{
}
#endif

280
externals/ffmpeg/libswscale/arm/yuv2rgb_neon.S vendored Executable file
View File

@@ -0,0 +1,280 @@
/*
* Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
* Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ compute_premult
@ Turn the pre-scaled chroma into the per-channel chroma contributions.
@ in:    q14 = U << 3, q15 = V << 3, q11 = 128 << 3,
@        d1 = coefficient table {v2r, u2g, v2g, u2b}
@ out:   q8 = red term, q9 = green term, q10 = blue term
@ clobb: q5, q14, q15
.macro compute_premult
vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3)
vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3)
vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r
vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g
vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g
vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g
vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b
.endm
@ compute_color dst_comp1 dst_comp2 pre
@ Produce one 8-bit colour channel for 16 pixels: add the chroma term pre to
@ the scaled luma of both pixel groups (q14, q15), then narrow with a rounding
@ right shift by 1 and unsigned saturation.
@ clobb: q1, q2
.macro compute_color dst_comp1 dst_comp2 pre
vadd.s16 q1, q14, \pre
vadd.s16 q2, q15, \pre
vqrshrun.s16 \dst_comp1, q1, #1
vqrshrun.s16 \dst_comp2, q2, #1
.endm
@ compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
@ Fill the D registers named by the arguments with the R, G, B channels
@ (from the q8, q9, q10 chroma terms) and a constant alpha of 255.
@ The *1 registers receive the pixel group in q14, the *2 registers the
@ group in q15. The caller chooses the registers to fix the byte order.
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
compute_color \r1, \r2, q8
compute_color \g1, \g2, q9
compute_color \b1, \b2, q10
vmov.u8 \a1, #255
vmov.u8 \a2, #255
.endm
@ compute dst ofmt
@ Convert 16 luma samples plus the precomputed chroma terms to 16 packed
@ 4-byte pixels and store them (64 bytes) at dst, post-incrementing dst.
@ in:    d14/d15 = luma of the even/odd pixels (loaded de-interleaved),
@        q12 = y_offset, q13 = y_coeff, q8-q10 = chroma terms
@ ofmt selects which of d6-d9 (and d10-d13) receives which channel, which
@ fixes the byte order in memory.
@ The stores carry a :128 alignment hint, so dst must be 16-byte aligned.
@ clobb: q1-q6, q14, q15
.macro compute dst ofmt
vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3)
vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3)
vsub.s16 q14, q12 @ q14 = (Y - y_offset)
vsub.s16 q15, q12 @ q15 = (Y - y_offset)
vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff
vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff
.ifc \ofmt,argb
compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
.endif
.ifc \ofmt,rgba
compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
.endif
.ifc \ofmt,abgr
compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
.endif
.ifc \ofmt,bgra
compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
.endif
@ re-interleave the even and odd pixel groups back into pixel order;
@ the R/G/B/A letters below match the rgba layout
vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
vst4.8 {q3, q4}, [\dst,:128]!
vst4.8 {q5, q6}, [\dst,:128]!
.endm
@ process_1l_internal dst src ofmt
@ Load 16 luma bytes from src (post-incremented), split into even and odd
@ pixels, and convert them to 16 output pixels at dst.
.macro process_1l_internal dst src ofmt
vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved)
compute \dst, \ofmt
.endm
@ process_1l ofmt
@ Convert 16 pixels of a single line: chroma terms, then the line at
@ dst = r2 / srcY = r4.
.macro process_1l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
.endm
@ process_2l ofmt
@ Convert 16 pixels on two consecutive lines that share one chroma line:
@ the chroma terms are computed once and reused for dst = r2 / srcY = r4 and
@ for dst2 = r11 / srcY2 = r12.
.macro process_2l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
process_1l_internal r11,r12,\ofmt
.endm
@ load_args_nv12
@ Function entry for the semi-planar inputs: save r4-r12, lr and q4-q7
@ (104 bytes), fetch the stack arguments and derive the second-line pointers
@ and the per-row-pair paddings.
@ in:  r0 = width, r1 = height, r2 = dst, r3 = linesize
@ out: r4 = srcY, r6 = srcC, r9 = y_offset, r11 = dst2, r12 = srcY2,
@      r3 / r5 / r7 = dst / luma / chroma padding, d0 = y_coeff, d1 = table
.macro load_args_nv12
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcC
ldr r7, [sp, #116] @ r7 = linesizeC
ldr r8, [sp, #120] @ r8 = table
ldr r9, [sp, #124] @ r9 = y_offset
ldr r10,[sp, #128] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
.endm
@ load_args_nv21: the argument layout is the same as for nv12.
.macro load_args_nv21
load_args_nv12
.endm
@ load_args_yuv420p
@ Function entry for planar 4:2:0 input: save r4-r12, lr and q4-q7
@ (104 bytes), fetch the stack arguments and derive the second-line pointers
@ and the per-row-pair paddings.
@ in:  r0 = width, r1 = height, r2 = dst, r3 = linesize
@ out: r4 = srcY, r6 = srcU, r10 = srcV, r9 = y_offset, r11 = dst2,
@      r12 = srcY2, r3 / r5 = dst / luma padding, d0 = y_coeff, d1 = table
@ The chroma line sizes are not kept in registers; increment_and_test_yuv420p
@ reloads them from the stack.
.macro load_args_yuv420p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
ldr r10,[sp, #120] @ r10 = srcV (y_coeff is already in d0)
.endm
@ load_args_yuv422p
@ Function entry for planar 4:2:2 input, which is processed one line at a
@ time: save r4-r12, lr and q4-q7 (104 bytes), fetch the stack arguments and
@ derive the per-row paddings.
@ in:  r0 = width, r1 = height, r2 = dst, r3 = linesize
@ out: r4 = srcY, r6 = srcU, r10 = srcV, r9 = y_offset,
@      r3 / r5 / r7 / r12 = dst / luma / U / V padding,
@      d0 = y_coeff, d1 = table
.macro load_args_yuv422p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r7, [sp, #116] @ r7 = linesizeU
ldr r12,[sp, #124] @ r12 = linesizeV
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
ldr r10,[sp, #120] @ r10 = srcV (y_coeff is already in d0)
.endm
@ load_chroma_nv12
@ Load 8 interleaved chroma pairs from srcC (r6, post-incremented),
@ de-interleave them and pre-scale by 8. nv12 stores U first.
@ Also prefetches ahead on the second luma line (r12).
.macro load_chroma_nv12
pld [r12, #64*3]
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
@ load_chroma_nv21
@ Same as load_chroma_nv12, but nv21 stores V first, so the de-interleaved
@ registers are swapped when widening.
.macro load_chroma_nv21
pld [r12, #64*3]
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3)
.endm
@ load_chroma_yuv420p
@ Load 8 U samples from srcU (r6) and 8 V samples from srcV (r10), both
@ post-incremented, and pre-scale them by 8.
@ Also prefetches ahead on the V plane (r10) and the second luma line (r12).
.macro load_chroma_yuv420p
pld [r10, #64*3]
pld [r12, #64*3]
vld1.8 d2, [r6]! @ d2: U chroma line
vld1.8 d3, [r10]! @ d3: V chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
@ load_chroma_yuv422p
@ Load 8 U samples from srcU (r6) and 8 V samples from srcV (r10), both
@ post-incremented, and pre-scale them by 8. Prefetches ahead on the V plane.
.macro load_chroma_yuv422p
pld [r10, #64*3]
vld1.8 d2, [r6]! @ d2: U chroma line
vld1.8 d3, [r10]! @ d3: V chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
@ increment_and_test_nv12
@ End-of-row-pair bookkeeping: skip the paddings of the second-line pointers
@ and of the chroma pointer, then count down two rows. The flags set by the
@ final subs are consumed by the branch that follows the macro.
.macro increment_and_test_nv12
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
add r6, r6, r7 @ srcC += paddingC
subs r1, r1, #2 @ height -= 2
.endm
@ increment_and_test_nv21: identical bookkeeping to nv12.
.macro increment_and_test_nv21
increment_and_test_nv12
.endm
@ increment_and_test_yuv420p
@ End-of-row-pair bookkeeping for planar 4:2:0. The chroma line sizes are
@ reloaded from the stack on every row pair (same sp offsets as in
@ load_args_yuv420p, since nothing is pushed in between), using r7 as
@ scratch. The flags set by the final subs are consumed by the branch that
@ follows the macro.
.macro increment_and_test_yuv420p
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
ldr r7, [sp, #116] @ r7 = linesizeU
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
add r6, r6, r7 @ srcU += paddingU
ldr r7, [sp, #124] @ r7 = linesizeV
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
add r10, r10, r7 @ srcV += paddingV
subs r1, r1, #2 @ height -= 2
.endm
@ increment_and_test_yuv422p
@ End-of-row bookkeeping for planar 4:2:2: every luma line has its own chroma
@ line, so rows are counted down one at a time. The flags set by the final
@ subs are consumed by the branch that follows the macro.
.macro increment_and_test_yuv422p
add r6, r6, r7 @ srcU += paddingU
add r10,r10,r12 @ srcV += paddingV
subs r1, r1, #1 @ height -= 1
.endm
@ process_<ifmt> ofmt
@ Per-format dispatch: the vertically subsampled formats (nv12, nv21,
@ yuv420p) convert two lines per chroma line, yuv422p converts one.
.macro process_nv12 ofmt
process_2l \ofmt
.endm
.macro process_nv21 ofmt
process_2l \ofmt
.endm
.macro process_yuv420p ofmt
process_2l \ofmt
.endm
.macro process_yuv422p ofmt
process_1l \ofmt
.endm
@ declare_func ifmt ofmt
@ Emit the exported function ff_<ifmt>_to_<ofmt>_neon.
@ The per-format macros load_args_<ifmt>, load_chroma_<ifmt>, process_<ifmt>
@ and increment_and_test_<ifmt> supply the format-specific parts; this macro
@ provides the common constants and the two nested loops:
@   label 1 = per row (or row pair), label 2 = per group of 16 pixels.
@ increment_and_test_<ifmt> must leave the flags of its height countdown set
@ for the branch back to label 1.
@ The register save area is pushed by load_args_<ifmt> and popped here.
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    load_args_\ifmt
    vmov.u16            q11, #1024              @ q11 = 128 * (1 << 3)
    vdup.16             q12, r9                 @ q12 = y_offset
    vmov                d26, d0                 @ q13 = y_coeff
    vmov                d27, d0                 @ q13 = y_coeff
1:
    mov                 r8, r0                  @ r8 = width
2:
    pld [r6, #64*3]
    pld [r4, #64*3]
    vmov.i8             d10, #128               @ NOTE(review): q5 appears to be rewritten by
                                                @ compute_premult before d10 is read; confirm
    load_chroma_\ifmt
    process_\ifmt \ofmt
    subs                r8, r8, #16             @ width -= 16
    bgt                 2b
    add                 r2, r2, r3              @ dst += padding
    add                 r4, r4, r5              @ srcY += paddingY
    increment_and_test_\ifmt
    bgt                 1b
    vpop                {q4-q7}
    pop                 {r4-r12, pc}            @ restore and return in one step; loading pc from the
                                                @ stack interworks, unlike mov pc, lr in Thumb state
endfunc
.endm
@ declare_rgb_funcs ifmt
@ Emit the converters from ifmt to each of the four packed output orders.
.macro declare_rgb_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
@ Instantiate the 16 exported converters: 4 input formats x 4 output orders.
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p