early-access version 1432
externals/ffmpeg/libswscale/arm/Makefile  (vendored, executable file, +8)
@@ -0,0 +1,8 @@
OBJS        += arm/swscale.o                \
               arm/swscale_unscaled.o       \

NEON-OBJS   += arm/rgb2yuv_neon_32.o
NEON-OBJS   += arm/rgb2yuv_neon_16.o
NEON-OBJS   += arm/hscale.o                 \
               arm/output.o                 \
               arm/yuv2rgb_neon.o           \
externals/ffmpeg/libswscale/arm/hscale.S  (vendored, executable file, +70)
@@ -0,0 +1,70 @@
/*
 * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
 * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

function ff_hscale_8_to_15_neon, export=1
    push            {r4-r12, lr}
    vpush           {q4-q7}
    ldr             r4, [sp, #104]          @ filter
    ldr             r5, [sp, #108]          @ filterPos
    ldr             r6, [sp, #112]          @ filterSize
    add             r10, r4, r6, lsl #1     @ filter2 = filter + filterSize * 2
1:  ldr             r8, [r5], #4            @ filterPos[0]
    ldr             r9, [r5], #4            @ filterPos[1]
    vmov.s32        q4, #0                  @ val accumulator
    vmov.s32        q5, #0                  @ val accumulator
    mov             r7, r6                  @ tmpfilterSize = filterSize
    mov             r0, r3                  @ srcp
2:  add             r11, r0, r8             @ srcp + filterPos[0]
    add             r12, r0, r9             @ srcp + filterPos[1]
    vld1.8          d0, [r11]               @ srcp[filterPos[0] + {0..7}]
    vld1.8          d2, [r12]               @ srcp[filterPos[1] + {0..7}]
    vld1.16         {q2}, [r4]!             @ load 8x16-bit filter values
    vld1.16         {q3}, [r10]!            @ load 8x16-bit filter values
    vmovl.u8        q0, d0                  @ unpack src values to 16-bit
    vmovl.u8        q1, d2                  @ unpack src values to 16-bit
    vmull.s16       q8, d0, d4              @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
    vmull.s16       q9, d1, d5              @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
    vmull.s16       q10, d2, d6             @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
    vmull.s16       q11, d3, d7             @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
    vpadd.s32       d16, d16, d17           @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
    vpadd.s32       d17, d18, d19           @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
    vpadd.s32       d20, d20, d21           @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
    vpadd.s32       d21, d22, d23           @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
    vadd.s32        q4, q8                  @ update val accumulator
    vadd.s32        q5, q10                 @ update val accumulator
    add             r0, #8                  @ srcp += 8
    subs            r7, #8                  @ tmpfilterSize -= 8
    bgt             2b                      @ loop until tmpfilterSize is consumed
    mov             r4, r10                 @ filter = filter2
    add             r10, r10, r6, lsl #1    @ filter2 += filterSize * 2
    vpadd.s32       d8, d8, d9              @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1)
    vpadd.s32       d9, d10, d11            @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2)
    vpadd.s32       d8, d8, d9              @ horizontal pair adding of the 4x32-bit sums into 2x32-bit
    vqshrn.s32      d8, q4, #7              @ shift and clip the 2x16-bit final values
    vst1.32         {d8[0]}, [r1]!          @ write destination
    subs            r2, #2                  @ dstW -= 2
    bgt             1b                      @ loop until end of line
    vpop            {q4-q7}
    pop             {r4-r12, lr}
    mov             pc, lr
endfunc
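For readers who want the semantics without the NEON details: per output sample the routine computes a filterSize-tap dot product of 8-bit source pixels against 16-bit coefficients and narrows the result to a 15-bit intermediate. A scalar sketch in the spirit of libswscale's generic C path (the helper name is illustrative, not part of this commit):

    #include <stdint.h>

    /* Sketch only: what ff_hscale_8_to_15_neon computes, two outputs and
     * eight taps at a time in the asm above. The >> 7 plus saturation
     * mirrors the vqshrn.s32 #7 narrowing. */
    static void hscale_8_to_15_ref(int16_t *dst, int dstW, const uint8_t *src,
                                   const int16_t *filter,
                                   const int32_t *filterPos, int filterSize)
    {
        for (int i = 0; i < dstW; i++) {
            int val = 0;
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[i * filterSize + j];
            int v = val >> 7;
            dst[i] = v > 32767 ? 32767 : (int16_t)v;   /* saturate to 15 bits */
        }
    }

Note that the dispatch code in swscale.c below only installs the NEON version when the horizontal filter sizes are multiples of 8, which matches the unrolling factor of the inner loop here.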
externals/ffmpeg/libswscale/arm/output.S  (vendored, executable file, +78)
@@ -0,0 +1,78 @@
/*
 * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
 * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

function ff_yuv2planeX_8_neon, export=1
    push            {r4-r12, lr}
    vpush           {q4-q7}
    ldr             r4, [sp, #104]          @ dstW
    ldr             r5, [sp, #108]          @ dither
    ldr             r6, [sp, #112]          @ offset
    vld1.8          {d0}, [r5]              @ load 8x8-bit dither values
    cmp             r6, #0                  @ check the offset, which can only be 0 or 3
    beq             1f
    vext.u8         d0, d0, d0, #3          @ honor the offset, which can only be 3 here
1:  vmovl.u8        q0, d0                  @ extend dither to 16-bit
    vshll.u16       q1, d0, #12             @ extend dither to 32-bit with left shift by 12 (part 1)
    vshll.u16       q2, d1, #12             @ extend dither to 32-bit with left shift by 12 (part 2)
    mov             r7, #0                  @ i = 0
2:  vmov.u8         q3, q1                  @ initialize accumulator with dithering values (part 1)
    vmov.u8         q4, q2                  @ initialize accumulator with dithering values (part 2)
    mov             r8, r1                  @ tmpFilterSize = filterSize
    mov             r9, r2                  @ srcp
    mov             r10, r0                 @ filterp
3:  ldr             r11, [r9], #4           @ get pointer @ src[j]
    ldr             r12, [r9], #4           @ get pointer @ src[j+1]
    add             r11, r11, r7, lsl #1    @ &src[j][i]
    add             r12, r12, r7, lsl #1    @ &src[j+1][i]
    vld1.16         {q5}, [r11]             @ read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
    vld1.16         {q6}, [r12]             @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
    ldr             r11, [r10], #4          @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
    vmov.16         q7, q5                  @ copy 8x16-bit @ src[j  ][i + {0..7}] for following inplace zip instruction
    vmov.16         q8, q6                  @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction
    vzip.16         q7, q8                  @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
    vdup.32         q15, r11                @ X,Y,X,Y,X,Y,X,Y
    vmull.s16       q9, d14, d30            @ A*X,I*Y,B*X,J*Y
    vmull.s16       q10, d15, d31           @ C*X,K*Y,D*X,L*Y
    vmull.s16       q11, d16, d30           @ E*X,M*Y,F*X,N*Y
    vmull.s16       q12, d17, d31           @ G*X,O*Y,H*X,P*Y
    vpadd.s32       d10, d18, d19           @ A*X+I*Y,B*X+J*Y
    vpadd.s32       d11, d20, d21           @ C*X+K*Y,D*X+L*Y
    vpadd.s32       d12, d22, d23           @ E*X+M*Y,F*X+N*Y
    vpadd.s32       d13, d24, d25           @ G*X+O*Y,H*X+P*Y
    vadd.s32        q3, q5                  @ update val accumulator (part 1)
    vadd.s32        q4, q6                  @ update val accumulator (part 2)
    subs            r8, #2                  @ tmpFilterSize -= 2
    bgt             3b                      @ loop until filterSize is consumed
    vshr.s32        q3, q3, #19             @ val >> 19 (part 1)
    vshr.s32        q4, q4, #19             @ val >> 19 (part 2)
    vqmovun.s32     d6, q3                  @ clip16(val >> 19) (part 1)
    vqmovun.s32     d7, q4                  @ clip16(val >> 19) (part 2)
    vqmovn.u16      d6, q3                  @ merge part 1 and part 2
    vst1.8          {d6}, [r3]!             @ write destination
    add             r7, #8                  @ i += 8
    subs            r4, r4, #8              @ dstW -= 8
    bgt             2b                      @ loop until width is consumed
    vpop            {q4-q7}
    pop             {r4-r12, lr}
    mov             pc, lr
endfunc
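Again as a reading aid, here is the scalar shape of this vertical pass, modeled on libswscale's yuv2planeX_8_c (the clip helper is illustrative):

    #include <stdint.h>

    static inline uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Sketch only: the accumulator starts from the dither byte shifted up
     * by 12 (the vshll #12 above), sums filterSize vertical taps, then
     * shifts down by 19 and clips, matching the vshr #19 / vqmovun /
     * vqmovn chain. */
    static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                                 const int16_t **src, uint8_t *dest, int dstW,
                                 const uint8_t *dither, int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = dither[(i + offset) & 7] << 12;
            for (int j = 0; j < filterSize; j++)
                val += src[j][i] * filter[j];
            dest[i] = clip_uint8(val >> 19);
        }
    }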
externals/ffmpeg/libswscale/arm/rgb2yuv_neon_16.S  (vendored, executable file, +83)
@@ -0,0 +1,83 @@
/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_AS_DN_DIRECTIVE
#include "rgb2yuv_neon_common.S"

/* downsampled R16G16B16 x8 */
alias_qw    r16x8,  q7
alias_qw    g16x8,  q8
alias_qw    b16x8,  q9

alias       n16x16_l, q11
alias       n16x16_h, q12

alias       y16x16_l, q13
alias       y16x16_h, q14

alias_qw    y8x16,  q15

.macro init src
    vld3.i32    {q13_l, q14_l, q15_l},          [\src]!
    vld3.i32    {q13_h[0], q14_h[0], q15_h[0]}, [\src]
    vrshrn.i32  CO_R, q13, #7
    vrshrn.i32  CO_G, q14, #7
    vrshrn.i32  CO_B, q15, #7

    vmov.u8     BIAS_Y, #16
    vmov.u8     BIAS_U, #128
.endm


.macro compute_y_16x1_step action, s8x16, coeff
    vmovl.u8    n16x16_l, \s8x16\()_l
    vmovl.u8    n16x16_h, \s8x16\()_h

    \action     y16x16_l, n16x16_l, \coeff
    \action     y16x16_h, n16x16_h, \coeff
.endm

.macro compute_y_16x1
    compute_y_16x1_step vmul, r8x16, CO_RY
    compute_y_16x1_step vmla, g8x16, CO_GY
    compute_y_16x1_step vmla, b8x16, CO_BY

    vrshrn.i16  y8x16_l, y16x16_l, #8
    vrshrn.i16  y8x16_h, y16x16_h, #8

    vadd.u8     y8x16, y8x16, BIAS_Y
.endm

alias       c16x8,  q15
alias_qw    c8x8x2, q10


.macro compute_chroma_8x1 c, C
    vmul        c16x8, r16x8, CO_R\C
    vmla        c16x8, g16x8, CO_G\C
    vmla        c16x8, b16x8, CO_B\C

    vrshrn.i16  \c\()8x8, c16x8, #8
    vadd.u8     \c\()8x8, \c\()8x8, BIAS_\C
.endm

loop_420sp  rgbx, nv12, init, kernel_420_16x2, 16
#endif
externals/ffmpeg/libswscale/arm/rgb2yuv_neon_32.S  (vendored, executable file, +122)
@@ -0,0 +1,122 @@
/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_AS_DN_DIRECTIVE
#include "rgb2yuv_neon_common.S"

/* downsampled R16G16B16 x8 */
alias_qw    r16x8,  q7
alias_qw    g16x8,  q8
alias_qw    b16x8,  q9

alias       n16x16_o,  q11
alias       n16x16_ol, q11_l
alias       n16x16_oh, q11_h

alias       y32x16_el, q12
alias       y32x16_eh, q13
alias       y32x16_ol, q14
alias       y32x16_oh, q15

alias       y16x16_e,  q12
alias       y16x16_el, q12_l
alias       y16x16_eh, q12_h
alias       y16x16_o,  q13
alias       y16x16_ol, q13_l
alias       y16x16_oh, q13_h


alias       y8x16,  y16x16_e


.macro init src
    // load s32x3x3, narrow to s16x3x3
    vld3.i32    {q13_l, q14_l, q15_l},          [\src]!
    vld3.i32    {q13_h[0], q14_h[0], q15_h[0]}, [\src]

    vmovn.i32   CO_R, q13
    vmovn.i32   CO_G, q14
    vmovn.i32   CO_B, q15

    vmov.u8     BIAS_Y, #16
    vmov.u8     BIAS_U, #128
.endm


.macro compute_y_16x1_step action, s8x16, coeff
    vmov.u8     n16x16_o, #0
    vtrn.u8     \s8x16, n16x16_o

    \action     y32x16_el, \s8x16\()_l, \coeff
    \action     y32x16_eh, \s8x16\()_h, \coeff
    \action     y32x16_ol, n16x16_ol,   \coeff
    \action     y32x16_oh, n16x16_oh,   \coeff
.endm

/*
 * in:      r8x16, g8x16, b8x16
 * out:     y8x16
 * clobber: q11-q15, r8x16, g8x16, b8x16
 */
.macro compute_y_16x1
    compute_y_16x1_step vmull, r8x16, CO_RY
    compute_y_16x1_step vmlal, g8x16, CO_GY
    compute_y_16x1_step vmlal, b8x16, CO_BY

    vrshrn.i32  y16x16_el, y32x16_el, #15
    vrshrn.i32  y16x16_eh, y32x16_eh, #15
    vrshrn.i32  y16x16_ol, y32x16_ol, #15
    vrshrn.i32  y16x16_oh, y32x16_oh, #15

    vtrn.8      y16x16_e, y16x16_o
    vadd.u8     y8x16, y8x16, BIAS_Y
.endm

alias       c32x8_l, q14
alias       c32x8_h, q15

alias_qw    c16x8,  q13
alias_qw    c8x8x2, q10

.macro compute_chroma_8x1_step action, s16x8, coeff
    \action     c32x8_l, \s16x8\()_l, \coeff
    \action     c32x8_h, \s16x8\()_h, \coeff
.endm

/*
 * in:      r16x8, g16x8, b16x8
 * out:     c8x8
 * clobber: q14-q15
 */
.macro compute_chroma_8x1 c, C
    compute_chroma_8x1_step vmull, r16x8, CO_R\C
    compute_chroma_8x1_step vmlal, g16x8, CO_G\C
    compute_chroma_8x1_step vmlal, b16x8, CO_B\C

    vrshrn.i32  c16x8_l, c32x8_l, #15
    vrshrn.i32  c16x8_h, c32x8_h, #15
    vmovn.i16   \c\()8x8, c16x8
    vadd.u8     \c\()8x8, \c\()8x8, BIAS_\C
.endm


loop_420sp  rgbx, nv12, init, kernel_420_16x2, 32
#endif
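The two files implement the same RGBX-to-NV12 kernel at different precisions: rgb2yuv_neon_16.S narrows the 32-bit table entries up front with a rounding shift (vrshrn.i32 #7) and accumulates in 16 bits, while rgb2yuv_neon_32.S keeps full 16-bit coefficients (vmovn.i32) and accumulates in 32 bits before the rounding narrow by 15. In scalar terms the 32-bit path computes roughly the sketch below; the table layout and ~Q15 scaling are inferred from the vld3.i32/vmovn sequence in init and are assumptions here, not something this commit states:

    #include <stdint.h>

    /* Sketch only, assuming coeff_table is laid out
     * {RY,GY,BY, RU,GU,BU, RV,GV,BV} in ~Q15 fixed point (each entry must
     * fit in int16_t, since the asm narrows it with vmovn.i32). */
    static uint8_t px_luma(const int32_t *t, int r, int g, int b)
    {
        /* vmull/vmlal accumulate in 32 bits; vrshrn #15 rounds to nearest */
        return (uint8_t)(((t[0] * r + t[1] * g + t[2] * b + (1 << 14)) >> 15) + 16);
    }

    static uint8_t px_chroma(const int32_t *t, int r, int g, int b)
    {
        /* r, g, b here are the 2x2 box averages produced by downsample_ars2 */
        return (uint8_t)(((t[0] * r + t[1] * g + t[2] * b + (1 << 14)) >> 15) + 128);
    }

so U would be px_chroma(coeff + 3, ...) and V px_chroma(coeff + 6, ...) under that assumed layout.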
externals/ffmpeg/libswscale/arm/rgb2yuv_neon_common.S  (vendored, executable file, +291)
@@ -0,0 +1,291 @@
/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

.macro alias name, tgt, set=1
.if \set != 0
\name   .req    \tgt
.else
.unreq  \name
.endif
.endm

.altmacro

.macro alias_dw_all qw, dw_l, dw_h
    alias   q\qw\()_l, d\dw_l
    alias   q\qw\()_h, d\dw_h
.if \qw < 15
    alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm

alias_dw_all 0, 0, 1

.noaltmacro

.macro alias_qw name, qw, set=1
    alias   \name\(),   \qw,      \set
    alias   \name\()_l, \qw\()_l, \set
    alias   \name\()_h, \qw\()_h, \set
.endm

.macro prologue
    push    {r4-r12, lr}
    vpush   {q4-q7}
.endm

.macro epilogue
    vpop    {q4-q7}
    pop     {r4-r12, pc}
.endm

.macro load_arg reg, ix
    ldr     \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm


/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
 *               int width, int height,
 *               int y_stride, int c_stride, int src_stride,
 *               int32_t coeff_table[9]);
 */
.macro alias_loop_420sp set=1
    alias   src,         r0,  \set
    alias   src0,        src, \set
    alias   y,           r1,  \set
    alias   y0,          y,   \set
    alias   chroma,      r2,  \set
    alias   width,       r3,  \set
    alias   header,      width, \set

    alias   height,      r4,  \set
    alias   y_stride,    r5,  \set
    alias   c_stride,    r6,  \set
    alias   c_padding,   c_stride, \set
    alias   src_stride,  r7,  \set

    alias   y0_end,      r8,  \set

    alias   src_padding, r9,  \set
    alias   y_padding,   r10, \set

    alias   src1,        r11, \set
    alias   y1,          r12, \set

    alias   coeff_table, r12, \set
.endm


.macro loop_420sp s_fmt, d_fmt, init, kernel, precision

function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
    prologue

    alias_loop_420sp

    load_arg    height,      4
    load_arg    y_stride,    5
    load_arg    c_stride,    6
    load_arg    src_stride,  7
    load_arg    coeff_table, 8

    \init       coeff_table

    sub         y_padding,   y_stride,   width
    sub         c_padding,   c_stride,   width
    sub         src_padding, src_stride, width, LSL #2

    add         y0_end, y0, width
    and         header, width, #15

    add         y1,   y0,   y_stride
    add         src1, src0, src_stride

0:
    cmp         header, #0
    beq         1f

    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header

1:
    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma

    cmp         y0, y0_end
    blt         1b
2:
    add         y0,     y1,     y_padding
    add         y0_end, y1,     y_stride
    add         chroma, chroma, c_padding
    add         src0,   src1,   src_padding

    add         y1,   y0,   y_stride
    add         src1, src0, src_stride

    subs        height, height, #2

    bgt         0b

    epilogue

    alias_loop_420sp 0

endfunc
.endm

.macro downsample
    vpaddl.u8   r16x8, r8x16
    vpaddl.u8   g16x8, g8x16
    vpaddl.u8   b16x8, b8x16
.endm


/* accumulate and right shift by 2 */
.macro downsample_ars2
    vpadal.u8   r16x8, r8x16
    vpadal.u8   g16x8, g8x16
    vpadal.u8   b16x8, b8x16

    vrshr.u16   r16x8, r16x8, #2
    vrshr.u16   g16x8, g16x8, #2
    vrshr.u16   b16x8, b16x8, #2
.endm

.macro store_y8_16x1 dst, count
.ifc "\count",""
    vstmia      \dst!, {y8x16}
.else
    vstmia      \dst, {y8x16}
    add         \dst, \dst, \count
.endif
.endm

.macro store_chroma_nv12_8x1 dst, count
.ifc "\count",""
    vst2.i8     {u8x8, v8x8}, [\dst]!
.else
    vst2.i8     {u8x8, v8x8}, [\dst], \count
.endif
.endm

.macro store_chroma_nv21_8x1 dst, count
.ifc "\count",""
    vst2.i8     {v8x8, u8x8}, [\dst]!
.else
    vst2.i8     {v8x8, u8x8}, [\dst], \count
.endif
.endm

.macro load_8888_16x1 a, b, c, d, src, count
.ifc "\count",""
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
    sub         \src, \src, #32
    add         \src, \src, \count, LSL #2
.endif
.endm

.macro load_rgbx_16x1 src, count
    load_8888_16x1 r, g, b, x, \src, \count
.endm

.macro load_bgrx_16x1 src, count
    load_8888_16x1 b, g, r, x, \src, \count
.endm

.macro alias_src_rgbx set=1
    alias_src_8888 r, g, b, x, \set
.endm

.macro alias_src_bgrx set=1
    alias_src_8888 b, g, r, x, \set
.endm

.macro alias_dst_nv12 set=1
    alias   u8x8, c8x8x2_l, \set
    alias   v8x8, c8x8x2_h, \set
.endm

.macro alias_dst_nv21 set=1
    alias   v8x8, c8x8x2_l, \set
    alias   u8x8, c8x8x2_h, \set
.endm


// common aliases

alias   CO_R    d0
CO_RY   .dn     d0.s16[0]
CO_RU   .dn     d0.s16[1]
CO_RV   .dn     d0.s16[2]

alias   CO_G    d1
CO_GY   .dn     d1.s16[0]
CO_GU   .dn     d1.s16[1]
CO_GV   .dn     d1.s16[2]

alias   CO_B    d2
CO_BY   .dn     d2.s16[0]
CO_BU   .dn     d2.s16[1]
CO_BV   .dn     d2.s16[2]

alias   BIAS_U, d3
alias   BIAS_V, BIAS_U

alias   BIAS_Y, q2


/* q3-q6 R8G8B8X8 x16 */

.macro alias_src_8888 a, b, c, d, set
    alias_qw \a\()8x16, q3, \set
    alias_qw \b\()8x16, q4, \set
    alias_qw \c\()8x16, q5, \set
    alias_qw \d\()8x16, q6, \set
.endm

.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
    alias_src_\rgb_fmt
    alias_dst_\yuv_fmt

    load_\rgb_fmt\()_16x1 \rgb0, \count

    downsample
    compute_y_16x1
    store_y8_16x1 \y0, \count


    load_\rgb_fmt\()_16x1 \rgb1, \count
    downsample_ars2
    compute_y_16x1
    store_y8_16x1 \y1, \count

    compute_chroma_8x1 u, U
    compute_chroma_8x1 v, V

    store_chroma_\yuv_fmt\()_8x1 \chroma, \count

    alias_dst_\yuv_fmt 0
    alias_src_\rgb_fmt 0
.endm
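One piece of this file that is easy to misread is the .altmacro recursion in alias_dw_all. It runs once at assembly time and simply gives every q register named handles for its two d-register halves, expanding to the equivalent of:

    q0_l    .req    d0      @ low half of q0
    q0_h    .req    d1      @ high half of q0
    q1_l    .req    d2
    q1_h    .req    d3
    @ ... continuing in steps of two, up to:
    q15_l   .req    d30
    q15_h   .req    d31

which is what lets alias_qw hand out matching _l/_h names for whole q registers in the per-format kernels.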
externals/ffmpeg/libswscale/arm/swscale.c  (vendored, executable file, +47)
@@ -0,0 +1,47 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"

void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
                            const uint8_t *src, const int16_t *filter,
                            const int32_t *filterPos, int filterSize);

void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset);

av_cold void ff_sws_init_swscale_arm(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        if (c->srcBpc == 8 && c->dstBpc <= 14 &&
            (c->hLumFilterSize % 8) == 0 &&
            (c->hChrFilterSize % 8) == 0)
        {
            c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
        }
        if (c->dstBpc == 8) {
            c->yuv2planeX = ff_yuv2planeX_8_neon;
        }
    }
}
externals/ffmpeg/libswscale/arm/swscale_unscaled.c  (vendored, executable file, +186)
@@ -0,0 +1,186 @@
/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"

#if HAVE_AS_DN_DIRECTIVE
extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
                                 int width, int height,
                                 int y_stride, int c_stride, int src_stride,
                                 int32_t coeff_tbl[9]);

extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
                                 int width, int height,
                                 int y_stride, int c_stride, int src_stride,
                                 int32_t coeff_tbl[9]);

static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
                                        int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[]) {

    rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
                         dst[0] + srcSliceY * dstStride[0],
                         dst[1] + (srcSliceY / 2) * dstStride[1],
                         context->srcW, srcSliceH,
                         dstStride[0], dstStride[1], srcStride[0],
                         context->input_rgb2yuv_table);

    return 0;
}

static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
                                        int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[]) {

    rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
                         dst[0] + srcSliceY * dstStride[0],
                         dst[1] + (srcSliceY / 2) * dstStride[1],
                         context->srcW, srcSliceH,
                         dstStride[0], dstStride[1], srcStride[0],
                         context->input_rgb2yuv_table);

    return 0;
}

#define YUV_TO_RGB_TABLE                                                            \
        c->yuv2rgb_v2r_coeff,                                                       \
        c->yuv2rgb_u2g_coeff,                                                       \
        c->yuv2rgb_v2g_coeff,                                                       \
        c->yuv2rgb_u2b_coeff,                                                       \

#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt)                                   \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                      \
                                 uint8_t *dst, int linesize,                        \
                                 const uint8_t *srcY, int linesizeY,                \
                                 const uint8_t *srcU, int linesizeU,                \
                                 const uint8_t *srcV, int linesizeV,                \
                                 const int16_t *table,                              \
                                 int y_offset,                                      \
                                 int y_coeff);                                      \
                                                                                    \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],    \
                                           int srcStride[], int srcSliceY, int srcSliceH, \
                                           uint8_t *dst[], int dstStride[]) {      \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                           \
                                                                                    \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0],   \
                                 src[0], srcStride[0],                              \
                                 src[1], srcStride[1],                              \
                                 src[2], srcStride[2],                              \
                                 yuv2rgb_table,                                     \
                                 c->yuv2rgb_y_offset >> 6,                          \
                                 c->yuv2rgb_y_coeff);                               \
                                                                                    \
    return 0;                                                                       \
}                                                                                   \

#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)                                     \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb)                                           \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)                                           \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)                                           \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)                                           \

DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)

#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt)                                    \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                      \
                                 uint8_t *dst, int linesize,                        \
                                 const uint8_t *srcY, int linesizeY,                \
                                 const uint8_t *srcC, int linesizeC,                \
                                 const int16_t *table,                              \
                                 int y_offset,                                      \
                                 int y_coeff);                                      \
                                                                                    \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],    \
                                           int srcStride[], int srcSliceY, int srcSliceH, \
                                           uint8_t *dst[], int dstStride[]) {      \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                           \
                                                                                    \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0],   \
                                 src[0], srcStride[0], src[1], srcStride[1],        \
                                 yuv2rgb_table,                                     \
                                 c->yuv2rgb_y_offset >> 6,                          \
                                 c->yuv2rgb_y_coeff);                               \
                                                                                    \
    return 0;                                                                       \
}                                                                                   \

#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)                                       \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)                                             \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)                                             \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)                                             \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)                                             \

DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)

/* We need a 16 pixel width alignment. This constraint can easily be removed
 * for input reading, but for the output, which is 4 bytes per pixel (RGBA),
 * the assembly might be writing as much as 4*15=60 extra bytes at the end of
 * the line, which won't fit the 32-byte buffer alignment. */
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do {          \
    if (c->srcFormat == AV_PIX_FMT_##IFMT                                           \
        && c->dstFormat == AV_PIX_FMT_##OFMT                                        \
        && !(c->srcH & 1)                                                           \
        && !(c->srcW & 15)                                                          \
        && !accurate_rnd) {                                                         \
        c->swscale = ifmt##_to_##ofmt##_neon_wrapper;                               \
    }                                                                               \
} while (0)

#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do {                    \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd);                    \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd);                    \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd);                    \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd);                    \
} while (0)

static void get_unscaled_swscale_neon(SwsContext *c) {
    int accurate_rnd = c->flags & SWS_ACCURATE_RND;
    if (c->srcFormat == AV_PIX_FMT_RGBA
        && c->dstFormat == AV_PIX_FMT_NV12
        && (c->srcW >= 16)) {
        c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
                                  : rgbx_to_nv12_neon_16_wrapper;
    }

    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
}

void ff_get_unscaled_swscale_arm(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();
    if (have_neon(cpu_flags))
        get_unscaled_swscale_neon(c);
}
#else
void ff_get_unscaled_swscale_arm(SwsContext *c)
{
}
#endif
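To make the macro machinery concrete, this is approximately what one instantiation, DECLARE_FF_NVX_TO_RGBX_FUNCS(nv12, rgba), expands to after preprocessing (whitespace rearranged for readability):

    int ff_nv12_to_rgba_neon(int w, int h,
                             uint8_t *dst, int linesize,
                             const uint8_t *srcY, int linesizeY,
                             const uint8_t *srcC, int linesizeC,
                             const int16_t *table,
                             int y_offset,
                             int y_coeff);

    static int nv12_to_rgba_neon_wrapper(SwsContext *c, const uint8_t *src[],
                                         int srcStride[], int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[]) {
        const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };

        ff_nv12_to_rgba_neon(c->srcW, srcSliceH,
                             dst[0] + srcSliceY * dstStride[0], dstStride[0],
                             src[0], srcStride[0], src[1], srcStride[1],
                             yuv2rgb_table,
                             c->yuv2rgb_y_offset >> 6,
                             c->yuv2rgb_y_coeff);

        return 0;
    }

The wrapper is then wired into c->swscale by SET_FF_NVX_TO_RGBX_FUNC when the format pair and the width/height constraints match.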
externals/ffmpeg/libswscale/arm/yuv2rgb_neon.S  (vendored, executable file, +280)
@@ -0,0 +1,280 @@
/*
 * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"


.macro compute_premult
    vsub.u16    q14, q11                @ q14 = U * (1 << 3) - 128 * (1 << 3)
    vsub.u16    q15, q11                @ q15 = V * (1 << 3) - 128 * (1 << 3)
    vqdmulh.s16 q8,  q15, d1[0]         @ q8  = V * v2r
    vqdmulh.s16 q9,  q14, d1[1]         @ q9  = U * u2g
    vqdmulh.s16 q5,  q15, d1[2]         @ q5  = V * v2g
    vadd.s16    q9,  q5                 @ q9  = U * u2g + V * v2g
    vqdmulh.s16 q10, q14, d1[3]         @ q10 = U * u2b
.endm

.macro compute_color dst_comp1 dst_comp2 pre
    vadd.s16    q1, q14, \pre
    vadd.s16    q2, q15, \pre
    vqrshrun.s16 \dst_comp1, q1, #1
    vqrshrun.s16 \dst_comp2, q2, #1
.endm

.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
    compute_color \r1, \r2, q8
    compute_color \g1, \g2, q9
    compute_color \b1, \b2, q10
    vmov.u8     \a1, #255
    vmov.u8     \a2, #255
.endm

.macro compute dst ofmt
    vshll.u8    q14, d14, #3            @ q14 = Y * (1 << 3)
    vshll.u8    q15, d15, #3            @ q15 = Y * (1 << 3)
    vsub.s16    q14, q12                @ q14 = (Y - y_offset)
    vsub.s16    q15, q12                @ q15 = (Y - y_offset)
    vqdmulh.s16 q14, q13                @ q14 = (Y - y_offset) * y_coeff
    vqdmulh.s16 q15, q13                @ q15 = (Y - y_offset) * y_coeff

.ifc \ofmt,argb
    compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
.endif

.ifc \ofmt,rgba
    compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
.endif

.ifc \ofmt,abgr
    compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
.endif

.ifc \ofmt,bgra
    compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
.endif

    vzip.8      d6, d10                 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
    vzip.8      d7, d11                 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
    vzip.8      d8, d12                 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
    vzip.8      d9, d13                 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
    vst4.8      {q3, q4}, [\dst,:128]!
    vst4.8      {q5, q6}, [\dst,:128]!
.endm

.macro process_1l_internal dst src ofmt
    vld2.8      {d14, d15}, [\src]!      @ q7 = Y (interleaved)
    compute     \dst, \ofmt
.endm

.macro process_1l ofmt
    compute_premult
    process_1l_internal r2, r4, \ofmt
.endm

.macro process_2l ofmt
    compute_premult
    process_1l_internal r2,  r4,  \ofmt
    process_1l_internal r11, r12, \ofmt
.endm

.macro load_args_nv12
    push        {r4-r12, lr}
    vpush       {q4-q7}
    ldr         r4, [sp, #104]          @ r4  = srcY
    ldr         r5, [sp, #108]          @ r5  = linesizeY
    ldr         r6, [sp, #112]          @ r6  = srcC
    ldr         r7, [sp, #116]          @ r7  = linesizeC
    ldr         r8, [sp, #120]          @ r8  = table
    ldr         r9, [sp, #124]          @ r9  = y_offset
    ldr         r10,[sp, #128]          @ r10 = y_coeff
    vdup.16     d0, r10                 @ d0  = y_coeff
    vld1.16     {d1}, [r8]              @ d1  = *table
    add         r11, r2, r3             @ r11 = dst + linesize (dst2)
    add         r12, r4, r5             @ r12 = srcY + linesizeY (srcY2)
    lsl         r3, r3, #1
    lsl         r5, r5, #1
    sub         r3, r3, r0, lsl #2      @ r3 = linesize * 2 - width * 4 (padding)
    sub         r5, r5, r0              @ r5 = linesizeY * 2 - width (paddingY)
    sub         r7, r7, r0              @ r7 = linesizeC - width (paddingC)
.endm

.macro load_args_nv21
    load_args_nv12
.endm

.macro load_args_yuv420p
    push        {r4-r12, lr}
    vpush       {q4-q7}
    ldr         r4, [sp, #104]          @ r4  = srcY
    ldr         r5, [sp, #108]          @ r5  = linesizeY
    ldr         r6, [sp, #112]          @ r6  = srcU
    ldr         r8, [sp, #128]          @ r8  = table
    ldr         r9, [sp, #132]          @ r9  = y_offset
    ldr         r10,[sp, #136]          @ r10 = y_coeff
    vdup.16     d0, r10                 @ d0  = y_coeff
    vld1.16     {d1}, [r8]              @ d1  = *table
    add         r11, r2, r3             @ r11 = dst + linesize (dst2)
    add         r12, r4, r5             @ r12 = srcY + linesizeY (srcY2)
    lsl         r3, r3, #1
    lsl         r5, r5, #1
    sub         r3, r3, r0, lsl #2      @ r3 = linesize * 2 - width * 4 (padding)
    sub         r5, r5, r0              @ r5 = linesizeY * 2 - width (paddingY)
    ldr         r10,[sp, #120]          @ r10 = srcV
.endm

.macro load_args_yuv422p
    push        {r4-r12, lr}
    vpush       {q4-q7}
    ldr         r4, [sp, #104]          @ r4  = srcY
    ldr         r5, [sp, #108]          @ r5  = linesizeY
    ldr         r6, [sp, #112]          @ r6  = srcU
    ldr         r7, [sp, #116]          @ r7  = linesizeU
    ldr         r12,[sp, #124]          @ r12 = linesizeV
    ldr         r8, [sp, #128]          @ r8  = table
    ldr         r9, [sp, #132]          @ r9  = y_offset
    ldr         r10,[sp, #136]          @ r10 = y_coeff
    vdup.16     d0, r10                 @ d0  = y_coeff
    vld1.16     {d1}, [r8]              @ d1  = *table
    sub         r3, r3, r0, lsl #2      @ r3  = linesize  - width * 4 (padding)
    sub         r5, r5, r0              @ r5  = linesizeY - width (paddingY)
    sub         r7, r7, r0, lsr #1      @ r7  = linesizeU - width / 2 (paddingU)
    sub         r12,r12,r0, lsr #1      @ r12 = linesizeV - width / 2 (paddingV)
    ldr         r10,[sp, #120]          @ r10 = srcV
.endm

.macro load_chroma_nv12
    pld [r12, #64*3]

    vld2.8      {d2, d3}, [r6]!         @ q1: interleaved chroma line
    vshll.u8    q14, d2, #3             @ q14 = U * (1 << 3)
    vshll.u8    q15, d3, #3             @ q15 = V * (1 << 3)
.endm

.macro load_chroma_nv21
    pld [r12, #64*3]

    vld2.8      {d2, d3}, [r6]!         @ q1: interleaved chroma line
    vshll.u8    q14, d3, #3             @ q14 = U * (1 << 3)
    vshll.u8    q15, d2, #3             @ q15 = V * (1 << 3)
.endm

.macro load_chroma_yuv420p
    pld [r10, #64*3]
    pld [r12, #64*3]

    vld1.8      d2, [r6]!               @ d2: chroma U line
    vld1.8      d3, [r10]!              @ d3: chroma V line
    vshll.u8    q14, d2, #3             @ q14 = U * (1 << 3)
    vshll.u8    q15, d3, #3             @ q15 = V * (1 << 3)
.endm

.macro load_chroma_yuv422p
    pld [r10, #64*3]

    vld1.8      d2, [r6]!               @ d2: chroma U line
    vld1.8      d3, [r10]!              @ d3: chroma V line
    vshll.u8    q14, d2, #3             @ q14 = U * (1 << 3)
    vshll.u8    q15, d3, #3             @ q15 = V * (1 << 3)
.endm

.macro increment_and_test_nv12
    add         r11, r11, r3            @ dst2  += padding
    add         r12, r12, r5            @ srcY2 += paddingY
    add         r6,  r6,  r7            @ srcC  += paddingC
    subs        r1,  r1,  #2            @ height -= 2
.endm

.macro increment_and_test_nv21
    increment_and_test_nv12
.endm

.macro increment_and_test_yuv420p
    add         r11, r11, r3            @ dst2  += padding
    add         r12, r12, r5            @ srcY2 += paddingY
    ldr         r7, [sp, #116]          @ r7 = linesizeU
    sub         r7, r7, r0, lsr #1      @ r7 = linesizeU - width / 2 (paddingU)
    add         r6, r6, r7              @ srcU += paddingU
    ldr         r7, [sp, #124]          @ r7 = linesizeV
    sub         r7, r7, r0, lsr #1      @ r7 = linesizeV - width / 2 (paddingV)
    add         r10, r10, r7            @ srcV += paddingV
    subs        r1, r1, #2              @ height -= 2
.endm

.macro increment_and_test_yuv422p
    add         r6,  r6,  r7            @ srcU += paddingU
    add         r10, r10, r12           @ srcV += paddingV
    subs        r1,  r1,  #1            @ height -= 1
.endm

.macro process_nv12 ofmt
    process_2l \ofmt
.endm

.macro process_nv21 ofmt
    process_2l \ofmt
.endm

.macro process_yuv420p ofmt
    process_2l \ofmt
.endm

.macro process_yuv422p ofmt
    process_1l \ofmt
.endm

.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    load_args_\ifmt
    vmov.u16    q11, #1024              @ q11 = 128 * (1 << 3)
    vdup.16     q12, r9                 @ q12 = y_offset
    vmov        d26, d0                 @ q13 = y_coeff
    vmov        d27, d0                 @ q13 = y_coeff
1:
    mov         r8, r0                  @ r8 = width
2:
    pld [r6, #64*3]
    pld [r4, #64*3]
    vmov.i8     d10, #128
    load_chroma_\ifmt
    process_\ifmt \ofmt
    subs        r8, r8, #16             @ width -= 16
    bgt         2b
    add         r2, r2, r3              @ dst  += padding
    add         r4, r4, r5              @ srcY += paddingY
    increment_and_test_\ifmt
    bgt         1b
    vpop        {q4-q7}
    pop         {r4-r12, lr}
    mov         pc, lr
endfunc
.endm

.macro declare_rgb_funcs ifmt
    declare_func \ifmt, argb
    declare_func \ifmt, rgba
    declare_func \ifmt, abgr
    declare_func \ifmt, bgra
.endm

declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
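A closing note on the arithmetic, as a scalar sketch rather than anything this commit states: vqdmulh.s16 computes a saturating (2*a*b) >> 16, so with the operands pre-shifted left by 3, one pixel of the pipeline above behaves roughly as follows, where table = {v2r, u2g, v2g, u2b} as loaded into d1 and y_off is the already-shifted y_offset argument:

    #include <stdint.h>

    /* vqdmulh, sans saturation (illustrative) */
    static inline int dmulh(int a, int b) { return (2 * a * b) >> 16; }
    static inline uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* Sketch of one pixel of declare_func's inner loop (names illustrative). */
    static void yuv_px(int y, int u, int v, int y_off, int y_coeff,
                       const int16_t table[4], uint8_t *r, uint8_t *g, uint8_t *b)
    {
        int luma = dmulh((y << 3) - y_off, y_coeff);   /* compute: (Y - y_offset) * y_coeff */
        int du   = (u << 3) - 1024;                    /* compute_premult: 1024 = 128 << 3 */
        int dv   = (v << 3) - 1024;

        /* compute_color: add the chroma term, then vqrshrun #1 (round, halve, clip) */
        *r = clip8((luma + dmulh(dv, table[0]) + 1) >> 1);
        *g = clip8((luma + dmulh(du, table[1]) + dmulh(dv, table[2]) + 1) >> 1);
        *b = clip8((luma + dmulh(du, table[3]) + 1) >> 1);
    }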