early-access version 1432
This commit is contained in:
8
externals/ffmpeg/libswscale/aarch64/Makefile
vendored
Executable file
8
externals/ffmpeg/libswscale/aarch64/Makefile
vendored
Executable file
@@ -0,0 +1,8 @@
|
||||
# C sources of the aarch64 backend.
OBJS += aarch64/rgb2rgb.o
OBJS += aarch64/swscale.o
OBJS += aarch64/swscale_unscaled.o

# NEON assembly sources (added to the separate NEON-OBJS list).
NEON-OBJS += aarch64/hscale.o
NEON-OBJS += aarch64/output.o
NEON-OBJS += aarch64/rgb2rgb_neon.o
NEON-OBJS += aarch64/yuv2rgb_neon.o
|
80
externals/ffmpeg/libswscale/aarch64/hscale.S
vendored
Executable file
80
externals/ffmpeg/libswscale/aarch64/hscale.S
vendored
Executable file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
//                             const uint8_t *src, const int16_t *filter,
//                             const int32_t *filterPos, int filterSize);
//
// Horizontal scaling of 8-bit samples. For every output sample i:
//     dst[i] = sat_s16((sum over j of src[filterPos[i] + j] * filter[i*filterSize + j]) >> 7)
//
// In:    x0 = c (not read, reused as scratch), x1 = dst, w2 = dstW, x3 = src,
//        x4 = filter, x5 = filterPos, w6 = filterSize
// Clobb: x0, x2, x4, x5, x7-x9, x11-x13, x15-x17, v0-v7, v16-v19, flags
//
// The outer loop produces four outputs per iteration and the inner loop
// consumes eight filter taps per output and iteration. filterSize is
// therefore expected to be a positive multiple of 8, and a dstW that is not
// a multiple of 4 is rounded up (up to three extra outputs are written).
function ff_hscale_8_to_15_neon, export=1
        sbfiz               x7, x6, #1, #32                 // x7 = filterSize*2 (bytes per row of int16 taps)
1:      ldr                 w8, [x5], #4                    // filterPos[idx]
        ldr                 w0, [x5], #4                    // filterPos[idx + 1]
        ldr                 w11, [x5], #4                   // filterPos[idx + 2]
        ldr                 w9, [x5], #4                    // filterPos[idx + 3]
        mov                 x16, x4                         // filter0 = filter
        add                 x12, x16, x7                    // filter1 = filter0 + filterSize*2
        add                 x13, x12, x7                    // filter2 = filter1 + filterSize*2
        add                 x4, x13, x7                     // filter3 = filter2 + filterSize*2
                                                            // (x4 ends the inner loop on the next group of rows)
        movi                v0.2D, #0                       // val sum part 1 (for dst[0])
        movi                v1.2D, #0                       // val sum part 2 (for dst[1])
        movi                v2.2D, #0                       // val sum part 3 (for dst[2])
        movi                v3.2D, #0                       // val sum part 4 (for dst[3])
        add                 x17, x3, w8, UXTW               // srcp + filterPos[0]
        add                 x8, x3, w0, UXTW                // srcp + filterPos[1]
        add                 x0, x3, w11, UXTW               // srcp + filterPos[2]
        add                 x11, x3, w9, UXTW               // srcp + filterPos[3]
        mov                 w15, w6                         // filterSize counter
2:      ld1                 {v4.8B}, [x17], #8              // srcp[filterPos[0] + {0..7}]
        ld1                 {v5.8H}, [x16], #16             // load 8x16-bit filter values, part 1
        ld1                 {v6.8B}, [x8], #8               // srcp[filterPos[1] + {0..7}]
        ld1                 {v7.8H}, [x12], #16             // load 8x16-bit at filter+filterSize
        uxtl                v4.8H, v4.8B                    // unpack part 1 to 16-bit
        smlal               v0.4S, v4.4H, v5.4H             // v0 += srcp[filterPos[0] + {0..3}] * filter[{0..3}]
        smlal2              v0.4S, v4.8H, v5.8H             // v0 += srcp[filterPos[0] + {4..7}] * filter[{4..7}]
        ld1                 {v16.8B}, [x0], #8              // srcp[filterPos[2] + {0..7}]
        ld1                 {v17.8H}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl                v6.8H, v6.8B                    // unpack part 2 to 16-bit
        smlal               v1.4S, v6.4H, v7.4H             // v1 += srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        uxtl                v16.8H, v16.8B                  // unpack part 3 to 16-bit
        smlal               v2.4S, v16.4H, v17.4H           // v2 += srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        smlal2              v2.4S, v16.8H, v17.8H           // v2 += srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        ld1                 {v18.8B}, [x11], #8             // srcp[filterPos[3] + {0..7}]
        smlal2              v1.4S, v6.8H, v7.8H             // v1 += srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        ld1                 {v19.8H}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        subs                w15, w15, #8                    // j -= 8: processed 8/filterSize
        uxtl                v18.8H, v18.8B                  // unpack part 4 to 16-bit
        smlal               v3.4S, v18.4H, v19.4H           // v3 += srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        smlal2              v3.4S, v18.8H, v19.8H           // v3 += srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt                2b                              // inner loop if filterSize not consumed completely
        // Reduce the four 4x32-bit accumulators to one vector holding the
        // four sums in order: two rounds of pairwise adds across registers.
        addp                v0.4S, v0.4S, v1.4S             // {p0[0]+p0[1], p0[2]+p0[3], p1[0]+p1[1], p1[2]+p1[3]}
        addp                v2.4S, v2.4S, v3.4S             // same for part2 and part3
        addp                v0.4S, v0.4S, v2.4S             // {sum0, sum1, sum2, sum3}
        subs                w2, w2, #4                      // dstW -= 4
        sqshrn              v0.4H, v0.4S, #7                // shift and clip the 4x16-bit final values
        st1                 {v0.4H}, [x1], #8               // write to destination part0123
        b.gt                1b                              // loop until end of line
        ret
endfunc
|
58
externals/ffmpeg/libswscale/aarch64/output.S
vendored
Executable file
58
externals/ffmpeg/libswscale/aarch64/output.S
vendored
Executable file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
//                           const int16_t **src, uint8_t *dest, int dstW,
//                           const uint8_t *dither, int offset);
//
// Vertical filtering to 8-bit output, eight pixels per outer iteration:
//     dest[i] = clip_u8(((dither[i] << 12) + sum over j of src[j][i] * filter[j]) >> 19)
//
// In:    x0 = filter, w1 = filterSize, x2 = src, x3 = dest, w4 = dstW,
//        x5 = dither, w6 = offset
// Clobb: x3, x4, x7-x12, v0-v7, v16, flags
//
// NOTE(review): the tap loop consumes two source rows and two coefficients
// per iteration, so it appears to rely on filterSize being even - confirm
// against the caller.
function ff_yuv2planeX_8_neon, export=1
        ld1                 {v0.8b}, [x5]                   // fetch the 8 dither bytes
        cbz                 w6, 1f                          // offset == 0: use dither as is
        ext                 v0.8b, v0.8b, v0.8b, #3         // otherwise rotate the dither by 3 bytes (offset is expected to be 0 or 3)
1:      uxtl                v0.8h, v0.8b                    // widen dither to 16-bit
        ushll               v1.4s, v0.4h, #12               // v1 = dither[0..3] << 12 as 32-bit
        ushll2              v2.4s, v0.8h, #12               // v2 = dither[4..7] << 12 as 32-bit
        mov                 x7, xzr                         // i = 0 (pixel index)
2:      mov                 v3.16b, v1.16b                  // acc_lo = dither term for pixels i+0..3
        mov                 v4.16b, v2.16b                  // acc_hi = dither term for pixels i+4..7
        mov                 w8, w1                          // taps left = filterSize
        mov                 x9, x2                          // cursor into the src pointer array
        mov                 x10, x0                         // cursor into the coefficients
3:      ldp                 x11, x12, [x9], #16             // row pointers src[j] and src[j+1]
        ld1r                {v7.8h}, [x10], #2              // X = filter[j], broadcast to all lanes
        ld1r                {v16.8h}, [x10], #2             // Y = filter[j+1], broadcast to all lanes
        add                 x11, x11, x7, lsl #1            // &src[j  ][i]
        add                 x12, x12, x7, lsl #1            // &src[j+1][i]
        ld1                 {v5.8h}, [x11]                  // A..H = src[j  ][i + {0..7}]
        ld1                 {v6.8h}, [x12]                  // I..P = src[j+1][i + {0..7}]
        smlal               v3.4s, v5.4h, v7.4h             // acc_lo += {A,B,C,D} * X
        smlal2              v4.4s, v5.8h, v7.8h             // acc_hi += {E,F,G,H} * X
        smlal               v3.4s, v6.4h, v16.4h            // acc_lo += {I,J,K,L} * Y
        smlal2              v4.4s, v6.8h, v16.8h            // acc_hi += {M,N,O,P} * Y
        subs                w8, w8, #2                      // two taps consumed
        b.gt                3b                              // until all taps are consumed

        sqshrun             v3.4h, v3.4s, #16               // clip_u16(acc_lo >> 16)
        sqshrun2            v3.8h, v4.4s, #16               // clip_u16(acc_hi >> 16)
        uqshrn              v3.8b, v3.8h, #3                // clip_u8(value >> 3), i.e. >> 19 overall
        st1                 {v3.8b}, [x3], #8               // store 8 output pixels
        add                 x7, x7, #8                      // i += 8
        subs                w4, w4, #8                      // dstW -= 8
        b.gt                2b                              // until the whole width is written
        ret
endfunc
|
41
externals/ffmpeg/libswscale/aarch64/rgb2rgb.c
vendored
Executable file
41
externals/ffmpeg/libswscale/aarch64/rgb2rgb.c
vendored
Executable file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/bswap.h"
|
||||
#include "libswscale/rgb2rgb.h"
|
||||
#include "libswscale/swscale.h"
|
||||
#include "libswscale/swscale_internal.h"
|
||||
|
||||
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
|
||||
uint8_t *dest, int width, int height,
|
||||
int src1Stride, int src2Stride, int dstStride);
|
||||
|
||||
av_cold void rgb2rgb_init_aarch64(void)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
interleaveBytes = ff_interleave_bytes_neon;
|
||||
}
|
||||
}
|
79
externals/ffmpeg/libswscale/aarch64/rgb2rgb_neon.S
vendored
Executable file
79
externals/ffmpeg/libswscale/aarch64/rgb2rgb_neon.S
vendored
Executable file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (c) 2020 Martin Storsjo
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
//                               uint8_t *dest, int width, int height,
//                               int src1Stride, int src2Stride, int dstStride);
//
// For each of height rows, write src1[0], src2[0], src1[1], src2[1], ...
// (width byte pairs) to dest.
//
// In:    x0 = src1, x1 = src2, x2 = dest, w3 = width, w4 = height,
//        w5 = src1Stride, w6 = src2Stride, w7 = dstStride
// Clobb: x0-x2, w4-w10, v0, v1, flags
//
// A width or height that is zero or negative returns immediately without
// touching memory. Without the guards one row would always be processed and
// the row counter (tested for equality with zero) would run until it wraps.
function ff_interleave_bytes_neon, export=1
        cmp             w4,  #0
        b.le            0f                      // no rows requested
        cmp             w3,  #0
        b.le            0f                      // empty rows
        sub             w5,  w5,  w3            // src1 padding = src1Stride - width
        sub             w6,  w6,  w3            // src2 padding = src2Stride - width
        sub             w7,  w7,  w3, lsl #1    // dest padding = dstStride - 2*width
1:
        ands            w8,  w3,  #0xfffffff0   // & ~15: bytes handled in 16-byte chunks
        b.eq            3f
2:
        ld1             {v0.16b}, [x0], #16
        ld1             {v1.16b}, [x1], #16
        subs            w8,  w8,  #16
        st2             {v0.16b, v1.16b}, [x2], #32     // st2 interleaves the two registers
        b.gt            2b

        tst             w3,  #15
        b.eq            9f                      // width was a multiple of 16: row done

3:
        tst             w3,  #8                 // 8-byte tail
        b.eq            4f
        ld1             {v0.8b}, [x0], #8
        ld1             {v1.8b}, [x1], #8
        st2             {v0.8b, v1.8b}, [x2], #16
4:
        tst             w3,  #4                 // 4-byte tail
        b.eq            5f

        ld1             {v0.s}[0], [x0], #4
        ld1             {v1.s}[0], [x1], #4
        zip1            v0.8b, v0.8b, v1.8b     // interleave the low 4 bytes of each
        st1             {v0.8b}, [x2], #8

5:
        ands            w8,  w3,  #3            // remaining 1..3 bytes, one pair at a time
        b.eq            9f
6:
        ldrb            w9,  [x0], #1
        ldrb            w10, [x1], #1
        subs            w8,  w8,  #1
        bfi             w9,  w10, #8,  #8       // w9 = src1 byte | src2 byte << 8
        strh            w9,  [x2], #2
        b.gt            6b

9:
        subs            w4,  w4,  #1            // one row finished
        b.eq            0f
        add             x0,  x0,  w5, sxtw      // skip the (signed) row paddings
        add             x1,  x1,  w6, sxtw
        add             x2,  x2,  w7, sxtw
        b               1b

0:
        ret
endfunc
|
47
externals/ffmpeg/libswscale/aarch64/swscale.c
vendored
Executable file
47
externals/ffmpeg/libswscale/aarch64/swscale.c
vendored
Executable file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "libswscale/swscale.h"
|
||||
#include "libswscale/swscale_internal.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
|
||||
void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
|
||||
const uint8_t *src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize);
|
||||
|
||||
void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
if (c->srcBpc == 8 && c->dstBpc <= 14 &&
|
||||
(c->hLumFilterSize % 8) == 0 &&
|
||||
(c->hChrFilterSize % 8) == 0)
|
||||
{
|
||||
c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
|
||||
}
|
||||
if (c->dstBpc == 8) {
|
||||
c->yuv2planeX = ff_yuv2planeX_8_neon;
|
||||
}
|
||||
}
|
||||
}
|
132
externals/ffmpeg/libswscale/aarch64/swscale_unscaled.c
vendored
Executable file
132
externals/ffmpeg/libswscale/aarch64/swscale_unscaled.c
vendored
Executable file
@@ -0,0 +1,132 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "libswscale/swscale.h"
|
||||
#include "libswscale/swscale_internal.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
|
||||
/* Initializer list (with trailing comma) holding the four chroma
 * coefficients of the context in the order v2r, u2g, v2g, u2b.
 * Expects a SwsContext pointer named c to be in scope. */
#define YUV_TO_RGB_TABLE                                \
    c->yuv2rgb_v2r_coeff, c->yuv2rgb_u2g_coeff,         \
    c->yuv2rgb_v2g_coeff, c->yuv2rgb_u2b_coeff,
|
||||
|
||||
/*
 * For a planar YUV input format (separate U and V planes) and a packed
 * 32-bit RGB output format:
 *  - declare the NEON routine ff_<ifmt>_to_<ofmt>_neon();
 *  - define the static wrapper <ifmt>_to_<ofmt>_neon_wrapper(), which has the
 *    signature stored in c->swscale, gathers the coefficient table from the
 *    context and forwards the slice to the NEON routine.
 *
 * The wrapper returns srcSliceH, the number of lines converted, because
 * sws_scale() is documented to return the height of the output slice
 * (the previous version always returned 0).
 */
#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt)                                           \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
                                 uint8_t *dst, int linesize,                                \
                                 const uint8_t *srcY, int linesizeY,                        \
                                 const uint8_t *srcU, int linesizeU,                        \
                                 const uint8_t *srcV, int linesizeV,                        \
                                 const int16_t *table,                                      \
                                 int y_offset,                                              \
                                 int y_coeff);                                              \
                                                                                            \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
                                           int srcStride[], int srcSliceY, int srcSliceH,   \
                                           uint8_t *dst[], int dstStride[]) {               \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
                                                                                            \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                        \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0],           \
                                 src[0], srcStride[0],                                      \
                                 src[1], srcStride[1],                                      \
                                 src[2], srcStride[2],                                      \
                                 yuv2rgb_table,                                             \
                                 c->yuv2rgb_y_offset >> 6,                                  \
                                 c->yuv2rgb_y_coeff);                                       \
    return srcSliceH;                                                                       \
}
|
||||
|
||||
#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \
|
||||
|
||||
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
|
||||
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
|
||||
|
||||
/*
 * For a semi-planar input format (one luma plane plus one interleaved chroma
 * plane) and a packed 32-bit RGB output format:
 *  - declare the NEON routine ff_<ifmt>_to_<ofmt>_neon();
 *  - define the static wrapper <ifmt>_to_<ofmt>_neon_wrapper(), which has the
 *    signature stored in c->swscale, gathers the coefficient table from the
 *    context and forwards the slice to the NEON routine.
 *
 * The wrapper returns srcSliceH, the number of lines converted, because
 * sws_scale() is documented to return the height of the output slice
 * (the previous version always returned 0).
 */
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt)                                            \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
                                 uint8_t *dst, int linesize,                                \
                                 const uint8_t *srcY, int linesizeY,                        \
                                 const uint8_t *srcC, int linesizeC,                        \
                                 const int16_t *table,                                      \
                                 int y_offset,                                              \
                                 int y_coeff);                                              \
                                                                                            \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
                                           int srcStride[], int srcSliceY, int srcSliceH,   \
                                           uint8_t *dst[], int dstStride[]) {               \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
                                                                                            \
    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                        \
                                 dst[0] + srcSliceY * dstStride[0], dstStride[0],           \
                                 src[0], srcStride[0], src[1], srcStride[1],                \
                                 yuv2rgb_table,                                             \
                                 c->yuv2rgb_y_offset >> 6,                                  \
                                 c->yuv2rgb_y_coeff);                                       \
                                                                                            \
    return srcSliceH;                                                                       \
}
|
||||
|
||||
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \
|
||||
|
||||
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
|
||||
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
|
||||
|
||||
/* Install the NEON wrapper for one (input format, output format) pair when
 * the context matches it.
 *
 * The width has to be a multiple of 16 pixels. That constraint could easily
 * be lifted for reading the input, but the output is 4 bytes per pixel
 * (RGBA), so the assembly might write as much as 4*15=60 extra bytes at the
 * end of a line, which does not fit the 32-byte buffer alignment.
 * The source height has to be even and accurate rounding must not be
 * requested. */
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do {                  \
    if (c->srcFormat == AV_PIX_FMT_##IFMT &&                                                \
        c->dstFormat == AV_PIX_FMT_##OFMT &&                                                \
        (c->srcH & 1) == 0 &&                                                               \
        (c->srcW & 15) == 0 &&                                                              \
        !(accurate_rnd))                                                                    \
        c->swscale = ifmt##_to_##ofmt##_neon_wrapper;                                       \
} while (0)
|
||||
|
||||
/* Try the four packed output orders (argb, rgba, abgr, bgra) for one input
 * format; at most one of them can match the destination format of c. */
#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do {                            \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd);                            \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd);                            \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd);                            \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd);                            \
} while (0)
|
||||
|
||||
/**
 * Point c->swscale at a NEON YUV-to-RGB wrapper when the source/destination
 * formats and dimensions of the context allow it; otherwise leave it alone.
 *
 * @param c scaler context to inspect and update
 */
static void get_unscaled_swscale_neon(SwsContext *c)
{
    /* Non-zero when the caller asked for accurate rounding; the NEON
     * converters are not installed in that case. */
    const int accurate_rnd = c->flags & SWS_ACCURATE_RND;

    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
}
|
||||
|
||||
/**
 * aarch64 entry point for selecting unscaled (format conversion only)
 * routines. Does nothing unless the CPU flags report NEON.
 *
 * @param c scaler context to update
 */
void ff_get_unscaled_swscale_aarch64(SwsContext *c)
{
    const int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    get_unscaled_swscale_neon(c);
}
|
210
externals/ffmpeg/libswscale/aarch64/yuv2rgb_neon.S
vendored
Executable file
210
externals/ffmpeg/libswscale/aarch64/yuv2rgb_neon.S
vendored
Executable file
@@ -0,0 +1,210 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
|
||||
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// Load the two trailing int arguments from the stack:
//     w9 = y_offset, w10 = y_coeff
// yoff and ycoeff are the byte offsets from sp of the two arguments.
// On Apple targets both values are fetched as one adjacent pair starting at
// yoff and ycoeff is not used (presumably because that ABI packs stack
// arguments instead of giving each one an 8-byte slot - confirm against the
// platform ABI documentation).
.macro load_yoff_ycoeff yoff ycoeff
#ifdef __APPLE__
        ldp                 w9, w10, [sp, #\yoff]
#else
        ldr                 w9,  [sp, #\yoff]
        ldr                 w10, [sp, #\ycoeff]
#endif
.endm
|
||||
|
||||
// Argument setup for the semi-planar (nv12/nv21) converters.
// Out: v0 = y_coeff in all lanes, v1 = the four 16-bit chroma coefficients,
//      v3 = y_offset in all lanes,
//      w3/w5/w7 = per-row paddings of dst, luma and chroma, w11 = -width
// Clobb: x8, w9, w10
.macro load_args_nv12
        ldr                 x8,  [sp]                       // table
        load_yoff_ycoeff    8, 16                           // w9 = y_offset, w10 = y_coeff
        ld1                 {v1.1d}, [x8]                   // v2r, u2g, v2g, u2b
        dup                 v3.8h, w9
        dup                 v0.8h, w10
        neg                 w11, w0                         // w11 = -width (rewinds one chroma row)
        sub                 w3, w3, w0, lsl #2              // w3 = linesize  - width * 4 (padding)
        sub                 w5, w5, w0                      // w5 = linesizeY - width     (paddingY)
        sub                 w7, w7, w0                      // w7 = linesizeC - width     (paddingC)
.endm

// nv21 takes exactly the same arguments as nv12.
.macro load_args_nv21
        load_args_nv12
.endm
|
||||
|
||||
// Argument setup for the yuv420p converters.
// Out: x13 = srcV, v0 = y_coeff in all lanes, v1 = the four 16-bit chroma
//      coefficients, v3 = y_offset in all lanes,
//      w3/w5/w7/w14 = per-row paddings of dst, Y, U and V, w11 = -(width / 2)
// Clobb: x8, w9, w10
.macro load_args_yuv420p
        ldr                 x13, [sp]                       // srcV
        ldr                 w14, [sp, #8]                   // linesizeV
        ldr                 x8,  [sp, #16]                  // table
        load_yoff_ycoeff    24, 32                          // w9 = y_offset, w10 = y_coeff
        ld1                 {v1.1d}, [x8]                   // v2r, u2g, v2g, u2b
        dup                 v3.8h, w9
        dup                 v0.8h, w10
        neg                 w11, w0, lsr #1                 // w11 = -(width / 2) (rewinds one chroma row)
        sub                 w3,  w3,  w0, lsl #2            // w3  = linesize  - width * 4 (padding)
        sub                 w5,  w5,  w0                    // w5  = linesizeY - width     (paddingY)
        sub                 w7,  w7,  w0, lsr #1            // w7  = linesizeU - width / 2 (paddingU)
        sub                 w14, w14, w0, lsr #1            // w14 = linesizeV - width / 2 (paddingV)
.endm
|
||||
|
||||
// Argument setup for the yuv422p converters. Same as yuv420p, except that no
// chroma rewind value is prepared: every luma row has its own chroma row.
// Out: x13 = srcV, v0 = y_coeff in all lanes, v1 = the four 16-bit chroma
//      coefficients, v3 = y_offset in all lanes,
//      w3/w5/w7/w14 = per-row paddings of dst, Y, U and V
// Clobb: x8, w9, w10
.macro load_args_yuv422p
        ldr                 x13, [sp]                       // srcV
        ldr                 w14, [sp, #8]                   // linesizeV
        ldr                 x8,  [sp, #16]                  // table
        load_yoff_ycoeff    24, 32                          // w9 = y_offset, w10 = y_coeff
        ld1                 {v1.1d}, [x8]                   // v2r, u2g, v2g, u2b
        dup                 v3.8h, w9
        dup                 v0.8h, w10
        sub                 w3,  w3,  w0, lsl #2            // w3  = linesize  - width * 4 (padding)
        sub                 w5,  w5,  w0                    // w5  = linesizeY - width     (paddingY)
        sub                 w7,  w7,  w0, lsr #1            // w7  = linesizeU - width / 2 (paddingU)
        sub                 w14, w14, w0, lsr #1            // w14 = linesizeV - width / 2 (paddingV)
.endm
|
||||
|
||||
// Load 8 interleaved chroma pairs from x6 and widen them:
//     v18 = U * (1<<3), v19 = V * (1<<3)
// nv12 stores the pairs as U,V (first byte goes to v18).
.macro load_chroma_nv12
        ld2                 {v16.8b, v17.8b}, [x6], #16     // de-interleave: v16 = even bytes, v17 = odd bytes
        ushll               v18.8h, v16.8b, #3
        ushll               v19.8h, v17.8b, #3
.endm

// Same as above for nv21, where the byte order of each pair is swapped
// (first byte goes to v19).
.macro load_chroma_nv21
        ld2                 {v16.8b, v17.8b}, [x6], #16     // de-interleave: v16 = even bytes, v17 = odd bytes
        ushll               v18.8h, v17.8b, #3
        ushll               v19.8h, v16.8b, #3
.endm
|
||||
|
||||
// Load 8 chroma samples from each of the two planes (x6 = srcU, x13 = srcV)
// and widen them:
//     v18 = U * (1<<3), v19 = V * (1<<3)
.macro load_chroma_yuv420p
        ld1                 {v16.8b}, [x6],  #8
        ushll               v18.8h, v16.8b, #3
        ld1                 {v17.8b}, [x13], #8
        ushll               v19.8h, v17.8b, #3
.endm

// yuv422p reads its chroma planes exactly like yuv420p.
.macro load_chroma_yuv422p
        load_chroma_yuv420p
.endm
|
||||
|
||||
// End-of-row chroma step for nv12/nv21. The increment is chosen from the
// low bit of the remaining height in w1: when it is set the pointer moves on
// by the row padding, otherwise it is rewound by one row (w11 = -width) so
// that the same chroma row is read again for the next luma row.
// Clobb: w15, w16, flags
.macro increment_nv12
        ands                w15, w1, #1
        csel                w16, w7, w11, ne                // incC = (h & 1) ? paddingC : -width
        add                 x6,  x6, w16, sxtw              // srcC += incC
.endm

// nv21 steps through its chroma plane exactly like nv12.
.macro increment_nv21
        increment_nv12
.endm
|
||||
|
||||
// End-of-row chroma step for yuv420p. The increments are chosen from the
// low bit of the remaining height in w1: when it is set both pointers move
// on by their row padding, otherwise they are rewound by one row
// (w11 = -width/2) so that the same chroma rows are read again.
// Clobb: w15, w16, w17, flags
.macro increment_yuv420p
        ands                w15, w1, #1
        csel                w16, w7,  w11, ne               // incU = (h & 1) ? paddingU : -width/2
        add                 x6,  x6,  w16, sxtw             // srcU += incU
        csel                w17, w14, w11, ne               // incV = (h & 1) ? paddingV : -width/2
        add                 x13, x13, w17, sxtw             // srcV += incV
.endm
|
||||
|
||||
// End-of-row chroma step for yuv422p: every luma row has its own chroma
// row, so both pointers always advance by their row padding.
// The paddings are signed 32-bit values (linesize - width/2) and are
// negative when the line size is negative, so they must be sign-extended;
// zero-extending them would move the pointers forward by almost 4 GiB.
.macro increment_yuv422p
        add                 x6,  x6,  w7,  sxtw             // srcU += paddingU
        add                 x13, x13, w14, sxtw             // srcV += paddingV
.endm
|
||||
|
||||
// Combine luma and chroma contributions into 8-bit colour channels.
// In:  v26/v27 = scaled luma of pixels 0..7 / 8..15,
//      v20/v21 = R, v22/v23 = G, v24/v25 = B chroma terms of the same pixels
// Out: the eight 8-byte registers named by the arguments; index 1 is for
//      pixels 0..7, index 2 for pixels 8..15; alpha is set to 255
// Clobb: v20-v25
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
        add                 v20.8h, v26.8h, v20.8h          // Y1 + R1
        add                 v21.8h, v27.8h, v21.8h          // Y2 + R2
        sqrshrun            \r1, v20.8h, #1                 // clip_u8((Y1 + R1) >> 1)
        sqrshrun            \r2, v21.8h, #1                 // clip_u8((Y2 + R2) >> 1)
        add                 v22.8h, v26.8h, v22.8h          // Y1 + G1
        add                 v23.8h, v27.8h, v23.8h          // Y2 + G2
        sqrshrun            \g1, v22.8h, #1                 // clip_u8((Y1 + G1) >> 1)
        sqrshrun            \g2, v23.8h, #1                 // clip_u8((Y2 + G2) >> 1)
        add                 v24.8h, v26.8h, v24.8h          // Y1 + B1
        add                 v25.8h, v27.8h, v25.8h          // Y2 + B2
        sqrshrun            \b1, v24.8h, #1                 // clip_u8((Y1 + B1) >> 1)
        sqrshrun            \b2, v25.8h, #1                 // clip_u8((Y2 + B2) >> 1)
        movi                \a1, #255                       // opaque alpha
        movi                \a2, #255
.endm
|
||||
|
||||
// Emit the exported converter for one (input format, output format) pair.
// The generated function is named ff_<ifmt>_to_<ofmt>_neon.
//
// In:    w0 = width, w1 = height, x2 = dst, w3 = linesize, x4 = srcY,
//        w5 = linesizeY, x6 = srcU or srcC, w7 = linesizeU or linesizeC;
//        the remaining arguments are read from the stack by load_args_<ifmt>
// Clobb: x1-x11, x13-x17, v0-v7, v16-v27, flags
//
// Each inner iteration converts 16 pixels: 16 luma bytes, 8 chroma pairs and
// 64 output bytes. The width is therefore expected to be a multiple of 16.
//
// The dst and luma paddings are signed 32-bit values (linesize minus the
// bytes consumed per row) and are negative when the line size is negative,
// so they are sign-extended before being added to the pointers.
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
        load_args_\ifmt
1:
        mov                 w8, w0                          // w8 = width
2:
        movi                v5.8H, #4, lsl #8               // 128 * (1<<3) (reloaded: v5 is also an output register)
        load_chroma_\ifmt
        sub                 v18.8H, v18.8H, v5.8H           // U*(1<<3) - 128*(1<<3)
        sub                 v19.8H, v19.8H, v5.8H           // V*(1<<3) - 128*(1<<3)
        sqdmulh             v20.8H, v19.8H, v1.H[0]         // V * v2r            (R)
        sqdmulh             v22.8H, v18.8H, v1.H[1]         // U * u2g
        sqdmulh             v19.8H, v19.8H, v1.H[2]         //           V * v2g
        add                 v22.8H, v22.8H, v19.8H          // U * u2g + V * v2g  (G)
        sqdmulh             v24.8H, v18.8H, v1.H[3]         // U * u2b            (B)
        // every chroma sample is shared by two neighbouring pixels: duplicate the lanes
        zip2                v21.8H, v20.8H, v20.8H          // R2
        zip1                v20.8H, v20.8H, v20.8H          // R1
        zip2                v23.8H, v22.8H, v22.8H          // G2
        zip1                v22.8H, v22.8H, v22.8H          // G1
        zip2                v25.8H, v24.8H, v24.8H          // B2
        zip1                v24.8H, v24.8H, v24.8H          // B1
        ld1                 {v2.16B}, [x4], #16             // load luma
        ushll               v26.8H, v2.8B,  #3              // Y1*(1<<3)
        ushll2              v27.8H, v2.16B, #3              // Y2*(1<<3)
        sub                 v26.8H, v26.8H, v3.8H           // Y1*(1<<3) - y_offset
        sub                 v27.8H, v27.8H, v3.8H           // Y2*(1<<3) - y_offset
        sqdmulh             v26.8H, v26.8H, v0.8H           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
        sqdmulh             v27.8H, v27.8H, v0.8H           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15

        // The register order handed to compute_rgba selects the byte order
        // produced by the st4 stores below (v4..v7 and v16..v19).
.ifc \ofmt,argb // 1 2 3 0
        compute_rgba        v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
.endif

.ifc \ofmt,rgba // 0 1 2 3
        compute_rgba        v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
.endif

.ifc \ofmt,abgr // 3 2 1 0
        compute_rgba        v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
.endif

.ifc \ofmt,bgra // 2 1 0 3
        compute_rgba        v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
.endif

        st4                 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32   // pixels 0..7
        st4                 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32   // pixels 8..15
        subs                w8, w8, #16                     // width -= 16
        b.gt                2b
        add                 x2, x2, w3, SXTW                // dst  += padding  (signed)
        add                 x4, x4, w5, SXTW                // srcY += paddingY (signed)
        increment_\ifmt
        subs                w1, w1, #1                      // height -= 1
        b.gt                1b
        ret
endfunc
.endm
|
||||
|
||||
// Emit the converters of one input format for all four packed output
// orders: argb, rgba, abgr and bgra.
.macro declare_rgb_funcs ifmt
        declare_func        \ifmt, argb
        declare_func        \ifmt, rgba
        declare_func        \ifmt, abgr
        declare_func        \ifmt, bgra
.endm

// Instantiate the converters for every supported input format.
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
|
Reference in New Issue
Block a user