186 lines
6.1 KiB
NASM
Executable File
186 lines
6.1 KiB
NASM
Executable File
;*****************************************************************************
;* x86-optimized functions for gblur filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION .text
|
|
|
|
; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
;                          float nu, float bscale)
;
; In-place horizontal recursive filter over `height` rows of `width` floats.
; Rows are stored back to back (the row stride is width * 4 bytes).
; For every row the following pass is applied `steps` times:
;     ptr[0]         *= bscale
;     for (x = 1; x < width; x++)  ptr[x]     += nu * ptr[x - 1];  (rightwards)
;     ptr[width - 1] *= bscale
;     for (; x > 0; x--)           ptr[x - 1] += nu * ptr[x];      (leftwards)
;
; Preconditions: width >= 1. Nothing is done when height <= 0 or steps <= 0.
;
; Arguments:
;     UNIX64: nu and bscale arrive in xmm0 / xmm1 and are used directly.
;     WIN64 : nu and bscale are fetched through their argument slots
;             (num / bscalem) before the names are redefined.
;
; Register roles inside the loops:
;     m0      nu, broadcast to all lanes
;     m1      bscale (only lane 0 is used)
;     m2..m4  nu^2, nu^3, nu^4, broadcast to all lanes
;     m5      carry: the last already-filtered neighbour (p0)
;     m6      the four samples being updated
;     m7      shifted copies of the neighbours
;     m8      passed to FMULADD_PS as its last operand (scratch)
;     x       column index in floats; it is 0 at the start of every pass
;     width   vectorizable width, 1 + 4*k, except between the two scalar
;             tails where it temporarily holds the full row width
;     remain  number of trailing columns handled by the scalar loops (0..3)

%macro HORIZ_SLICE 0
%if UNIX64
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
%else
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
%endif
%if WIN64
    movss           m0, num
    movss           m1, bscalem
    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
%endif
    movsxdifnidn    widthq, widthd

    ; The row and step loops below are do-while loops, so bail out early
    ; when there is no row to process or no pass to run.
    test            heightd, heightd
    jle             .done
    test            stepsd, stepsd
    jle             .done

    mulss           m2, m0, m0              ; nu ^ 2
    mulss           m3, m2, m0              ; nu ^ 3
    mulss           m4, m3, m0              ; nu ^ 4
    xor             xd, xd                  ; x = 0 (32-bit write clears all of xq)
    xor             yd, yd
    ; stride = width * 4
    mov             strideq, widthq
    shl             strideq, 2
    ; w = w - ((w - 1) & 3)
    mov             remainq, widthq
    sub             remainq, 1
    and             remainq, 3
    sub             widthq, remainq

    ; broadcast the coefficients to every lane
    shufps          m0, m0, 0
    shufps          m2, m2, 0
    shufps          m3, m3, 0
    shufps          m4, m4, 0

.loop_y:
    xor             stepd, stepd

.loop_step:
    ; p0 *= bscale
    mulss           m5, m1, [ptrq + xq * 4]
    movss           [ptrq + xq * 4], m5
    inc             xq

    ; filter rightwards
    ; Here we are vectorizing the C version by 4:
    ;    for (x = 1; x < width; x++)
    ;       ptr[x] += nu * ptr[x - 1];
    ; Let p0 stand for ptr[x-1], the data from the last loop iteration,
    ; and [p1,p2,p3,p4] be the vector data for this iteration.
    ; Unrolling the loop, we get:
    ;   p1' = p1 + p0*nu
    ;   p2' = p2 + p1*nu + p0*nu^2
    ;   p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
    ;   p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
    ; so we can do it in SIMD:
    ;   [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
    ;                       [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
    ;                       [0,0,0,p0]*nu^4

    ; Rows narrower than 5 have no full group of four after p0; running
    ; the vector loop anyway would touch memory past the end of the row.
    cmp             xq, widthq
    jge             .end_x

.loop_x:
    movu            m6, [ptrq + xq * 4]     ; s = [p1,p2,p3,p4]
    pslldq          m7, m6, 4               ;     [0, p1,p2,p3]
    movss           m7, m5                  ;     [p0,p1,p2,p3]
    FMULADD_PS      m6, m7, m0, m6, m8      ; s += [p0,p1,p2,p3] * nu
    pslldq          m7, 4                   ;     [0,p0,p1,p2]
    FMULADD_PS      m6, m7, m2, m6, m8      ; s += [0,p0,p1,p2] * nu^2
    pslldq          m7, 4
    FMULADD_PS      m6, m7, m3, m6, m8      ; s += [0,0,p0,p1] * nu^3
    pslldq          m7, 4
    FMULADD_PS      m6, m7, m4, m6, m8      ; s += [0,0,0,p0] * nu^4
    movu            [ptrq + xq * 4], m6
    shufps          m5, m6, m6, q3333       ; carry p4' into the next iteration
    add             xq, 4
    cmp             xq, widthq
    jl              .loop_x

.end_x:
    ; switch to the full row width for the scalar tail
    add             widthq, remainq
    cmp             xq, widthq
    jge             .end_scalar

.loop_scalar:
    ; ptr[x] += nu * ptr[x-1]
    movss           m5, [ptrq + 4*xq - 4]
    mulss           m5, m0
    addss           m5, [ptrq + 4*xq]
    movss           [ptrq + 4*xq], m5
    inc             xq
    cmp             xq, widthq
    jl              .loop_scalar

.end_scalar:
    ; ptr[width - 1] *= bscale
    dec             xq
    mulss           m5, m1, [ptrq + 4*xq]
    movss           [ptrq + 4*xq], m5
    shufps          m5, m5, 0               ; broadcast so lane 3 holds p0

    ; filter leftwards
    ;    for (; x > 0; x--)
    ;        ptr[x - 1] += nu * ptr[x];
    ; The idea here is basically the same as filtering rightwards,
    ; but we need to take care as the data layout is different.
    ; Let p0 stand for ptr[x], which is the data from the last loop iteration.
    ; The way we do it in SIMD is as below:
    ; [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
    ;                          + [p-3, p-2, p-1, p0] * nu
    ;                          + [p-2, p-1, p0,  0 ] * nu^2
    ;                          + [p-1, p0,  0,   0 ] * nu^3
    ;                          + [p0,  0,   0,   0 ] * nu^4

    ; x == width - 1 here. When that does not exceed `remain` there is no
    ; full group of four to the left; running the vector loop anyway would
    ; touch memory before the start of the row.
    cmp             xq, remainq
    jle             .end_x_back

.loop_x_back:
    sub             xq, 4
    movu            m6, [ptrq + xq * 4]     ; s = [p-4, p-3, p-2, p-1]
    psrldq          m7, m6, 4               ;     [p-3, p-2, p-1, 0  ]
    blendps         m7, m5, 0x8             ;     [p-3, p-2, p-1, p0 ]
    FMULADD_PS      m6, m7, m0, m6, m8      ; s += [p-3, p-2, p-1, p0 ] * nu
    psrldq          m7, 4
    FMULADD_PS      m6, m7, m2, m6, m8      ; s += [p-2, p-1, p0,  0] * nu^2
    psrldq          m7, 4
    FMULADD_PS      m6, m7, m3, m6, m8      ; s += [p-1, p0,  0,   0] * nu^3
    psrldq          m7, 4
    FMULADD_PS      m6, m7, m4, m6, m8      ; s += [p0,  0,   0,   0] * nu^4
    movu            [ptrq + xq * 4], m6
    shufps          m5, m6, m6, 0           ; m5 = [p-4', p-4', p-4', p-4']
    cmp             xq, remainq
    jg              .loop_x_back

.end_x_back:
    test            xq, xq
    jle             .end_scalar_back

.loop_scalar_back:
    ; ptr[x-1] += nu * ptr[x]
    movss           m5, [ptrq + 4*xq]
    mulss           m5, m0
    addss           m5, [ptrq + 4*xq - 4]
    movss           [ptrq + 4*xq - 4], m5
    dec             xq
    test            xq, xq
    jg              .loop_scalar_back

.end_scalar_back:
    ; reset aligned width for the next pass / next line
    sub             widthq, remainq

    inc             stepd
    cmp             stepd, stepsd
    jl              .loop_step

    ; advance to the next row
    add             ptrq, strideq
    inc             yd
    cmp             yd, heightd
    jl              .loop_y

.done:
    RET
%endmacro
|
|
|
|
; Emit horiz_slice once per instruction set, both using XMM registers.
; The routine is only built for x86-64: its cglobal declaration asks for
; 9 general-purpose and 9 vector registers.
%if ARCH_X86_64
INIT_XMM sse4
HORIZ_SLICE

INIT_XMM avx2
HORIZ_SLICE
%endif
|