;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Rounding term added to every 32-bit filter sum before the arithmetic
; shift right by 7.  Eight dwords (32 bytes), so a single aligned load
; fills either an xmm or a ymm register.
pd_64:  times 8 dd 64

; Upper clamp values for the 10-bit and 12-bit entry points; the symbols
; themselves are defined outside this file.
cextern pw_1023
cextern pw_4095

SECTION .text

;------------------------------------------------------------------------------
; filter_h4_fn <put|avg> [, xmm_regs = 12]
;
; Emits two entry points for a horizontal 8-tap filter that writes four
; 16-bit pixels per row:
;   vp9_<op>_8tap_1d_h_4_10   results clamped to [0, 1023]
;   vp9_<op>_8tap_1d_h_4_12   results clamped to [0, 4095]
;
; Arguments (cglobal order): dst, dstride, src, sstride, h, filtery
;   dst/src          pointers to the current output / input row
;   dstride/sstride  added to dst/src once per row, unscaled
;   h                number of rows to produce
;   filtery          base of four coefficient vectors read at byte offsets
;                    0, 32, 64 and 96 (one vector per pair of taps)
;
; Register roles inside the loop:
;   m0-m4  scratch / accumulator      m5   upper clamp (pw_1023 or pw_4095)
;   m6     rounding term (pd_64)      m7   coefficient vector 0
;   m8-m10 coefficient vectors 1-3    (x86-64 only, otherwise read from memory)
;   m11    zero, lower clamp          (x86-64 without SSE4 only)
;
; "avg" additionally averages the filtered row with the existing contents of
; dst (pavgw) before storing.
;
; The _12 entry point only loads its own clamp value and then jumps into the
; .body label of the _10 function, so both share one loop.
;------------------------------------------------------------------------------
%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11                 ; lower clamp for the packssdw path
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    ; enough registers available: keep all coefficient vectors resident
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; Eight overlapping 4-pixel windows, starting at x-3 .. x+4 (2 bytes per
    ; pixel).  Neighbouring windows are interleaved so that each pmaddwd
    ; multiplies one pixel pair by one coefficient pair per output pixel.
    movh        m0, [srcq-6]
    movh        m1, [srcq-4]
    movh        m2, [srcq-2]
    movh        m3, [srcq+0]
    movh        m4, [srcq+2]
    punpcklwd   m0, m1                  ; pairs (x-3, x-2)
    punpcklwd   m2, m3                  ; pairs (x-1, x+0)
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    ; Only the low four words of these windows are consumed by the punpcklwd
    ; instructions below, so 8-byte loads are sufficient.  A full-width load
    ; at srcq+8 would touch 8 bytes beyond the last pixel the filter uses.
    movh        m1, [srcq+4]
    movh        m3, [srcq+6]
    paddd       m0, m2
    movh        m2, [srcq+8]
    add       srcq, sstrideq
    punpcklwd   m4, m1                  ; pairs (x+1, x+2)
    punpcklwd   m3, m2                  ; pairs (x+3, x+4)
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6                  ; + 64
    psrad       m0, 7                   ; >> 7
%if cpuflag(sse4)
    packusdw    m0, m0                  ; unsigned saturation: no lower clamp needed
%else
    packssdw    m0, m0                  ; signed saturation: clamp to >= 0 below
%endif
%ifidn %1, avg
    movh        m1, [dstq]              ; existing destination pixels
%endif
    pminsw      m0, m5                  ; upper clamp
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2                  ; m2 is dead here; no spare register for a resident zero
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0                  ; store 4 pixels (8 bytes)
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg

;------------------------------------------------------------------------------
; filter_h_fn <put|avg> [, xmm_regs = 12]
;
; Horizontal 8-tap filter for a full vector of 16-bit pixels per row.  The
; row width in pixels is mmsize/2 and becomes part of the symbol name:
;   vp9_<op>_8tap_1d_h_<px>_10   results clamped to [0, 1023]
;   vp9_<op>_8tap_1d_h_<px>_12   results clamped to [0, 4095]
;
; Arguments (cglobal order): dst, dstride, src, sstride, h, filtery
;   filtery is the base of four coefficient vectors at byte offsets
;   0, 32, 64 and 96.
;
; m0 accumulates the even-numbered output pixels and m1 the odd-numbered
; ones; they are packed to words and re-interleaved with punpcklwd before
; the store.  m5 = upper clamp, m6 = pd_64, m7..m10 = coefficients (m8..m10
; only on x86-64), m11 = zero on x86-64 builds without SSE4.
;
; dst is accessed with mova / a pavgw memory operand.
; The _12 entry point reuses the loop of the _10 function through .body.
;------------------------------------------------------------------------------
%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif

.loop:
    ; first five source windows, one pixel (2 bytes) apart
    movu        m0, [srcq-6]
    movu        m1, [srcq-4]
    movu        m2, [srcq-2]
    movu        m3, [srcq+0]
    movu        m4, [srcq+2]

    pmaddwd     m0, m7                  ; even pixels, coefficient vector 0
    pmaddwd     m1, m7                  ; odd  pixels, coefficient vector 0
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
    pmaddwd     m4, m9
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
    pmaddwd     m4, [filteryq+64]
%endif
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4

    ; remaining three windows; src can advance once they are loaded
    movu        m2, [srcq+4]
    movu        m3, [srcq+6]
    movu        m4, [srcq+8]
    add       srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m9
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m2, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4

    ; round, scale down and narrow to 16 bits
    paddd       m0, m6
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
    punpcklwd   m0, m1                  ; even/odd back into pixel order

    ; clamp to the valid pixel range
    pminsw      m0, m5
%if notcpuflag(sse4) && ARCH_X86_64
    pmaxsw      m0, m11
%elif notcpuflag(sse4)
    pxor        m2, m2
    pmaxsw      m0, m2
%endif

%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif

;------------------------------------------------------------------------------
; filter_v4_fn <put|avg> [, xmm_regs = 12]
;
; Vertical 8-tap filter writing four 16-bit pixels per row:
;   vp9_<op>_8tap_1d_v_4_10   results clamped to [0, 1023]
;   vp9_<op>_8tap_1d_v_4_12   results clamped to [0, 4095]
;
; Arguments: dst, dstride, src, sstride, h, filtery
;   x86-64: all six are loaded into registers; src4 and sstride3 are extra
;           scratch registers.
;   x86-32: only the first four are loaded.  filtery is fetched from r5mp
;           and the row counter h is decremented in place through r4mp
;           (hd is redefined accordingly).
;
; Row addressing: sstride3 = 3*sstride; src is moved back by three rows and
; src4 is set one row below the original src, so the eight taps read
; src + {0,1,2,3}*sstride and src4 + {0,1,2,3}*sstride.
;
; m5 = upper clamp, m6 = pd_64, m7..m10 = coefficient vectors (m8..m10 only
; on x86-64), m11 = zero on x86-64 builds without SSE4.
;
; The _12 entry point jumps into .body of the _10 function.  It declares the
; same register layout as the _10 function, whose loop it shares.
;------------------------------------------------------------------------------
%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif

.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklwd   m0, m1                  ; rows 0/1 interleaved per column
    punpcklwd   m2, m3                  ; rows 2/3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+ 32]
%endif
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklwd   m4, m1                  ; rows 4/5
    punpcklwd   m3, m2                  ; rows 6/7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m3, [filteryq+ 96]
%endif
    paddd       m0, m4
    paddd       m0, m3

    ; round, scale down and narrow to 16 bits
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif

    ; clamp to the valid pixel range
    pminsw      m0, m5
%if notcpuflag(sse4) && ARCH_X86_64
    pmaxsw      m0, m11
%elif notcpuflag(sse4)
    pxor        m2, m2
    pmaxsw      m0, m2
%endif

%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

;------------------------------------------------------------------------------
; filter_v_fn <put|avg> [, xmm_regs = 13]
;
; Vertical 8-tap filter for a full vector of 16-bit pixels per row.  The row
; width in pixels is mmsize/2 and becomes part of the symbol name:
;   vp9_<op>_8tap_1d_v_<px>_10   results clamped to [0, 1023]
;   vp9_<op>_8tap_1d_v_<px>_12   results clamped to [0, 4095]
;
; Arguments: dst, dstride, src, sstride, h, filtery
;   x86-64: all six are loaded into registers; src4 and sstride3 are extra
;           scratch registers.
;   x86-32: only the first four are loaded.  filtery is fetched from r5mp
;           and the row counter h is decremented in place through r4mp.
;
; Row addressing: the eight taps read src + {0,1,2,3}*sstride (after src has
; been moved back three rows) and src4 + {0,1,2,3}*sstride (src4 starts one
; row below the original src).
;
; m6 is handed to SBUTTERFLY as its temporary, so the rounding term cannot
; live there: on x86-64 it is kept in m11, on x86-32 it is read from memory.
; m5 = upper clamp, m7..m10 = coefficient vectors (m8..m10 only on x86-64),
; m12 = zero on x86-64 builds without SSE4.
;
; NOTE(review): SBUTTERFLY comes from x86util.asm; it is assumed here to
; leave the low-half interleave in its first register and the high-half
; interleave in its second.
;
; The _12 entry point jumps into .body of the _10 function and shares its
; loop.
;------------------------------------------------------------------------------
%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if ARCH_X86_64
%if notcpuflag(sse4)
    pxor       m12, m12
%endif
    mova       m11, [pd_64]
%endif
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif

.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movu        m4, [src4q]

    ; rows 0/1 and 2/3
    SBUTTERFLY  wd, 0, 1, 6
    SBUTTERFLY  wd, 2, 3, 6
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [filteryq+ 32]
    pmaddwd     m3, [filteryq+ 32]
%endif
    paddd       m0, m2
    paddd       m1, m3

    ; rows 4/5
    movu        m2, [src4q+sstrideq]
    movu        m3, [src4q+sstrideq*2]
    SBUTTERFLY  wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m2, m9
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m2, [filteryq+ 64]
%endif
    paddd       m0, m4
    paddd       m1, m2

    ; rows 6/7
    movu        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    SBUTTERFLY  wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m3, [filteryq+ 96]
    pmaddwd     m4, [filteryq+ 96]
%endif
    paddd       m0, m3
    paddd       m1, m4

    ; round, scale down and narrow to 16 bits
%if ARCH_X86_64
    paddd       m0, m11
    paddd       m1, m11
%else
    paddd       m0, [pd_64]
    paddd       m1, [pd_64]
%endif
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m1
%else
    packssdw    m0, m1
%endif

    ; clamp to the valid pixel range
    pminsw      m0, m5
%if notcpuflag(sse4) && ARCH_X86_64
    pmaxsw      m0, m12
%elif notcpuflag(sse4)
    pxor        m2, m2
    pmaxsw      m0, m2
%endif

%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif