early-access version 1432

externals/ffmpeg/libavcodec/x86/vp9mc_16bpp.asm (vendored, executable file, 431 lines added)
@@ -0,0 +1,431 @@
;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pd_64: times 8 dd 64

cextern pw_1023
cextern pw_4095
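; pd_64 is the rounding term for the final 7-bit shift (64 = 1 << 6), and
; pw_1023/pw_4095 are the clip maxima for 10- and 12-bit samples (2^10 - 1
; and 2^12 - 1); the pw_* constants are shared FFmpeg tables, hence cextern.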

SECTION .text

%macro filter_h4_fn 1-2 12
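; %1 = put/avg, %2 = number of xmm registers used (defaults to 12).
; Emits a 1-D horizontal 8-tap filter over 4-pixel-wide blocks for 10-
; and 12-bit content; one output row is produced per loop iteration.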
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova                m5, [pw_1023]
.body:
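; the 12-bit entry point below loads pw_4095 into m5 instead and then jumps
; here, so everything past this label is shared between both bit depths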
%if notcpuflag(sse4) && ARCH_X86_64
    pxor               m11, m11
%endif
    mova                m6, [pd_64]
    mova                m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova                m8, [filteryq+32]
    mova                m9, [filteryq+64]
    mova               m10, [filteryq+96]
%endif
.loop:
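    ; the filter table holds the eight taps as four (even, odd) pairs in
    ; 32-byte blocks at offsets 0/32/64/96; each pmaddwd below applies one
    ; tap pair to word-interleaved source samples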
    movh                m0, [srcq-6]
    movh                m1, [srcq-4]
    movh                m2, [srcq-2]
    movh                m3, [srcq+0]
    movh                m4, [srcq+2]
    punpcklwd           m0, m1
    punpcklwd           m2, m3
    pmaddwd             m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m2, m8
%else
    pmaddwd             m2, [filteryq+32]
%endif
    movu                m1, [srcq+4]
    movu                m3, [srcq+6]
    paddd               m0, m2
    movu                m2, [srcq+8]
    add               srcq, sstrideq
    punpcklwd           m4, m1
    punpcklwd           m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m4, m9
    pmaddwd             m3, m10
%else
    pmaddwd             m4, [filteryq+64]
    pmaddwd             m3, [filteryq+96]
%endif
    paddd               m0, m4
    paddd               m0, m3
    paddd               m0, m6
    psrad               m0, 7
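    ; SSE4's packusdw packs with unsigned saturation, so only the upper
    ; bound still needs clamping (pminsw); the SSE2 path packs with signed
    ; saturation and therefore also clamps negatives to zero via pmaxsw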
%if cpuflag(sse4)
    packusdw            m0, m0
%else
    packssdw            m0, m0
%endif
%ifidn %1, avg
    movh                m1, [dstq]
%endif
    pminsw              m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw              m0, m11
%else
    pxor                m2, m2
    pmaxsw              m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw               m0, m1
%endif
    movh            [dstq], m0
    add               dstq, dstrideq
    dec                 hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova                m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg
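; with private_prefix defaulting to ff (as in FFmpeg builds), the two lines
; above expand to ff_vp9_put_8tap_1d_h_4_10_sse2 / ..._4_12_sse2 plus the
; matching ff_vp9_avg_* symbols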

%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
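; %%px = mmsize/2 pixels per row: 8 for 16-byte xmm registers, 16 for
; 32-byte ymm registers, since each pixel is one 16-bit word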
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova                m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor               m11, m11
%endif
    mova                m6, [pd_64]
    mova                m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova                m8, [filteryq+32]
    mova                m9, [filteryq+64]
    mova               m10, [filteryq+96]
%endif
.loop:
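    ; m0 accumulates the even output pixels and m1 the odd ones: each
    ; unaligned load below is offset by one word from its neighbour, so
    ; after pmaddwd the two dword accumulators cover alternating columns
    ; and are re-interleaved with punpcklwd after the pack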
    movu                m0, [srcq-6]
    movu                m1, [srcq-4]
    movu                m2, [srcq-2]
    movu                m3, [srcq+0]
    movu                m4, [srcq+2]
    pmaddwd             m0, m7
    pmaddwd             m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m2, m8
    pmaddwd             m3, m8
    pmaddwd             m4, m9
%else
    pmaddwd             m2, [filteryq+32]
    pmaddwd             m3, [filteryq+32]
    pmaddwd             m4, [filteryq+64]
%endif
    paddd               m0, m2
    paddd               m1, m3
    paddd               m0, m4
    movu                m2, [srcq+4]
    movu                m3, [srcq+6]
    movu                m4, [srcq+8]
    add               srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m2, m9
    pmaddwd             m3, m10
    pmaddwd             m4, m10
%else
    pmaddwd             m2, [filteryq+64]
    pmaddwd             m3, [filteryq+96]
    pmaddwd             m4, [filteryq+96]
%endif
    paddd               m1, m2
    paddd               m0, m3
    paddd               m1, m4
    paddd               m0, m6
    paddd               m1, m6
    psrad               m0, 7
    psrad               m1, 7
%if cpuflag(sse4)
    packusdw            m0, m0
    packusdw            m1, m1
%else
    packssdw            m0, m0
    packssdw            m1, m1
%endif
    punpcklwd           m0, m1
    pminsw              m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw              m0, m11
%else
    pxor                m2, m2
    pmaxsw              m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw               m0, [dstq]
%endif
    mova            [dstq], m0
    add               dstq, dstrideq
    dec                 hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova                m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif
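; HAVE_AVX2_EXTERNAL is only set when the assembler can emit AVX2; under
; INIT_YMM mmsize is 32, so the same macro body yields 16-pixel-wide
; functions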

%macro filter_v4_fn 1-2 12
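; vertical 4-pixel-wide 8-tap filter; %1 = put/avg. On x86-32 there are
; too few registers to keep both h and filtery resident, so filtery is
; reloaded from the stack (r5mp) and h is addressed in memory as r4mp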
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov           filteryq, r5mp
%define hd r4mp
%endif
    mova                m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor               m11, m11
%endif
    mova                m6, [pd_64]
    lea          sstride3q, [sstrideq*3]
    lea              src4q, [srcq+sstrideq]
    sub               srcq, sstride3q
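    ; srcq now points 3 rows above the output row and src4q 1 row below
    ; it, so the two pointers together cover the 8 tap rows y-3..y+4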
    mova                m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova                m8, [filteryq+ 32]
    mova                m9, [filteryq+ 64]
    mova               m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh                m0, [srcq]
    movh                m1, [srcq+sstrideq]
    movh                m2, [srcq+sstrideq*2]
    movh                m3, [srcq+sstride3q]
    add               srcq, sstrideq
    movh                m4, [src4q]
    punpcklwd           m0, m1
    punpcklwd           m2, m3
    pmaddwd             m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m2, m8
%else
    pmaddwd             m2, [filteryq+ 32]
%endif
    movh                m1, [src4q+sstrideq]
    movh                m3, [src4q+sstrideq*2]
    paddd               m0, m2
    movh                m2, [src4q+sstride3q]
    add              src4q, sstrideq
    punpcklwd           m4, m1
    punpcklwd           m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m4, m9
    pmaddwd             m3, m10
%else
    pmaddwd             m4, [filteryq+ 64]
    pmaddwd             m3, [filteryq+ 96]
%endif
    paddd               m0, m4
    paddd               m0, m3
    paddd               m0, m6
    psrad               m0, 7
%if cpuflag(sse4)
    packusdw            m0, m0
%else
    packssdw            m0, m0
%endif
%ifidn %1, avg
    movh                m1, [dstq]
%endif
    pminsw              m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw              m0, m11
%else
    pxor                m2, m2
    pmaxsw              m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw               m0, m1
%endif
    movh            [dstq], m0
    add               dstq, dstrideq
    dec                 hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov           filteryq, r5mp
%endif
    mova                m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

%macro filter_v_fn 1-2 13
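; full-width vertical 8-tap filter; note the default of 13 xmm registers:
; unlike the 4-pixel version this one keeps pd_64 in m11 on x86-64, so the
; zero clamp moves up to m12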
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov           filteryq, r5mp
%define hd r4mp
%endif
    mova                m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor               m12, m12
%endif
%if ARCH_X86_64
    mova               m11, [pd_64]
%endif
    lea          sstride3q, [sstrideq*3]
    lea              src4q, [srcq+sstrideq]
    sub               srcq, sstride3q
    mova                m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova                m8, [filteryq+ 32]
    mova                m9, [filteryq+ 64]
    mova               m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
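    ; SBUTTERFLY wd, a, b, t (from x86util.asm) word-interleaves rows a
    ; and b: register a receives the low-half interleave and b the high
    ; half, with t as scratch; the interleaved row pairs then feed pmaddwd
    ; with one (even, odd) tap pair each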
    movu                m0, [srcq]
    movu                m1, [srcq+sstrideq]
    movu                m2, [srcq+sstrideq*2]
    movu                m3, [srcq+sstride3q]
    add               srcq, sstrideq
    movu                m4, [src4q]
    SBUTTERFLY          wd, 0, 1, 6
    SBUTTERFLY          wd, 2, 3, 6
    pmaddwd             m0, m7
    pmaddwd             m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m2, m8
    pmaddwd             m3, m8
%else
    pmaddwd             m2, [filteryq+ 32]
    pmaddwd             m3, [filteryq+ 32]
%endif
    paddd               m0, m2
    paddd               m1, m3
    movu                m2, [src4q+sstrideq]
    movu                m3, [src4q+sstrideq*2]
    SBUTTERFLY          wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m4, m9
    pmaddwd             m2, m9
%else
    pmaddwd             m4, [filteryq+ 64]
    pmaddwd             m2, [filteryq+ 64]
%endif
    paddd               m0, m4
    paddd               m1, m2
    movu                m4, [src4q+sstride3q]
    add              src4q, sstrideq
    SBUTTERFLY          wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd             m3, m10
    pmaddwd             m4, m10
%else
    pmaddwd             m3, [filteryq+ 96]
    pmaddwd             m4, [filteryq+ 96]
%endif
    paddd               m0, m3
    paddd               m1, m4
%if ARCH_X86_64
    paddd               m0, m11
    paddd               m1, m11
%else
    paddd               m0, [pd_64]
    paddd               m1, [pd_64]
%endif
    psrad               m0, 7
    psrad               m1, 7
%if cpuflag(sse4)
    packusdw            m0, m1
%else
    packssdw            m0, m1
%endif
    pminsw              m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw              m0, m12
%else
    pxor                m2, m2
    pmaxsw              m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw               m0, [dstq]
%endif
    mova            [dstq], m0
    add               dstq, dstrideq
    dec                 hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov           filteryq, r5mp
%endif
    mova                m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif
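
; in sum, this file provides put/avg x horizontal/vertical x 4/8-pixel
; (plus 16-pixel with AVX2) x 10/12-bit 1-D 8-tap MC functions; they are
; picked up at runtime, presumably by the 16 bpp vp9dsp init code on the
; C side, based on detected CPU flags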