/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
 | 
						|
#include "neon.S"
 | 
						|
 | 
						|
// ff_vp8_luma_dc_wht_neon: 4x4 inverse Walsh-Hadamard transform.
//
// In:
//   x0  destination; one 16-bit result is written every 32 bytes
//       (16 results in total)
//       NOTE(review): presumably the DC slot of 16 consecutive
//       16-coefficient blocks -- confirm against the C prototype.
//   x1  4x4 block of 16-bit input coefficients (32 bytes); it is
//       overwritten with zeros once it has been loaded
// Clobbers: x0 (advanced by 16*32), x1 (advanced by 16), x3,
//           v0-v7, v16, v30
function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]           // rows 0..3 of the input
        movi            v30.8h, #0

        // First pass: butterflies between the four rows.  The two zeroing
        // stores are interleaved with the arithmetic.
        add             v4.4h,  v0.4h,  v3.4h           // r0 + r3
        add             v6.4h,  v1.4h,  v2.4h           // r1 + r2
        st1             {v30.8h}, [x1], #16             // clear first half of the input
        sub             v7.4h,  v1.4h,  v2.4h           // r1 - r2
        sub             v5.4h,  v0.4h,  v3.4h           // r0 - r3
        st1             {v30.8h}, [x1]                  // clear second half of the input
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        movi            v16.4h, #3                      // rounding bias for the final >> 3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // v0 contributes with a positive sign to all four outputs of the
        // second pass, so biasing it once rounds every result.
        add             v0.4h,  v0.4h,  v16.4h

        // Second pass: same butterflies on the transposed data.
        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        sshr            v0.4h,  v0.4h,  #3
        sshr            v1.4h,  v1.4h,  #3
        sshr            v2.4h,  v2.4h,  #3
        sshr            v3.4h,  v3.4h,  #3

        // Scatter the 16 results: lane n of v0, v1, v2, v3 in turn,
        // each store 32 bytes after the previous one.
        mov             x3,  #32
        st1             {v0.h}[0],  [x0], x3
        st1             {v1.h}[0],  [x0], x3
        st1             {v2.h}[0],  [x0], x3
        st1             {v3.h}[0],  [x0], x3
        st1             {v0.h}[1],  [x0], x3
        st1             {v1.h}[1],  [x0], x3
        st1             {v2.h}[1],  [x0], x3
        st1             {v3.h}[1],  [x0], x3
        st1             {v0.h}[2],  [x0], x3
        st1             {v1.h}[2],  [x0], x3
        st1             {v2.h}[2],  [x0], x3
        st1             {v3.h}[2],  [x0], x3
        st1             {v0.h}[3],  [x0], x3
        st1             {v1.h}[3],  [x0], x3
        st1             {v2.h}[3],  [x0], x3
        st1             {v3.h}[3],  [x0], x3

        ret
endfunc
 | 
						|
 | 
						|
// ff_vp8_idct_add_neon: 4x4 inverse DCT of one coefficient block, added
// to the destination pixels with unsigned saturation.
//
// In:
//   x0  destination pixels, 4 rows of 4 bytes
//   x1  4x4 block of 16-bit coefficients (32 bytes); zeroed by this function
//   x2  destination stride in bytes
// Clobbers: x1 (advanced by 16), w4, v0-v7, v16-v27, v29
//           (x0 ends up 4 rows past its entry value)
//
// Multipliers (see the VP8 inverse-DCT description in RFC 6386):
//   v4.h[0] = 20091    used as  x + ((x * 20091) >> 16)
//   v4.h[1] = 35468/2  used through sqdmulh, whose built-in doubling
//                      yields      (x * 35468) >> 16
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.4h - v3.4h},  [x1]          // rows 0..3, used as 16-bit lanes below
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s, w4

        // ---- first pass ----
        smull           v26.4s, v1.4h,  v4.h[0]         // r1 * 20091
        smull           v27.4s, v3.4h,  v4.h[0]         // r3 * 20091
        sqdmulh         v20.4h, v1.4h,  v4.h[1]         // (r1 * 35468) >> 16
        sqdmulh         v23.4h, v3.4h,  v4.h[1]         // (r3 * 35468) >> 16
        shrn            v21.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h           // r1 + ((r1 * 20091) >> 16)
        add             v22.4h, v22.4h, v3.4h           // r3 + ((r3 * 20091) >> 16)

        add             v16.4h,  v0.4h,   v2.4h         // r0 + r2
        sub             v17.4h,  v0.4h,   v2.4h         // r0 - r2

        add             v18.4h,  v21.4h,  v23.4h
        sub             v19.4h,  v20.4h,  v22.4h

        add             v0.4h,   v16.4h,  v18.4h
        add             v1.4h,   v17.4h,  v19.4h
        sub             v3.4h,   v16.4h,  v18.4h
        sub             v2.4h,   v17.4h,  v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        // ---- second pass ----
        // Same arithmetic as above (v20/v21 swap roles), interleaved with
        // clearing the coefficient block and loading the destination rows.
        movi            v29.8h, #0
        smull           v26.4s,     v1.4h,  v4.h[0]
        st1             {v29.8h},   [x1],   #16         // clear first half of the block
        smull           v27.4s,     v3.4h,  v4.h[0]
        st1             {v29.8h},   [x1]                // clear second half of the block
        sqdmulh         v21.4h,     v1.4h,  v4.h[1]
        sqdmulh         v23.4h,     v3.4h,  v4.h[1]
        shrn            v20.4h,     v26.4s, #16
        shrn            v22.4h,     v27.4s, #16
        add             v20.4h,     v20.4h, v1.4h
        add             v22.4h,     v22.4h, v3.4h
        add             v16.4h,     v0.4h,  v2.4h
        sub             v17.4h,     v0.4h,  v2.4h

        add             v18.4h,     v20.4h, v23.4h
        ld1             {v24.s}[0], [x0],   x2          // dst row 0
        sub             v19.4h,     v21.4h, v22.4h
        ld1             {v25.s}[0], [x0],   x2          // dst row 1
        add             v0.4h,      v16.4h, v18.4h
        add             v1.4h,      v17.4h, v19.4h
        ld1             {v26.s}[0], [x0],   x2          // dst row 2
        sub             v3.4h,      v16.4h, v18.4h
        sub             v2.4h,      v17.4h, v19.4h
        ld1             {v27.s}[0], [x0],   x2          // dst row 3
        srshr           v0.4h,      v0.4h,  #3          // (x + 4) >> 3
        srshr           v1.4h,      v1.4h,  #3
        srshr           v2.4h,      v2.4h,  #3
        srshr           v3.4h,      v3.4h,  #3

        sub             x0,  x0,  x2,  lsl #2           // rewind dst by the 4 rows just read

        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16

        // Add the residual to the pixels.  Only the low four lanes carry
        // meaningful data; only those four bytes are stored back.
        uaddw           v0.8h,  v0.8h, v24.8b
        uaddw           v1.8h,  v1.8h, v25.8b
        uaddw           v2.8h,  v2.8h, v26.8b
        uaddw           v3.8h,  v3.8h, v27.8b
        sqxtun          v0.8b,  v0.8h                   // saturate to 0..255
        sqxtun          v1.8b,  v1.8h
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h

        st1             {v0.s}[0],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v2.s}[0],  [x0], x2
        st1             {v3.s}[0],  [x0], x2

        ret
endfunc
 | 
						|
 | 
						|
// ff_vp8_idct_dc_add4uv_neon: DC-only inverse transform for four 4x4
// blocks, added to an 8x8 pixel area.
//
// In:
//   x0  destination pixels, 8 rows of 8 bytes
//   x1  coefficients; one 16-bit DC value is read every 32 bytes (four in
//       total) and each is replaced by zero
//   x2  destination stride in bytes
//
// DC n is applied as follows (each DC is first rounded: (dc + 4) >> 3):
//   rows 0-3: DC0 -> columns 0-3, DC1 -> columns 4-7
//   rows 4-7: DC2 -> columns 0-3, DC3 -> columns 4-7
//
// Clobbers: x0, x1, x3, v0-v7, v16-v27
function ff_vp8_idct_dc_add4uv_neon, export=1
        movi            v0.4h,  #0
        mov             x3,     #32                     // byte distance between DC values
        ld1r            {v16.4h},  [x1]                 // DC0 in all four lanes
        st1             {v0.h}[0], [x1], x3             // clear it, step to next DC
        ld1r            {v17.4h},  [x1]                 // DC1
        st1             {v0.h}[0], [x1], x3
        ld1r            {v18.4h},  [x1]                 // DC2
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]                 // DC3
        st1             {v0.h}[0], [x1], x3
        ins             v16.d[1],  v17.d[0]             // v16 = DC0 x4 | DC1 x4
        ins             v18.d[1],  v19.d[0]             // v18 = DC2 x4 | DC3 x4
        mov             x3,  x0                         // x3 = store pointer, x0 = load pointer
        srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
        ld1             {v0.8b},   [x0], x2
        srshr           v18.8h,    v18.8h,  #3
        ld1             {v1.8b},   [x0], x2
        uaddw           v20.8h,    v16.8h, v0.8b        // row 0
        ld1             {v2.8b},   [x0], x2
        uaddw           v0.8h,     v16.8h, v1.8b        // row 1
        ld1             {v3.8b},   [x0], x2
        uaddw           v22.8h,    v16.8h, v2.8b        // row 2
        ld1             {v4.8b},   [x0], x2
        uaddw           v2.8h,     v16.8h, v3.8b        // row 3
        ld1             {v5.8b},   [x0], x2
        uaddw           v24.8h,    v18.8h, v4.8b        // row 4
        ld1             {v6.8b},   [x0], x2
        uaddw           v4.8h,     v18.8h, v5.8b        // row 5
        ld1             {v7.8b},   [x0], x2
        uaddw           v26.8h,    v18.8h, v6.8b        // row 6
        sqxtun          v20.8b,    v20.8h               // saturate to 0..255
        uaddw           v6.8h,     v18.8h, v7.8b        // row 7
        sqxtun          v21.8b,    v0.8h
        sqxtun          v22.8b,    v22.8h
        st1             {v20.8b},  [x3], x2
        sqxtun          v23.8b,    v2.8h
        st1             {v21.8b},  [x3], x2
        sqxtun          v24.8b,    v24.8h
        st1             {v22.8b},  [x3], x2
        sqxtun          v25.8b,    v4.8h
        st1             {v23.8b},  [x3], x2
        sqxtun          v26.8b,    v26.8h
        st1             {v24.8b},  [x3], x2
        sqxtun          v27.8b,    v6.8h
        st1             {v25.8b},  [x3], x2
        st1             {v26.8b},  [x3], x2
        st1             {v27.8b},  [x3], x2

        ret
endfunc
 | 
						|
 | 
						|
// ff_vp8_idct_dc_add4y_neon: DC-only inverse transform for four 4x4
// blocks lying side by side, added to a 16x4 pixel area.
//
// In:
//   x0  destination pixels, 4 rows of 16 bytes
//   x1  coefficients; one 16-bit DC value is read every 32 bytes (four in
//       total) and each is replaced by zero
//   x2  destination stride in bytes
//
// After rounding ((dc + 4) >> 3), DC n is added to columns 4n..4n+3 of
// every row.
//
// Clobbers: x0 (rewound, then advanced by 4 rows), x1, x3, v0-v3, v16-v23
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b,  #0
        mov             x3,  #32                        // byte distance between DC values
        ld1r            {v16.4h},    [x1]               // DC0 in all four lanes
        st1             {v0.h}[0],   [x1], x3           // clear it, step to next DC
        ld1r            {v17.4h},    [x1]               // DC1
        st1             {v0.h}[0],   [x1], x3
        zip1            v16.2d,      v16.2d, v17.2d     // v16 = DC0 x4 | DC1 x4
        ld1r            {v18.4h},    [x1]               // DC2
        st1             {v0.h}[0],   [x1], x3
        ld1r            {v19.4h},    [x1]               // DC3
        st1             {v0.h}[0],   [x1], x3
        zip1            v18.2d,      v18.2d, v19.2d     // v18 = DC2 x4 | DC3 x4
        srshr           v16.8h,      v16.8h,  #3            // dc >>= 3
        ld1             {v0.16b},     [x0], x2
        srshr           v18.8h,       v18.8h,  #3
        ld1             {v1.16b},     [x0], x2
        uaddw           v20.8h,       v16.8h,  v0.8b    // row 0, columns 0-7
        ld1             {v2.16b},     [x0], x2
        uaddw2          v0.8h,        v18.8h,   v0.16b  // row 0, columns 8-15
        ld1             {v3.16b},     [x0], x2
        uaddw           v21.8h, v16.8h,  v1.8b          // row 1
        uaddw2          v1.8h,  v18.8h,  v1.16b
        uaddw           v22.8h, v16.8h,  v2.8b          // row 2
        uaddw2          v2.8h,  v18.8h,  v2.16b
        uaddw           v23.8h, v16.8h,  v3.8b          // row 3
        uaddw2          v3.8h,  v18.8h,  v3.16b
        sub             x0,  x0,  x2,  lsl #2           // rewind dst by the 4 rows just read
        sqxtun          v20.8b,  v20.8h                 // saturate to 0..255
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b},    [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b},    [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b},    [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b},    [x0], x2

        ret
endfunc
 | 
						|
 | 
						|
// ff_vp8_idct_dc_add_neon: DC-only inverse transform of a single 4x4
// block, added to the destination pixels with unsigned saturation.
//
// In:
//   x0  destination pixels, 4 rows of 4 bytes
//   x1  coefficients; only the first 16-bit value (the DC) is read, and it
//       is replaced by zero
//   x2  destination stride in bytes
// Clobbers: x0 (rewound, then advanced by 4 rows), w3, v0-v4
function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,       #0
        ld1r            {v2.8h},  [x1]                  // DC in all eight lanes
        strh            w3,       [x1]                  // clear the DC coefficient
        srshr           v2.8h,  v2.8h,  #3              // dc = (dc + 4) >> 3
        ld1             {v0.s}[0],  [x0], x2            // rows 0 and 1 packed into v0
        ld1             {v0.s}[1],  [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0],  [x0], x2            // rows 2 and 3 packed into v1
        ld1             {v1.s}[1],  [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h                   // saturate to 0..255
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2            // rewind dst by the 4 rows just read
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v1.s}[1],  [x0], x2
        ret
endfunc
 | 
						|
 | 
						|
// vp8_loop_filter: filter core shared by the loop-filter entry points.
// Works on 16 pixel positions at once, one byte lane per position.
//
// Macro arguments:
//   inner       1 selects the inner-edge filter
//   simple      1 selects the simple filter (only P1..Q1 are used)
//   hev_thresh  32-bit register holding the high-edge-variance threshold;
//               only read when simple=0
//   With inner=0 and simple=0 the macroblock-edge filter is emitted.
//
// Register layout on entry:
//   P3..Q3 -> v0..v7   (simple: only v2..v5 = P1, P0, Q0, Q1 are read)
//   flim_E -> v22
//   flim_I -> v23      (not read when simple=1)
//
// On exit the filtered pixels are back in place in unsigned form:
//   simple / inner : v2..v5 (P1..Q1)
//   macroblock edge: v1..v6 (P2..Q2)
// v0 and v7 are never written.  v16-v23 are clobbered.
//
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)  (flim_I is dead from here)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh          // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev            (only computed when simple=0)
        //   v21: #0x80

        // convert to signed value:
        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

        movi           v20.8h, #3
        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
        mul            v19.8h, v19.8h, v20.8h

        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
        movi           v22.16b, #4
        movi           v23.16b, #3
    .if \inner
        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
        saddw2         v19.8h,  v19.8h, v20.16b
        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
        sqxtn2         v18.16b, v19.8h
    .if !\inner && !\simple
        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2  (v1/v6 are converted to signed only for the
        //                      macroblock-edge filter)
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v20 -> free, reused as scratch below
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp(w+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp(w+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        // (hev was already folded into w above, where the clamp(PS1-QS1)
        //  term was masked with it.)
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp(w+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp(w+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        srshr          v19.16b, v19.16b, #1                // c3 = ((c1&~hev) + 1) >> 1
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
        and            v20.16b, v18.16b, v17.16b           // w & hev
        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi           v17.8h,  #63
        sshll          v22.8h,  v18.8b, #3               // 8*w  (widened to 16bit)
        sshll2         v23.8h,  v18.16b, #3
        saddw          v22.8h,  v22.8h, v18.8b           // 9*w
        saddw2         v23.8h,  v23.8h, v18.16b
        add            v16.8h,  v17.8h, v22.8h
        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
        add            v19.8h,  v16.8h, v22.8h
        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
        add            v22.8h,  v19.8h, v22.8h
        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
        sqshrn         v16.8b,  v16.8h,  #7
        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
        sqshrn         v19.8b,  v19.8h, #7
        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
        sqshrn         v22.8b,  v22.8h, #7
        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
.endm
 | 
						|
 | 
						|
// vp8_v_loop_filter16: emits ff_vp8_v_loop_filter16<name>_neon, which
// filters across a horizontal edge, 16 pixels wide.
//
// Macro arguments:
//   name    suffix inserted into the function name
//   inner   forwarded to vp8_loop_filter
//   simple  forwarded to vp8_loop_filter; also limits the rows touched
//           to P1..Q1
//
// Arguments of the generated function:
//   x0  pointer to the first row below the edge (Q0)
//   x1  stride in bytes
//   w2  flim_E
//   w3  flim_I       (not read when simple=1)
//   w4  hev_thresh   (not read when simple=1)
// Clobbers: x0, v0-v7 (as loaded), v16-v23
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        // step back to the first row needed: 4 rows (P3), or 2 rows (P1)
        // for the simple filter
        sub             x0,  x0,  x1,  lsl #1+!\simple

        // Load pixels:
    .if !\simple
        ld1             {v0.16b},     [x0], x1 // P3
        ld1             {v1.16b},     [x0], x1 // P2
    .endif
        ld1             {v2.16b},     [x0], x1 // P1
        ld1             {v3.16b},     [x0], x1 // P0
        ld1             {v4.16b},     [x0], x1 // Q0
        ld1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        ld1             {v6.16b},     [x0], x1 // Q2
        ld1             {v7.16b},     [x0]     // Q3
        dup             v23.16b, w3                 // flim_I
    .endif
        dup             v22.16b, w2                 // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to the first row to store:
        //   simple:     dst -= stride * 4  (-> P1)
        //   otherwise:  dst -= stride * 6  (-> P2)
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b},     [x0], x1 // P2
    .endif
        st1             {v2.16b},     [x0], x1 // P1
        st1             {v3.16b},     [x0], x1 // P0
        st1             {v4.16b},     [x0], x1 // Q0
        st1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        st1             {v6.16b},     [x0]     // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
 | 
						|
 | 
						|
// vp8_v_loop_filter8uv: emits ff_vp8_v_loop_filter8uv<name>_neon, which
// filters across a horizontal edge in two 8-pixel-wide planes at once.
// The plane at x0 occupies the low 64 bits of each vector, the plane at
// x1 the high 64 bits.
//
// Macro arguments:
//   name   suffix inserted into the function name
//   inner  forwarded to vp8_loop_filter
//
// Arguments of the generated function:
//   x0  first plane (u), pointer to the first row below the edge (Q0)
//   x1  second plane (v), pointer to the first row below the edge (Q0)
//   x2  stride in bytes, shared by both planes
//   w3  flim_E
//   w4  flim_I
//   w5  hev_thresh
// Clobbers: x0, x1, v0-v7 (as loaded), v16-v23
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2           // step back 4 rows to P3
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
        ld1          {v0.d}[0],     [x0], x2  // P3
        ld1          {v0.d}[1],     [x1], x2  // P3
        ld1          {v1.d}[0],     [x0], x2  // P2
        ld1          {v1.d}[1],     [x1], x2  // P2
        ld1          {v2.d}[0],     [x0], x2  // P1
        ld1          {v2.d}[1],     [x1], x2  // P1
        ld1          {v3.d}[0],     [x0], x2  // P0
        ld1          {v3.d}[1],     [x1], x2  // P0
        ld1          {v4.d}[0],     [x0], x2  // Q0
        ld1          {v4.d}[1],     [x1], x2  // Q0
        ld1          {v5.d}[0],     [x0], x2  // Q1
        ld1          {v5.d}[1],     [x1], x2  // Q1
        ld1          {v6.d}[0],     [x0], x2  // Q2
        ld1          {v6.d}[1],     [x1], x2  // Q2
        ld1          {v7.d}[0],     [x0]      // Q3
        ld1          {v7.d}[1],     [x1]      // Q3

        dup          v22.16b, w3                 // flim_E
        dup          v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub          x0,  x0,  x2,  lsl #2
        sub          x1,  x1,  x2,  lsl #2
        sub          x0,  x0,  x2,  lsl #1
        sub          x1,  x1,  x2,  lsl #1

        // Store pixels:

        st1          {v1.d}[0],     [x0], x2  // P2
        st1          {v1.d}[1],     [x1], x2  // P2
        st1          {v2.d}[0],     [x0], x2  // P1
        st1          {v2.d}[1],     [x1], x2  // P1
        st1          {v3.d}[0],     [x0], x2  // P0
        st1          {v3.d}[1],     [x1], x2  // P0
        st1          {v4.d}[0],     [x0], x2  // Q0
        st1          {v4.d}[1],     [x1], x2  // Q0
        st1          {v5.d}[0],     [x0], x2  // Q1
        st1          {v5.d}[1],     [x1], x2  // Q1
        st1          {v6.d}[0],     [x0]      // Q2
        st1          {v6.d}[1],     [x1]      // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
 | 
						|
 | 
						|
// Loop filter applied along a 16-row column of pixels.
// Expands to ff_vp8_h_loop_filter16<name>_neon.
//   x0 = pixel pointer; every row is accessed as the 8 bytes at x0 - 4
//   x1 = row stride
//   w2 = flim_E, w3 = flim_I (only broadcast when simple=0), w4 = hev_thresh
// The inner/simple flags are forwarded unchanged to vp8_loop_filter.
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0, x0, #4

        // Gather 16 rows of 8 bytes: rows 0-7 fill the low 64 bits of
        // v0-v7, rows 8-15 fill the high 64 bits.
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        // v30/v31 are handed to the transpose helper as extra registers
        // (presumably scratch; the helper is defined outside this file).
        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2             // flim_E in every byte lane
    .if !\simple
        dup             v23.16b, w3             // flim_I in every byte lane
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // The 16 post-incremented loads moved x0 down 16 rows; rewind it.
        sub             x0, x0, x1, lsl #4

        // Scatter the rows back in the same order they were gathered.
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

// Instantiate the normal, inner and simple variants.
vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
 | 
						|
 | 
						|
// Loop filter applied along an 8-row column of two planes at once.
// Expands to ff_vp8_h_loop_filter8uv<name>_neon.
//   x0 = u pointer, x1 = v pointer; each row is accessed as the 8 bytes
//        starting 4 bytes before the pointer
//   x2 = row stride (shared by both planes)
//   w3 = flim_E, w4 = flim_I, w5 = hev_thresh
// The u rows occupy the low 64 bits of v0-v7 and the v rows the high 64 bits.
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0, x0, #4
        sub             x1, x1, #4

        // Gather 8 rows from each plane, alternating u and v.
        ld1             {v0.d}[0], [x0], x2     // u row 0
        ld1             {v0.d}[1], [x1], x2     // v row 0
        ld1             {v1.d}[0], [x0], x2
        ld1             {v1.d}[1], [x1], x2
        ld1             {v2.d}[0], [x0], x2
        ld1             {v2.d}[1], [x1], x2
        ld1             {v3.d}[0], [x0], x2
        ld1             {v3.d}[1], [x1], x2
        ld1             {v4.d}[0], [x0], x2
        ld1             {v4.d}[1], [x1], x2
        ld1             {v5.d}[0], [x0], x2
        ld1             {v5.d}[1], [x1], x2
        ld1             {v6.d}[0], [x0], x2
        ld1             {v6.d}[1], [x1], x2
        ld1             {v7.d}[0], [x0], x2
        ld1             {v7.d}[1], [x1], x2

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3             // flim_E in every byte lane
        dup             v23.16b, w4             // flim_I in every byte lane

        vp8_loop_filter inner=\inner, hev_thresh=w5

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Each pointer advanced 8 rows while loading; rewind both.
        sub             x0, x0, x2, lsl #3
        sub             x1, x1, x2, lsl #3

        // Scatter the rows back, again alternating u and v.
        st1             {v0.d}[0], [x0], x2     // store u row 0
        st1             {v0.d}[1], [x1], x2     // store v row 0
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x1], x2
        st1             {v2.d}[0], [x0], x2
        st1             {v2.d}[1], [x1], x2
        st1             {v3.d}[0], [x0], x2
        st1             {v3.d}[1], [x1], x2
        st1             {v4.d}[0], [x0], x2
        st1             {v4.d}[1], [x1], x2
        st1             {v5.d}[0], [x0], x2
        st1             {v5.d}[1], [x1], x2
        st1             {v6.d}[0], [x0], x2
        st1             {v6.d}[1], [x1], x2
        st1             {v7.d}[0], [x0]
        st1             {v7.d}[1], [x1]

        ret

endfunc
.endm

// Instantiate the normal and inner variants.
vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
 | 
						|
 | 
						|
 | 
						|
// Plain copy of a 16-byte-wide block, four rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count
// The loop repeats while the remaining count is still positive after
// subtracting 4, so at least four rows are always copied.
function ff_put_vp8_pixels16_neon, export=1
1:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        subs            w4, w4, #4              // flags survive the stores below
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        b.gt            1b
        ret
endfunc
 | 
						|
 | 
						|
// Plain copy of an 8-byte-wide block, four rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count
// Two rows are packed per vector register (low half, then high half).
function ff_put_vp8_pixels8_neon, export=1
1:
        ld1             {v0.8b},   [x2], x3     // row 0 -> v0 low
        ld1             {v0.d}[1], [x2], x3     // row 1 -> v0 high
        ld1             {v1.8b},   [x2], x3     // row 2 -> v1 low
        ld1             {v1.d}[1], [x2], x3     // row 3 -> v1 high
        subs            w4, w4, #4              // flags survive the stores below
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc
 | 
						|
 | 
						|
/* 4/6-tap 8th-pel MC */
 | 
						|
 | 
						|
// 6-tap horizontal filter producing 8 output bytes.
//   d      = destination register (written as .8b)
//   s0, s1 = two consecutive 8-byte source registers; tap k uses the bytes
//            starting k positions into the s0:s1 pair
// Reads the filter taps from v0.h[0..5]; taps 1 and 4 are subtracted.
// Clobbers v18, v19 and v21-v26. The instruction order is kept as is because
// v22 and v25 are reused (byte form first, widened form afterwards).
.macro  vp8_epel8_h6    d,   s0,   s1
        ext             v22.8b,   \s0\().8b, \s1\().8b, #1
        uxtl            v18.8h,   \s0\().8b             // tap 0 input
        ext             v23.8b,   \s0\().8b, \s1\().8b, #2
        uxtl            v19.8h,   v22.8b                // tap 1 input
        ext             v24.8b,   \s0\().8b, \s1\().8b, #3
        uxtl            v21.8h,   v23.8b                // tap 2 input
        ext             v25.8b,   \s0\().8b, \s1\().8b, #4
        uxtl            v22.8h,   v24.8b                // tap 3 input
        ext             v26.8b,   \s0\().8b, \s1\().8b, #5
        uxtl            v25.8h,   v25.8b                // tap 4 input
        mul             v21.8h,   v21.8h,    v0.h[2]
        uxtl            v26.8h,   v26.8b                // tap 5 input
        mul             v22.8h,   v22.8h,    v0.h[3]
        mls             v21.8h,   v19.8h,    v0.h[1]
        mls             v22.8h,   v25.8h,    v0.h[4]
        mla             v21.8h,   v18.8h,    v0.h[0]    // v21 = taps 0..2
        mla             v22.8h,   v26.8h,    v0.h[5]    // v22 = taps 3..5
        sqadd           v22.8h,   v21.8h,    v22.8h     // saturating sum
        sqrshrun        \d\().8b, v22.8h,    #7         // round, >> 7, clamp to u8
.endm
 | 
						|
 | 
						|
// 6-tap horizontal filter producing 16 output bytes.
//   d0     = destination register (written as .16b)
//   v0, v1 = macro arguments naming two consecutive 16-byte source registers
//            (referenced in the body with a leading backslash)
// The literal register v0 (no backslash) holds the filter taps in h[0..5];
// taps 1 and 4 are subtracted.
// Clobbers v1, v2, v3 and v16-v23. Visible callers pass v1/v2 as the sources,
// which is why the order below must not change: v2 is overwritten only by the
// last ext that reads the second source, and v1 only after every read of the
// first source has been issued.
.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b,    \v0\().16b, \v1\().16b, #3
        ext             v23.16b,    \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,     v22.8b              // tap 3, low 8 pixels
        uxtl2           v22.8h,     v22.16b             // tap 3, high 8 pixels
        ext             v3.16b,     \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,     v23.8b              // tap 4, low
        uxtl2           v23.8h,     v23.16b             // tap 4, high
        ext             v16.16b,    \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,     v3.8b               // tap 2, low
        uxtl2           v3.8h,      v3.16b              // tap 2, high
        ext             v2.16b,     \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,     v2.8b               // tap 5, low
        uxtl2           v2.8h,      v2.16b              // tap 5, high
        uxtl            v17.8h,     v16.8b              // tap 1, low
        uxtl2           v16.8h,     v16.16b             // tap 1, high
        mul             v19.8h,     v19.8h,     v0.h[3]
        mul             v18.8h,     v18.8h,     v0.h[2]
        mul             v3.8h,      v3.8h,      v0.h[2]
        mul             v22.8h,     v22.8h,     v0.h[3]
        mls             v19.8h,     v20.8h,     v0.h[4]
        uxtl            v20.8h,     \v0\().8b           // tap 0, low (v20 is free again)
        uxtl2           v1.8h,      \v0\().16b          // tap 0, high
        mls             v18.8h,     v17.8h,     v0.h[1]
        mls             v3.8h,      v16.8h,     v0.h[1]
        mls             v22.8h,     v23.8h,     v0.h[4]
        mla             v18.8h,     v20.8h,     v0.h[0]
        mla             v19.8h,     v21.8h,     v0.h[5]
        mla             v3.8h,      v1.8h,      v0.h[0]
        mla             v22.8h,     v2.8h,      v0.h[5]
        sqadd           v19.8h,     v18.8h,     v19.8h  // low: taps 0..2 + taps 3..5
        sqadd           v22.8h,     v3.8h,      v22.8h  // high: taps 0..2 + taps 3..5
        sqrshrun        \d0\().8b,  v19.8h,     #7
        sqrshrun2       \d0\().16b, v22.8h,     #7
.endm
 | 
						|
 | 
						|
// 6-tap vertical filter producing two 8-byte output rows from seven input rows.
//   d0, d1  = destination registers (written as .8b)
//   s0..s6  = seven consecutive source rows, 8 bytes each in the low half
//   d0 is filtered from s0..s5, d1 from s1..s6
// Reads the filter taps from v0.h[0..5]; taps 1 and 4 are subtracted.
// All seven source registers are widened in place and s0, s3 and s6 are used
// as accumulators, so none of the inputs survive. Also clobbers v31.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        // Widen every row from 8 x u8 to 8 x u16.
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b

        // Four partial sums are built in parallel:
        //   s0  = row 0, taps 0..2      v31 = row 0, taps 3..5
        //   s3  = row 1, taps 0..2      s6  = row 1, taps 3..5
        // v31 must take its copy of s3 before s3 is overwritten.
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h,    \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h,    \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h,    \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]

        // Combine the halves with saturation, then round, shift and narrow.
        sqadd           v31.8h,    \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm
 | 
						|
 | 
						|
// 4-tap horizontal filter producing 8 output bytes.
//   d      = destination register (written as .8b)
//   v0, v1 = macro arguments naming two consecutive 8-byte source registers
//            (referenced in the body with a leading backslash)
// The literal register v0 (no backslash) holds the filter taps; only h[1..4]
// are used, with taps 1 and 4 subtracted.
// Clobbers v19, v20, v22, v23 and v25. Order is kept because v22 is reused
// (byte form first, widened form afterwards).
.macro  vp8_epel8_h4    d,   v0,   v1
        ext             v22.8b,   \v0\().8b, \v1\().8b, #1
        uxtl            v19.8h,   \v0\().8b             // tap 1 input
        ext             v23.8b,   \v0\().8b, \v1\().8b, #2
        uxtl            v20.8h,   v22.8b                // tap 2 input
        ext             v25.8b,   \v0\().8b, \v1\().8b, #3
        uxtl            v22.8h,   v23.8b                // tap 3 input
        uxtl            v25.8h,   v25.8b                // tap 4 input
        mul             v20.8h,   v20.8h,    v0.h[2]
        mul             v22.8h,   v22.8h,    v0.h[3]
        mls             v20.8h,   v19.8h,    v0.h[1]
        mls             v22.8h,   v25.8h,    v0.h[4]
        sqadd           v22.8h,   v20.8h,    v22.8h     // saturating sum
        sqrshrun        \d\().8b, v22.8h,    #7         // round, >> 7, clamp to u8
.endm
 | 
						|
 | 
						|
// 4-tap vertical filter producing two 8-byte output rows from five input rows.
//   d0      = destination; first output row in the low 64 bits, second in
//             the high 64 bits
//   s0..s4  = five consecutive source rows, 8 bytes each in the low half
//   the first row is filtered from s0..s3, the second from s1..s4
// Uses filter taps v0.h[1..4]; taps 1 and 4 are subtracted.
// All five sources are widened in place and s2 becomes an accumulator.
// Also clobbers v21, v22 and v23.
.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        // Widen every row from 8 x u8 to 8 x u16.
        uxtl            \s0\().8h,  \s0\().8b
        uxtl            \s1\().8h,  \s1\().8b
        uxtl            \s2\().8h,  \s2\().8b
        uxtl            \s3\().8h,  \s3\().8b
        uxtl            \s4\().8h,  \s4\().8b

        // v21/v23 accumulate the first row, s2/v22 the second one.
        // v23 must take its copy of s2 before s2 is overwritten.
        mul             v21.8h,     \s1\().8h,  v0.h[2]
        mul             v23.8h,     \s2\().8h,  v0.h[3]
        mul             \s2\().8h,  \s2\().8h,  v0.h[2]
        mul             v22.8h,     \s3\().8h,  v0.h[3]
        mls             v21.8h,     \s0\().8h,  v0.h[1]
        mls             v23.8h,     \s3\().8h,  v0.h[4]
        mls             \s2\().8h,  \s1\().8h,  v0.h[1]
        mls             v22.8h,     \s4\().8h,  v0.h[4]

        sqadd           v21.8h,     v21.8h,     v23.8h
        sqadd           \s2\().8h,  \s2\().8h,  v22.8h
        sqrshrun        \d0\().8b,  v21.8h,     #7
        sqrshrun2       \d0\().16b, \s2\().8h,  #7
.endm
 | 
						|
 | 
						|
 | 
						|
// Subpel filter taps, one 16-byte row (six taps plus two zero pads) per
// filter index. Users in this file take the table address with a -16 byte
// bias and add index * 16, so index 1 selects the first row.
// The taps are stored as magnitudes: the filter macros subtract the products
// of taps 1 and 4 and add the others.
// Note: the worst case sum of all 6-tap filter values * 255 is 0x7f80, so
// 16 bit arithmetic can be used to apply the filters.
const   subpel_filters, align=4
        .short  0,  6, 123,  12,  1, 0, 0, 0    // index 1
        .short  2, 11, 108,  36,  8, 1, 0, 0    // index 2
        .short  0,  9,  93,  50,  6, 0, 0, 0    // index 3
        .short  3, 16,  77,  77, 16, 3, 0, 0    // index 4
        .short  0,  6,  50,  93,  9, 0, 0, 0    // index 5
        .short  1,  8,  36, 108, 11, 2, 0, 0    // index 6
        .short  0,  1,  12, 123,  6, 0, 0, 0    // index 7
endconst
 | 
						|
 | 
						|
// 16-pixel-wide 6-tap vertical subpel filter, two output rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count (stepped by 2; the loop ends only on an exact zero)
//   w6 = vertical filter index into subpel_filters
function ff_put_vp8_epel16_v6_neon, export=1
        // v0 = filter taps for index w6
        movrel          x17, subpel_filters, -16
        sxtw            x6, w6
        add             x6, x17, x6, lsl #4
        ld1             {v0.8h}, [x6]

        sxtw            x4, w4
        sub             x2, x2, x3, lsl #1      // first input row is two rows above src
1:
        // Seven input rows, each split into two 8-byte halves.
        ld1             {v1.1d,  v2.1d},  [x2], x3
        ld1             {v3.1d,  v4.1d},  [x2], x3
        ld1             {v16.1d, v17.1d}, [x2], x3
        ld1             {v18.1d, v19.1d}, [x2], x3
        ld1             {v20.1d, v21.1d}, [x2], x3
        ld1             {v22.1d, v23.1d}, [x2], x3
        ld1             {v24.1d, v25.1d}, [x2]
        sub             x2, x2, x3, lsl #2      // 6 rows forward, 4 back: net +2

        // Bytes 0-7 of both output rows, then bytes 8-15.
        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d, v2.1d}, [x0], x1
        st1             {v3.1d, v4.1d}, [x0], x1
        subs            x4, x4, #2
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// 16-pixel-wide 6-tap horizontal subpel filter, one row per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count (the loop ends only on an exact zero)
//   w5 = horizontal filter index into subpel_filters
function ff_put_vp8_epel16_h6_neon, export=1
        // v0 = filter taps for index w5
        movrel          x17, subpel_filters, -16
        sxtw            x5, w5
        add             x5, x17, x5, lsl #4
        ld1             {v0.8h}, [x5]

        sub             x2, x2, #2              // first input byte is two left of src
1:
        ld1             {v1.16b, v2.16b}, [x2], x3      // 32 source bytes
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x0], x1

        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc
 | 
						|
 | 
						|
 | 
						|
// 16-pixel-wide two-pass subpel filter: 6-tap horizontal, then 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count h (stepped by 2 in the second pass; loop ends on exact zero)
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 writes h + 5 filtered 16-byte rows into a 16-byte aligned scratch
// buffer carved out of a 352-byte stack frame; pass 2 consumes seven of those
// rows per iteration to produce two output rows.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2, x2, x3, lsl #1      // two rows up ...
        sub             x2, x2, #2              // ... and two bytes left

        // ---- pass 1: horizontal ----
        movrel          x17, subpel_filters, -16
        sxtw            x5, w5
        add             x16, x17, x5, lsl #4    // x16 = address of horizontal taps
        sub             sp, sp, #352            // 336 bytes of rows + 16 alignment slack
        ld1             {v0.8h}, [x16]
        add             x7, sp, #15
        sxtw            x4, w4
        add             x16, x4, #5             // x16 = rows to filter (h + 5)
        bic             x7, x7, #15             // x7 = 16-byte aligned scratch base
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b

        // ---- pass 2: vertical ----
        sxtw            x6, w6
        add             x6, x17, x6, lsl #4     // address of vertical taps
        add             x7, sp, #15
        ld1             {v0.8h}, [x6]
        bic             x7, x7, #15             // back to the start of the scratch rows
2:
        // Seven scratch rows; each row occupies two consecutive 8-byte registers.
        ld1             {v1.8b,  v2.8b,  v3.8b,  v4.8b},  [x7], #32
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #32
        ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [x7], #32
        ld1             {v24.8b, v25.8b}, [x7]
        sub             x7, x7, #64             // 96 forward, 64 back: net two rows

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
        trn1            v1.2d, v1.2d, v2.2d     // join the two halves of output row 0
        trn1            v3.2d, v3.2d, v4.2d     // join the two halves of output row 1

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp, sp, #352
        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide 6-tap vertical subpel filter, two output rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count (stepped by 2; the loop ends only on an exact zero)
//   w6 = vertical filter index into subpel_filters
function ff_put_vp8_epel8_v6_neon, export=1
        // v0 = filter taps for index w6
        movrel          x7, subpel_filters, -16
        add             x6, x7, w6, uxtw #4
        ld1             {v0.8h}, [x6]

        sub             x2, x2, x3, lsl #1      // first input row is two rows above src
1:
        // Seven consecutive input rows.
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]
        sub             x2, x2, x3, lsl #2      // 6 rows forward, 4 back: net +2

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        subs            w4, w4, #2
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide 6-tap horizontal subpel filter, one row per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count (the loop ends only on an exact zero)
//   w5 = horizontal filter index into subpel_filters
function ff_put_vp8_epel8_h6_neon, export=1
        // v0 = filter taps for index w5
        movrel          x7, subpel_filters, -16
        add             x5, x7, w5, uxtw #4
        ld1             {v0.8h}, [x5]

        sub             x2, x2, #2              // first input byte is two left of src
1:
        ld1             {v2.8b, v3.8b}, [x2], x3        // 16 source bytes
        vp8_epel8_h6    v2, v2, v3
        st1             {v2.8b}, [x0], x1

        subs            w4, w4, #1
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide two-pass subpel filter: 6-tap horizontal, then 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count h (stepped by 2 in the second pass; loop ends on exact zero)
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 writes h + 5 filtered 8-byte rows into a 16-byte aligned scratch
// buffer on the stack; pass 2 consumes seven of those rows per iteration to
// produce two output rows.
// Stack frame: the rows need at most 168 bytes (21 rows of 8 bytes, i.e.
// h <= 16 -- NOTE(review): confirm against callers). That is rounded up to
// 176 and 16 bytes of slack are added, giving 192, so sp remains a multiple
// of 16 for the whole function instead of being left 8 bytes off.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // two rows up ...
        sub             x2,  x2,  #2            // ... and two bytes left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #176+16       // keeps sp 16-byte aligned
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5   // h
        bic             x7,  x7,  #15           // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15          // back to the first scratch row
2:
        ld1             {v1.8b - v4.8b}, [x7], #32      // scratch rows 0-3
        ld1             {v5.8b - v7.8b}, [x7]           // scratch rows 4-6

        sub             x7,  x7,  #16           // 32 forward, 16 back: net two rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide 4-tap vertical subpel filter, two output rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count (stepped by 2; the loop ends only on an exact zero)
//   w6 = vertical filter index into subpel_filters
function ff_put_vp8_epel8_v4_neon, export=1
        // v0 = filter taps for index w6
        movrel          x7, subpel_filters, -16
        add             x6, x7, w6, uxtw #4
        ld1             {v0.8h}, [x6]

        sub             x2, x2, x3              // first input row is one row above src
1:
        // Five consecutive input rows.
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
        ld1             {v4.8b}, [x2], x3
        ld1             {v5.8b}, [x2], x3
        ld1             {v6.8b}, [x2]
        sub             x2, x2, x3, lsl #1      // 4 rows forward, 2 back: net +2

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        // Both output rows come back packed in v2.
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        subs            w4, w4, #2
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide 4-tap horizontal subpel filter, one row per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count (the loop ends only on an exact zero)
//   w5 = horizontal filter index into subpel_filters
function ff_put_vp8_epel8_h4_neon, export=1
        // v0 = filter taps for index w5
        movrel          x7, subpel_filters, -16
        add             x5, x7, w5, uxtw #4
        ld1             {v0.8h}, [x5]

        sub             x2, x2, #1              // first input byte is one left of src
1:
        ld1             {v2.8b, v3.8b}, [x2], x3        // 16 source bytes
        vp8_epel8_h4    v2, v2, v3
        st1             {v2.8b}, [x0], x1

        subs            w4, w4, #1
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide two-pass subpel filter: 4-tap horizontal, then 6-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count h (stepped by 2 in the second pass; loop ends on exact zero)
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 writes h + 5 filtered 8-byte rows into a 16-byte aligned scratch
// buffer on the stack; pass 2 consumes seven of those rows per iteration to
// produce two output rows.
// Stack frame: the rows need at most 168 bytes (21 rows of 8 bytes, i.e.
// h <= 16 -- NOTE(review): confirm against callers). That is rounded up to
// 176 and 16 bytes of slack are added, giving 192, so sp remains a multiple
// of 16 for the whole function instead of being left 8 bytes off.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // two rows up ...
        sub             x2,  x2,  #1            // ... and one byte left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #176+16       // keeps sp 16-byte aligned
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #5   // h
        bic             x7,  x7,  #15           // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15          // back to the first scratch row
2:
        ld1             {v1.8b - v4.8b}, [x7], #32      // scratch rows 0-3
        ld1             {v5.8b - v7.8b}, [x7]           // scratch rows 4-6

        sub             x7,  x7,  #16           // 32 forward, 16 back: net two rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide two-pass subpel filter: 4-tap horizontal, then 4-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count h (stepped by 2 in the second pass; loop ends on exact zero)
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 writes h + 3 filtered 8-byte rows into a 16-byte aligned scratch
// buffer on the stack; pass 2 consumes five of those rows per iteration to
// produce two output rows.
// Stack frame: 176 + 16 = 192 bytes, the same size as the 6-tap vertical
// variants (more than the h + 3 rows strictly need). Being a multiple of 16,
// it keeps sp 16-byte aligned for the whole function instead of leaving it
// 8 bytes off.
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3            // one row up ...
        sub             x2,  x2,  #1            // ... and one byte left
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #176+16       // keeps sp 16-byte aligned
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15           // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15          // back to the first scratch row
2:
        ld1             {v1.8b - v2.8b}, [x7], #16      // scratch rows 0-1, advance two rows
        ld1             {v3.8b - v5.8b}, [x7]           // scratch rows 2-4, no advance

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        // Both output rows come back packed in v1.
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
 | 
						|
 | 
						|
// 8-pixel-wide two-pass subpel filter: 6-tap horizontal, then 4-tap vertical.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count h (stepped by 2 in the second pass; loop ends on exact zero)
//   w5 = horizontal filter index, w6 = vertical filter index
// Pass 1 writes h + 3 filtered 8-byte rows into a 16-byte aligned scratch
// buffer on the stack; pass 2 consumes five of those rows per iteration to
// produce two output rows.
// Stack frame: 176 + 16 = 192 bytes, the same size as the 6-tap vertical
// variants (more than the h + 3 rows strictly need). Being a multiple of 16,
// it keeps sp 16-byte aligned for the whole function instead of leaving it
// 8 bytes off.
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3            // one row up ...
        sub             x2,  x2,  #2            // ... and two bytes left
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #176+16       // keeps sp 16-byte aligned
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15           // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15          // back to the first scratch row
2:
        ld1             {v1.8b - v2.8b}, [x7], #16      // scratch rows 0-1, advance two rows
        ld1             {v3.8b - v5.8b}, [x7]           // scratch rows 2-4, no advance

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        // Both output rows come back packed in v1.
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
 | 
						|
 | 
						|
// 4-pixel-wide 6-tap vertical subpel filter, four output rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   w4 = row count (stepped by 4; the loop ends only on an exact zero)
//   w6 = vertical filter index into subpel_filters
// Two 4-byte rows share each 8-byte vector: lane 0 carries the window that
// yields output rows 0-1, lane 1 the window (two rows lower) for rows 2-3.
function ff_put_vp8_epel4_v6_neon, export=1
        // v0 = filter taps for index w6
        movrel          x7, subpel_filters, -16
        add             x6, x7, w6, uxtw #4
        ld1             {v0.8h}, [x6]

        sub             x2, x2, x3, lsl #1      // first input row is two rows above src
1:
        // Window for output rows 0-1 (ld1r fills both lanes; lane 1 is
        // replaced below).
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2], x3
        ld1r            {v7.2s},    [x2], x3
        ld1r            {v28.2s},   [x2]
        sub             x2, x2, x3, lsl #2      // 6 forward, 4 back: two rows lower

        // Window for output rows 2-3 goes into lane 1.
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2], x3
        ld1             {v7.s}[1],  [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2, x2, x3, lsl #2      // net advance per iteration: four rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.s}[0],  [x0], x1    // output row 0
        st1             {v3.s}[0],  [x0], x1    // output row 1
        st1             {v2.s}[1],  [x0], x1    // output row 2
        st1             {v3.s}[1],  [x0], x1    // output row 3
        subs            w4, w4, #4
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_epel4_h6_neon: 4-pixel-wide block, horizontal-only subpel filter.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        rows left; one row per pass, loop ends when it reaches zero
//   w5        picks the coefficient row at subpel_filters - 16 + (w5 << 4)
//             (assuming movrel's third operand is a byte offset)
function ff_put_vp8_epel4_h6_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]         // eight 16-bit coefficients
        sub             x2,  x2,  #2                // begin two pixels left of the block
1:
        ld1             {v2.8b,v3.8b}, [x2], x3     // 16 source bytes of this row
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x0], x1         // only the low 4 result bytes are kept
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_epel4_h6v6_neon: 4-pixel-wide block, horizontal then vertical
// subpel filter, done in two passes through a stack scratch buffer.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        output rows; pass 2 consumes 4 per iteration and loops with
//             b.ne, so it has to reach exactly zero
//   w5, w6    pick the horizontal / vertical coefficient rows at
//             subpel_filters - 16 + (index << 4)
//             (assuming movrel's third operand is a byte offset)
//   x8, x9    pass-1 row counter / scratch buffer cursor
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1       // two rows above the block ...
        sub             x2,  x2,  #2                // ... and two pixels to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]         // horizontal coefficients

        // Pass 1 writes (h + 5) rows of 4 bytes. The code previously reserved
        // 52 bytes (13 rows, i.e. h <= 8 -- TODO confirm against callers);
        // reserve 64 so that sp stays 16-byte aligned while it is moved.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],     [x9], #4     // keep 4 filtered pixels per row
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]         // vertical coefficients replace v0
        mov             x9,  sp
2:
        // First view of the buffer: rows n..n+3, n+4..n+5, and n+6 (broadcast).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16               // back to row n+2
        // Second view, shifted by two rows: n+2..n+5, n+6..n+7, n+8 (lane 1).
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16               // leave x9 at row n+4 for the next pass
        // Pair each row (lane 0) with the row two below it (lane 1).
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        // Lane 1 lags lane 0 by two rows, hence the interleaved store order.
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64               // must match the reservation above
        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_epel4_h4v6_neon: 4-pixel-wide block; pass 1 applies the
// vp8_epel8_h4 horizontal filter into a stack scratch buffer, pass 2 applies
// the vp8_epel8_v6_y2 vertical filter from that buffer to the destination.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        output rows; pass 2 consumes 4 per iteration and loops with
//             b.ne, so it has to reach exactly zero
//   w5, w6    pick the horizontal / vertical coefficient rows at
//             subpel_filters - 16 + (index << 4)
//             (assuming movrel's third operand is a byte offset)
//   x8, x9    pass-1 row counter / scratch buffer cursor
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1       // two rows above the block ...
        sub             x2,  x2,  #1                // ... and one pixel to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]         // horizontal coefficients

        // Pass 1 writes (h + 5) rows of 4 bytes. The code previously reserved
        // 52 bytes (13 rows, i.e. h <= 8 -- TODO confirm against callers);
        // reserve 64 so that sp stays 16-byte aligned while it is moved.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4     // keep 4 filtered pixels per row
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]         // vertical coefficients replace v0
        mov             x9,  sp
2:
        // First view of the buffer: rows n..n+3, n+4..n+5, and n+6 (broadcast).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16               // back to row n+2
        // Second view, shifted by two rows: n+2..n+5, n+6..n+7, n+8 (lane 1).
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16               // leave x9 at row n+4 for the next pass
        // Pair each row (lane 0) with the row two below it (lane 1).
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        // Lane 1 lags lane 0 by two rows, hence the interleaved store order.
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64               // must match the reservation above
        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_epel4_h6v4_neon: 4-pixel-wide block; pass 1 applies the
// vp8_epel8_h6 horizontal filter into a stack scratch buffer, pass 2 applies
// the vp8_epel8_v4_y2 vertical filter from that buffer to the destination.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        output rows; pass 2 consumes 4 per iteration and loops with
//             b.ne, so it has to reach exactly zero
//   w5, w6    pick the horizontal / vertical coefficient rows at
//             subpel_filters - 16 + (index << 4)
//             (assuming movrel's third operand is a byte offset)
//   x8, x9    pass-1 row counter / scratch buffer cursor
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             x2,  x2,  x3                // one row above the block ...
        sub             x2,  x2,  #2                // ... and two pixels to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]         // horizontal coefficients

        // Pass 1 writes (h + 3) rows of 4 bytes. The code previously reserved
        // 44 bytes (11 rows, i.e. h <= 8 -- TODO confirm against callers);
        // reserve 48 so that sp stays 16-byte aligned while it is moved.
        sub             sp,  sp,  #48
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2, v2, v3
        st1             {v2.s}[0],     [x9], #4     // keep 4 filtered pixels per row
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]         // vertical coefficients replace v0
        mov             x9,  sp
2:
        // First view of the buffer: rows n..n+3 and n+4 (broadcast).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1r            {v6.2s},       [x9]
        sub             x9,  x9,  #8                // back to row n+2
        // Second view, shifted by two rows: n+2..n+5 and n+6 (lane 1).
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v6.s}[1],     [x9]
        sub             x9,  x9,  #8                // leave x9 at row n+4 for the next pass
        // Pair each row (lane 0) with the row two below it (lane 1).
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        // Result rows are stored from lanes 0, 2, 1, 3 in that order.
        st1             {v1.s}[0],  [x0], x1
        st1             {v1.s}[2],  [x0], x1
        st1             {v1.s}[1],  [x0], x1
        st1             {v1.s}[3],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #48               // must match the reservation above
        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_epel4_h4_neon: 4-pixel-wide block, horizontal-only filter using
// the vp8_epel8_h4 macro.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        rows left; one row per pass, loop ends when it reaches zero
//   w5        picks the coefficient row at subpel_filters - 16 + (w5 << 4)
//             (assuming movrel's third operand is a byte offset)
function ff_put_vp8_epel4_h4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},    [x5]            // eight 16-bit coefficients
        sub             x2,  x2,  #1                // begin one pixel left of the block
1:
        ld1             {v2.8b},    [x2], x3        // 8 source bytes of this row
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],  [x0], x1        // only the low 4 result bytes are kept
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_epel4_v4_neon: 4-pixel-wide block, vertical-only filter using
// the vp8_epel8_v4_y2 macro.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        rows left; lowered by 4 per pass and tested with b.ne, so it
//             has to reach exactly zero
//   w6        picks the coefficient row at subpel_filters - 16 + (w6 << 4)
//             (assuming movrel's third operand is a byte offset)
function ff_put_vp8_epel4_v4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]             // eight 16-bit coefficients
        sub             x2,  x2,  x3                // begin one row above the block
1:
        // Lane 0 of v2..v6 <- rows n .. n+4 (ld1r fills both lanes).
        ld1r            {v2.2s},   [x2], x3
        ld1r            {v3.2s},   [x2], x3
        ld1r            {v4.2s},   [x2], x3
        ld1r            {v5.2s},   [x2], x3
        ld1r            {v6.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #1       // rewind from row n+4 to row n+2
        // Lane 1 of the same registers <- rows n+2 .. n+6.
        ld1             {v2.s}[1], [x2], x3
        ld1             {v3.s}[1], [x2], x3
        ld1             {v4.s}[1], [x2], x3
        ld1             {v5.s}[1], [x2], x3
        ld1             {v6.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #1       // rewind from row n+6 to row n+4 for the next pass

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        // Result rows are stored from lanes 0, 2, 1, 3 in that order.
        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[2], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v2.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_epel4_h4v4_neon: 4-pixel-wide block; pass 1 applies the
// vp8_epel8_h4 horizontal filter into a stack scratch buffer, pass 2 applies
// the vp8_epel8_v4_y2 vertical filter from that buffer to the destination.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        output rows; pass 2 consumes 4 per iteration and loops with
//             b.ne, so it has to reach exactly zero
//   w5, w6    pick the horizontal / vertical coefficient rows at
//             subpel_filters - 16 + (index << 4)
//             (assuming movrel's third operand is a byte offset)
//   x8, x9    pass-1 row counter / scratch buffer cursor
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             x2,  x2,  x3                // one row above the block ...
        sub             x2,  x2,  #1                // ... and one pixel to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]         // horizontal coefficients

        // Pass 1 writes (h + 3) rows of 4 bytes. The code previously reserved
        // 44 bytes (11 rows, i.e. h <= 8 -- TODO confirm against callers);
        // reserve 48 so that sp stays 16-byte aligned while it is moved.
        sub             sp,  sp,  #48
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        // Only v2 is loaded here, so pass it as both sources, the same way
        // ff_put_vp8_epel4_h4_neon and ff_put_vp8_epel4_h4v6_neon do. The
        // previous third operand (v3) is never written by this function
        // before this point.
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4     // keep 4 filtered pixels per row
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]         // vertical coefficients replace v0
        mov             x9,  sp
2:
        // First view of the buffer: rows n..n+3 and n+4 (broadcast).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1r            {v6.2s},       [x9]
        sub             x9,  x9,  #8                // back to row n+2
        // Second view, shifted by two rows: n+2..n+5 and n+6 (lane 1).
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v6.s}[1],     [x9]
        sub             x9,  x9,  #8                // leave x9 at row n+4 for the next pass
        // Pair each row (lane 0) with the row two below it (lane 1).
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        // Result rows are stored from lanes 0, 2, 1, 3 in that order.
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #48               // must match the reservation above
        ret
endfunc
 | 
						|
 | 
						|
/* Bilinear MC */
 | 
						|
 | 
						|
// ff_put_vp8_bilin16_h_neon: 16-pixel-wide horizontal bilinear filter.
// Each output byte is ((8 - w5) * p[x] + w5 * p[x + 1] + 4) >> 3.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step (24 bytes are read per row)
//   w4        rows left; two rows per pass, loop continues while > 0
//   w5        weight of the right-hand neighbour
function ff_put_vp8_bilin16_h_neon, export=1
        dup             v0.8b,  w5                          // weight for p[x + 1]
        mov             w7,     #8
        sub             w5,     w7,     w5
        dup             v1.8b,  w5                          // weight for p[x]
1:
        ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3    // first row of the pair
        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3    // second row of the pair
        // Build the "one pixel to the right" vectors for both rows.
        ext             v5.8b,  v3.8b,  v4.8b,  #1
        ext             v4.8b,  v2.8b,  v3.8b,  #1
        ext             v21.8b, v19.8b, v20.8b, #1
        ext             v20.8b, v18.8b, v19.8b, #1
        // First row: low and high 8 pixels.
        umull           v16.8h, v2.8b,  v1.8b
        umlal           v16.8h, v4.8b,  v0.8b
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v5.8b,  v0.8b
        // Second row: low and high 8 pixels.
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        // Round, shift by 3 and narrow back to bytes.
        rshrn           v4.8b,  v16.8h, #3
        rshrn2          v4.16b, v6.8h,  #3
        rshrn           v6.8b,  v22.8h, #3
        rshrn2          v6.16b, v24.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        subs            w4,     w4,     #2
        b.gt            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin16_v_neon: 16-pixel-wide vertical bilinear filter.
// Each output byte is ((8 - w6) * p[y] + w6 * p[y + 1] + 4) >> 3.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        rows left; two rows per pass, loop continues while > 0
//   w6        weight of the row below
function ff_put_vp8_bilin16_v_neon, export=1
        ld1             {v2.16b}, [x2], x3          // topmost row
        dup             v0.16b, w6                  // weight for the lower row
        mov             w7,     #8
        sub             w6,     w7,     w6
        dup             v1.16b, w6                  // weight for the upper row
1:
        ld1             {v4.16b}, [x2], x3          // row y + 1
        umull           v6.8h,  v2.8b,  v1.8b
        umlal           v6.8h,  v4.8b,  v0.8b
        umull2          v16.8h, v2.16b, v1.16b
        umlal2          v16.8h, v4.16b, v0.16b
        ld1             {v2.16b}, [x2], x3          // row y + 2, also the top row of the next pass
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v2.8b,  v0.8b
        umull2          v20.8h, v4.16b, v1.16b
        umlal2          v20.8h, v2.16b, v0.16b
        // Round, shift by 3 and narrow back to bytes.
        rshrn           v4.8b,  v6.8h,  #3
        rshrn2          v4.16b, v16.8h, #3
        rshrn           v6.8b,  v18.8h, #3
        rshrn2          v6.16b, v20.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        subs            w4,     w4,     #2
        b.gt            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin16_hv_neon: 16-pixel-wide bilinear filter, horizontal then
// vertical. Every source row is filtered horizontally (weights 8 - mx, mx,
// rounding shift by 3) and adjacent filtered rows are then blended vertically
// (weights 8 - my, my, rounding shift by 3).
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step (24 bytes are read per row)
//   w4        rows left; two rows per pass, loop continues while > 0
//   w5, w6    mx, my
function ff_put_vp8_bilin16_hv_neon, export=1
        dup             v0.8b,   w5            // mx
        dup             v2.16b,  w6            // my
        mov             w7,      #8
        sub             w5,      w7,     w5
        sub             w6,      w7,     w6
        dup             v1.8b,   w5            // 8 - mx
        dup             v3.16b,  w6            // 8 - my

        // Horizontal pass over the first row; result is kept in v4.
        ld1             {v4.8b,v5.8b,v6.8b},    [x2], x3
        ext             v7.8b,   v5.8b,  v6.8b, #1
        ext             v6.8b,   v4.8b,  v5.8b, #1
        umull           v16.8h,  v4.8b,  v1.8b
        umlal           v16.8h,  v6.8b,  v0.8b
        umull           v18.8h,  v5.8b,  v1.8b
        umlal           v18.8h,  v7.8b,  v0.8b
        rshrn           v4.8b,   v16.8h, #3
        rshrn2          v4.16b,  v18.8h, #3
1:
        // Horizontal pass over the next two rows (accumulators v22/v24 and v16/v18).
        ld1             {v18.8b,v19.8b,v20.8b},  [x2], x3
        ext             v21.8b,  v19.8b, v20.8b, #1
        ext             v20.8b,  v18.8b, v19.8b, #1
        umull           v22.8h,  v18.8b, v1.8b
        umlal           v22.8h,  v20.8b, v0.8b
        ld1             {v26.8b,v27.8b,v28.8b},  [x2], x3
        umull           v24.8h,  v19.8b, v1.8b
        umlal           v24.8h,  v21.8b, v0.8b
        ext             v29.8b,  v27.8b, v28.8b, #1
        ext             v28.8b,  v26.8b, v27.8b, #1
        umull           v16.8h,  v26.8b, v1.8b
        umlal           v16.8h,  v28.8b, v0.8b
        umull           v18.8h,  v27.8b, v1.8b
        umlal           v18.8h,  v29.8b, v0.8b
        rshrn           v6.8b,   v22.8h, #3
        rshrn2          v6.16b,  v24.8h, #3    // v6 = filtered middle row
        // Vertical blend of previous row (v4) with middle row (v6).
        umull           v24.8h,  v4.8b,  v3.8b
        umlal           v24.8h,  v6.8b,  v2.8b
        umull2          v30.8h,  v4.16b, v3.16b
        umlal2          v30.8h,  v6.16b, v2.16b
        rshrn           v4.8b,   v16.8h, #3
        rshrn2          v4.16b,  v18.8h, #3    // v4 = filtered last row, carried into the next pass
        // Vertical blend of middle row (v6) with last row (v4).
        umull           v20.8h,  v6.8b,  v3.8b
        umlal           v20.8h,  v4.8b,  v2.8b
        umull2          v22.8h,  v6.16b, v3.16b
        umlal2          v22.8h,  v4.16b, v2.16b
        rshrn           v24.8b,  v24.8h, #3
        rshrn2          v24.16b, v30.8h, #3
        st1             {v24.16b}, [x0], x1
        rshrn           v20.8b,  v20.8h, #3
        rshrn2          v20.16b, v22.8h, #3
        st1             {v20.16b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin8_h_neon: 8-pixel-wide horizontal bilinear filter.
// Each output byte is ((8 - w5) * p[x] + w5 * p[x + 1] + 4) >> 3.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step (16 bytes are read per row)
//   w4        rows left; two rows per pass, loop continues while > 0
//   w5        weight of the right-hand neighbour
function ff_put_vp8_bilin8_h_neon, export=1
        dup             v0.8b,  w5                  // weight for p[x + 1]
        mov             w7,     #8
        sub             w5,     w7,     w5
        dup             v1.8b,  w5                  // weight for p[x]
1:
        ld1             {v2.8b,v3.8b},  [x2],  x3   // first row of the pair
        ld1             {v6.8b,v7.8b},  [x2],  x3   // second row of the pair
        ext             v3.8b,  v2.8b,  v3.8b, #1   // first row shifted one pixel
        ext             v7.8b,  v6.8b,  v7.8b, #1   // second row shifted one pixel
        umull           v4.8h,  v2.8b,  v1.8b
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        umlal           v16.8h, v7.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        rshrn           v16.8b, v16.8h, #3
        st1             {v4.8b},  [x0], x1
        st1             {v16.8b}, [x0], x1
        subs            w4,     w4,     #2
        b.gt            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin8_v_neon: 8-pixel-wide vertical bilinear filter.
// Each output byte is ((8 - w6) * p[y] + w6 * p[y + 1] + 4) >> 3.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        rows left; two rows per pass, loop continues while > 0
//   w6        weight of the row below
function ff_put_vp8_bilin8_v_neon, export=1
        ld1             {v2.8b}, [x2],  x3          // topmost row
        dup             v0.8b,   w6                 // weight for the lower row
        mov             w7,      #8
        sub             w6,      w7,    w6
        dup             v1.8b,   w6                 // weight for the upper row
1:
        ld1             {v3.8b}, [x2],  x3          // row y + 1
        umull           v4.8h,   v2.8b, v1.8b
        ld1             {v2.8b}, [x2],  x3          // row y + 2, also the top row of the next pass
        umlal           v4.8h,   v3.8b, v0.8b
        umull           v6.8h,   v3.8b, v1.8b
        umlal           v6.8h,   v2.8b, v0.8b
        rshrn           v4.8b,   v4.8h, #3
        rshrn           v6.8b,   v6.8h, #3
        st1             {v4.8b}, [x0],  x1
        st1             {v6.8b}, [x0],  x1
        subs            w4,      w4,    #2
        b.gt            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin8_hv_neon: 8-pixel-wide bilinear filter, horizontal then
// vertical. Every source row is filtered horizontally (weights 8 - mx, mx,
// rounding shift by 3) and adjacent filtered rows are then blended vertically
// (weights 8 - my, my, rounding shift by 3).
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step (16 bytes are read per row)
//   w4        rows left; two rows per pass, loop continues while > 0
//   w5, w6    mx, my
function ff_put_vp8_bilin8_hv_neon, export=1
        dup             v0.8b,  w5             // mx
        dup             v2.8b,  w6             // my
        mov             w7,     #8
        sub             w5,     w7,     w5
        sub             w6,     w7,     w6
        dup             v1.8b,  w5             // 8 - mx
        dup             v3.8b,  w6             // 8 - my

        // Horizontal pass over the first row; result is kept in v22.
        ld1             {v4.8b,v5.8b},  [x2],  x3
        ext             v5.8b,  v4.8b,  v5.8b, #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v22.8b, v18.8h, #3
1:
        // Horizontal pass over the next two rows (accumulators v16 and v18).
        ld1             {v6.8b,v7.8b},  [x2],  x3
        ext             v7.8b,  v6.8b,  v7.8b, #1
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        ld1             {v4.8b,v5.8b},  [x2],  x3
        ext             v5.8b,  v4.8b,  v5.8b, #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v16.8b, v16.8h, #3     // v16 = filtered middle row
        // Vertical blend of previous row (v22) with middle row (v16).
        umull           v20.8h, v22.8b, v3.8b
        umlal           v20.8h, v16.8b, v2.8b
        rshrn           v22.8b, v18.8h, #3     // v22 = filtered last row, carried into the next pass
        // Vertical blend of middle row (v16) with last row (v22).
        umull           v24.8h, v16.8b, v3.8b
        umlal           v24.8h, v22.8b, v2.8b
        rshrn           v20.8b, v20.8h, #3
        st1             {v20.8b}, [x0], x1
        rshrn           v23.8b, v24.8h, #3
        st1             {v23.8b}, [x0], x1
        subs            w4,     w4,     #2
        b.gt            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin4_h_neon: 4-pixel-wide horizontal bilinear filter.
// Each output byte is ((8 - w5) * p[x] + w5 * p[x + 1] + 4) >> 3.
// Two rows are packed into one 8-byte vector and filtered together.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step (8 bytes are read per row)
//   w4        rows left; two rows per pass, loop continues while > 0
//   w5        weight of the right-hand neighbour
function ff_put_vp8_bilin4_h_neon, export=1
        mov             w7,      #8
        dup             v0.8b,   w5                 // weight for p[x + 1]
        sub             w5,      w7,     w5
        dup             v1.8b,   w5                 // weight for p[x]
1:
        subs            w4,      w4,     #2
        ld1             {v2.8b}, [x2],   x3
        // Shift the row left by one byte. Only bytes 0..3 of the result are
        // used (trn1 below keeps lane 0), so the byte shifted in at position 7
        // is irrelevant; take it from the row itself, as
        // ff_put_vp8_bilin4_hv_neon does, instead of from a register that has
        // not been written yet.
        ext             v3.8b,   v2.8b,  v2.8b,  #1
        ld1             {v6.8b}, [x2],   x3
        ext             v7.8b,   v6.8b,  v6.8b,  #1
        trn1            v2.2s,   v2.2s,  v6.2s      // {row 0, row 1}
        trn1            v3.2s,   v3.2s,  v7.2s      // {row 0 + 1 px, row 1 + 1 px}
        umull           v4.8h,   v2.8b,  v1.8b
        umlal           v4.8h,   v3.8b,  v0.8b
        rshrn           v4.8b,   v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        b.gt            1b                          // flags still come from the subs above

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin4_v_neon: 4-pixel-wide vertical bilinear filter.
// Each output byte is ((8 - w6) * p[y] + w6 * p[y + 1] + 4) >> 3.
// Two output rows are computed per pass from one 8-byte multiply.
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step
//   w4        rows left; two rows per pass, loop continues while > 0
//   w6        weight of the row below
function ff_put_vp8_bilin4_v_neon, export=1
        ld1r            {v2.2s},    [x2], x3        // row y in both lanes
        dup             v0.8b,  w6                  // weight for the lower row
        mov             w7,     #8
        sub             w6,     w7,  w6
        dup             v1.8b,  w6                  // weight for the upper row
1:
        ld1r            {v3.2s},   [x2]             // row y + 1 in both lanes (no advance)
        ld1             {v2.s}[1], [x2], x3         // v2 = {row y,     row y + 1}
        ld1             {v3.s}[1], [x2], x3         // v3 = {row y + 1, row y + 2}
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        trn2            v2.2s,  v3.2s,  v2.2s       // lane 0 <- row y + 2 for the next pass
        subs            w4,     w4,     #2
        b.gt            1b

        ret
endfunc
 | 
						|
 | 
						|
// ff_put_vp8_bilin4_hv_neon: 4-pixel-wide bilinear filter, horizontal then
// vertical. Rows are filtered horizontally two at a time (weights 8 - mx, mx,
// rounding shift by 3), then blended with the row above (weights 8 - my, my,
// rounding shift by 3).
//   x0 / x1   destination pointer / per-row step
//   x2 / x3   source pointer / per-row step (8 bytes are read per row)
//   w4        rows left; two rows per pass, loop continues while > 0
//   w5, w6    mx, my
function ff_put_vp8_bilin4_hv_neon, export=1
        dup             v0.8b,   w5             // mx
        dup             v2.8b,   w6             // my
        mov             w7,      #8
        sub             w5,      w7,     w5
        sub             w6,      w7,     w6
        dup             v1.8b,   w5             // 8 - mx
        dup             v3.8b,   w6             // 8 - my

        // Horizontal pass over the first row; lane 0 of v22 holds the result.
        ld1             {v4.8b}, [x2],   x3
        ext             v5.8b,   v4.8b,  v4.8b,  #1
        umull           v18.8h,  v4.8b,  v1.8b
        umlal           v18.8h,  v5.8b,  v0.8b
        rshrn           v22.8b,  v18.8h, #3
1:
        ld1             {v6.8b}, [x2],   x3
        ext             v7.8b,   v6.8b,  v6.8b,  #1
        ld1             {v4.8b}, [x2],   x3
        ext             v5.8b,   v4.8b,  v4.8b,  #1
        trn1            v6.2s,   v6.2s,  v4.2s      // {row y + 1, row y + 2}
        trn1            v7.2s,   v7.2s,  v5.2s      // same rows, one pixel to the right
        umull           v16.8h,  v6.8b,  v1.8b
        umlal           v16.8h,  v7.8b,  v0.8b
        rshrn           v16.8b,  v16.8h, #3         // horizontally filtered rows y + 1, y + 2
        umull           v20.8h,  v16.8b, v2.8b      // lower rows weighted by my
        trn1            v22.2s,  v22.2s, v16.2s     // {filtered row y, filtered row y + 1}
        umlal           v20.8h,  v22.8b, v3.8b      // upper rows weighted by 8 - my
        rev64           v22.2s,  v16.2s             // lane 0 <- filtered row y + 2 for the next pass
        rshrn           v20.8b,  v20.8h, #3
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x0], x1
        subs            w4,      w4,     #2
        b.gt            1b

        ret
endfunc
 |