early-access version 1432
externals/ffmpeg/libavcodec/aarch64/Makefile (vendored, executable file, 63 lines added)
@@ -0,0 +1,63 @@
# subsystems
OBJS-$(CONFIG_FFT)                      += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT)               += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o

# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
                                           aarch64/sbrdsp_init_aarch64.o
OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
OBJS-$(CONFIG_OPUS_DECODER)             += aarch64/opusdsp_init.o
OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_10bpp_aarch64.o \
                                           aarch64/vp9dsp_init_12bpp_aarch64.o \
                                           aarch64/vp9mc_aarch64.o \
                                           aarch64/vp9dsp_init_aarch64.o

# ARMv8 optimizations

# subsystems
ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o

# NEON optimizations

# subsystems
NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT)          += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o \
                                           aarch64/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED)            += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o \
                                           aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o

# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o \
                                           aarch64/vp9itxfm_neon.o \
                                           aarch64/vp9lpf_16bpp_neon.o \
                                           aarch64/vp9lpf_neon.o \
                                           aarch64/vp9mc_16bpp_neon.o \
                                           aarch64/vp9mc_neon.o
externals/ffmpeg/libavcodec/aarch64/aacpsdsp_init_aarch64.c (vendored, executable file, 48 lines added)
@@ -0,0 +1,48 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/aarch64/cpu.h"
#include "libavcodec/aacpsdsp.h"

void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
                                float *src1, int n);
void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
                                const float (*filter)[8][2],
                                ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
                                   float h[2][4], float h_step[2][4],
                                   int len);
void ff_ps_stereo_interpolate_ipdopd_neon(float (*l)[2], float (*r)[2],
                                          float h[2][4], float h_step[2][4],
                                          int len);

av_cold void ff_psdsp_init_aarch64(PSDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->add_squares            = ff_ps_add_squares_neon;
        s->mul_pair_single        = ff_ps_mul_pair_single_neon;
        s->hybrid_analysis        = ff_ps_hybrid_analysis_neon;
        s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_neon;
        s->stereo_interpolate[1]  = ff_ps_stereo_interpolate_ipdopd_neon;
    }
}
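This init file is an instance of FFmpeg's standard runtime-dispatch pattern: probe the CPU once, then publish the fastest routine through function pointers so per-sample calls pay no feature check. A minimal self-contained C sketch of the idea (all names here are illustrative, not FFmpeg API; the "neon" function stands in for the assembly):

    typedef struct MyDSPContext {
        void (*vector_acc)(float *dst, const float *src, int n);
    } MyDSPContext;

    static void vector_acc_c(float *dst, const float *src, int n)
    {
        for (int i = 0; i < n; i++)      /* portable fallback */
            dst[i] += src[i];
    }

    static void vector_acc_neon(float *dst, const float *src, int n)
    {
        for (int i = 0; i < n; i++)      /* stand-in for the SIMD version */
            dst[i] += src[i];
    }

    void my_dsp_init(MyDSPContext *s, int cpu_has_neon)
    {
        s->vector_acc = vector_acc_c;    /* always-correct default */
        if (cpu_has_neon)
            s->vector_acc = vector_acc_neon;
    }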
externals/ffmpeg/libavcodec/aarch64/aacpsdsp_neon.S (vendored, executable file, 148 lines added)
@@ -0,0 +1,148 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_ps_add_squares_neon, export=1
1:      ld1         {v0.4S,v1.4S}, [x1], #32
        fmul        v0.4S, v0.4S, v0.4S
        fmul        v1.4S, v1.4S, v1.4S
        faddp       v2.4S, v0.4S, v1.4S
        ld1         {v3.4S}, [x0]
        fadd        v3.4S, v3.4S, v2.4S
        st1         {v3.4S}, [x0], #16
        subs        w2, w2, #4
        b.gt        1b
        ret
endfunc

function ff_ps_mul_pair_single_neon, export=1
1:      ld1         {v0.4S,v1.4S}, [x1], #32
        ld1         {v2.4S},       [x2], #16
        zip1        v3.4S, v2.4S, v2.4S
        zip2        v4.4S, v2.4S, v2.4S
        fmul        v0.4S, v0.4S, v3.4S
        fmul        v1.4S, v1.4S, v4.4S
        st1         {v0.4S,v1.4S}, [x0], #32
        subs        w3, w3, #4
        b.gt        1b
        ret
endfunc

function ff_ps_stereo_interpolate_neon, export=1
        ld1         {v0.4S}, [x2]
        ld1         {v1.4S}, [x3]
        zip1        v4.4S, v0.4S, v0.4S
        zip2        v5.4S, v0.4S, v0.4S
        zip1        v6.4S, v1.4S, v1.4S
        zip2        v7.4S, v1.4S, v1.4S
1:      ld1         {v2.2S}, [x0]
        ld1         {v3.2S}, [x1]
        fadd        v4.4S, v4.4S, v6.4S
        fadd        v5.4S, v5.4S, v7.4S
        mov         v2.D[1], v2.D[0]
        mov         v3.D[1], v3.D[0]
        fmul        v2.4S, v2.4S, v4.4S
        fmla        v2.4S, v3.4S, v5.4S
        st1         {v2.D}[0], [x0], #8
        st1         {v2.D}[1], [x1], #8
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc

function ff_ps_stereo_interpolate_ipdopd_neon, export=1
        ld1         {v0.4S,v1.4S}, [x2]
        ld1         {v6.4S,v7.4S}, [x3]
        fneg        v2.4S, v1.4S
        fneg        v3.4S, v7.4S
        zip1        v16.4S, v0.4S, v0.4S
        zip2        v17.4S, v0.4S, v0.4S
        zip1        v18.4S, v2.4S, v1.4S
        zip2        v19.4S, v2.4S, v1.4S
        zip1        v20.4S, v6.4S, v6.4S
        zip2        v21.4S, v6.4S, v6.4S
        zip1        v22.4S, v3.4S, v7.4S
        zip2        v23.4S, v3.4S, v7.4S
1:      ld1         {v2.2S}, [x0]
        ld1         {v3.2S}, [x1]
        fadd        v16.4S, v16.4S, v20.4S
        fadd        v17.4S, v17.4S, v21.4S
        mov         v2.D[1], v2.D[0]
        mov         v3.D[1], v3.D[0]
        fmul        v4.4S, v2.4S, v16.4S
        fmla        v4.4S, v3.4S, v17.4S
        fadd        v18.4S, v18.4S, v22.4S
        fadd        v19.4S, v19.4S, v23.4S
        ext         v2.16B, v2.16B, v2.16B, #4
        ext         v3.16B, v3.16B, v3.16B, #4
        fmla        v4.4S, v2.4S, v18.4S
        fmla        v4.4S, v3.4S, v19.4S
        st1         {v4.D}[0], [x0], #8
        st1         {v4.D}[1], [x1], #8
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc

function ff_ps_hybrid_analysis_neon, export=1
        lsl         x3, x3, #3
        ld2         {v0.4S,v1.4S}, [x1], #32
        ld2         {v2.2S,v3.2S}, [x1], #16
        ld1         {v24.2S},      [x1], #8
        ld2         {v4.2S,v5.2S}, [x1], #16
        ld2         {v6.4S,v7.4S}, [x1]
        rev64       v6.4S, v6.4S
        rev64       v7.4S, v7.4S
        ext         v6.16B, v6.16B, v6.16B, #8
        ext         v7.16B, v7.16B, v7.16B, #8
        rev64       v4.2S, v4.2S
        rev64       v5.2S, v5.2S
        mov         v2.D[1], v3.D[0]
        mov         v4.D[1], v5.D[0]
        mov         v5.D[1], v2.D[0]
        mov         v3.D[1], v4.D[0]
        fadd        v16.4S, v0.4S, v6.4S
        fadd        v17.4S, v1.4S, v7.4S
        fsub        v18.4S, v1.4S, v7.4S
        fsub        v19.4S, v0.4S, v6.4S
        fadd        v22.4S, v2.4S, v4.4S
        fsub        v23.4S, v5.4S, v3.4S
        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
1:      ld2         {v2.4S,v3.4S}, [x2], #32
        ld2         {v4.2S,v5.2S}, [x2], #16
        ld1         {v6.2S},       [x2], #8
        add         x2, x2, #8
        mov         v4.D[1], v5.D[0]
        mov         v6.S[1], v6.S[0]
        fmul        v6.2S, v6.2S, v24.2S
        fmul        v0.4S, v2.4S, v16.4S
        fmul        v1.4S, v2.4S, v17.4S
        fmls        v0.4S, v3.4S, v18.4S
        fmla        v1.4S, v3.4S, v19.4S
        fmla        v0.4S, v4.4S, v20.4S
        fmla        v1.4S, v4.4S, v21.4S
        faddp       v0.4S, v0.4S, v1.4S
        faddp       v0.4S, v0.4S, v0.4S
        fadd        v0.2S, v0.2S, v6.2S
        st1         {v0.2S}, [x0], x3
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc
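The NEON above is easier to read against a scalar statement of what each routine computes. A hedged plain-C reference for the first two functions, with semantics inferred from the vector code (add_squares accumulates squared magnitudes four complex samples at a time via fmul/faddp; mul_pair_single scales each complex sample by a real coefficient via the zip1/zip2 duplication):

    static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
    }

    static void ps_mul_pair_single_ref(float (*dst)[2], float (*src0)[2],
                                       float *src1, int n)
    {
        for (int i = 0; i < n; i++) {
            dst[i][0] = src0[i][0] * src1[i];   /* real part  */
            dst[i][1] = src0[i][1] * src1[i];   /* imag part  */
        }
    }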
externals/ffmpeg/libavcodec/aarch64/asm-offsets.h (vendored, executable file, 25 lines added)
@@ -0,0 +1,25 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_ASM_OFFSETS_H
#define AVCODEC_AARCH64_ASM_OFFSETS_H

/* FFTContext */
#define IMDCT_HALF                      0x48

#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
externals/ffmpeg/libavcodec/aarch64/cabac.h (vendored, executable file, 104 lines added)
@@ -0,0 +1,104 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_CABAC_H
#define AVCODEC_AARCH64_CABAC_H

#include "config.h"
#if HAVE_INLINE_ASM

#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"

#define get_cabac_inline get_cabac_inline_aarch64
static av_always_inline int get_cabac_inline_aarch64(CABACContext *c,
                                                     uint8_t *const state)
{
    int bit;
    void *reg_a, *reg_b, *reg_c, *tmp;

    __asm__ volatile(
        "ldrb  %w[bit]       , [%[state]]               \n\t"
        "add   %[r_b]        , %[tables]   , %[lps_off] \n\t"
        "mov   %w[tmp]       , %w[range]                \n\t"
        "and   %w[range]     , %w[range]   , #0xC0      \n\t"
        "lsl   %w[r_c]       , %w[range]   , #1         \n\t"
        "add   %[r_b]        , %[r_b]      , %w[bit], UXTW \n\t"
        "ldrb  %w[range]     , [%[r_b], %w[r_c], SXTW]  \n\t"
        "sub   %w[r_c]       , %w[tmp]     , %w[range]  \n\t"
        "lsl   %w[tmp]       , %w[r_c]     , #17        \n\t"
        "cmp   %w[tmp]       , %w[low]                  \n\t"
        "csel  %w[tmp]       , %w[tmp]     , wzr  , cc  \n\t"
        "csel  %w[range]     , %w[r_c]     , %w[range], gt \n\t"
        "cinv  %w[bit]       , %w[bit]     , cc         \n\t"
        "sub   %w[low]       , %w[low]     , %w[tmp]    \n\t"
        "add   %[r_b]        , %[tables]   , %[norm_off] \n\t"
        "add   %[r_a]        , %[tables]   , %[mlps_off] \n\t"
        "ldrb  %w[tmp]       , [%[r_b], %w[range], SXTW] \n\t"
        "ldrb  %w[r_a]       , [%[r_a], %w[bit], SXTW]  \n\t"
        "lsl   %w[low]       , %w[low]     , %w[tmp]    \n\t"
        "lsl   %w[range]     , %w[range]   , %w[tmp]    \n\t"
        "uxth  %w[r_c]       , %w[low]                  \n\t"
        "strb  %w[r_a]       , [%[state]]               \n\t"
        "cbnz  %w[r_c]       , 2f                       \n\t"
        "ldr   %[r_c]        , [%[c], %[byte]]          \n\t"
        "ldr   %[r_a]        , [%[c], %[end]]           \n\t"
        "ldrh  %w[tmp]       , [%[r_c]]                 \n\t"
        "cmp   %[r_c]        , %[r_a]                   \n\t"
        "b.ge  1f                                       \n\t"
        "add   %[r_a]        , %[r_c]      , #2         \n\t"
        "str   %[r_a]        , [%[c], %[byte]]          \n\t"
        "1:                                             \n\t"
        "sub   %w[r_c]       , %w[low]     , #1         \n\t"
        "eor   %w[r_c]       , %w[r_c]     , %w[low]    \n\t"
        "rev   %w[tmp]       , %w[tmp]                  \n\t"
        "lsr   %w[r_c]       , %w[r_c]     , #15        \n\t"
        "lsr   %w[tmp]       , %w[tmp]     , #15        \n\t"
        "ldrb  %w[r_c]       , [%[r_b], %w[r_c], SXTW]  \n\t"
        "mov   %w[r_b]       , #0xFFFF                  \n\t"
        "mov   %w[r_a]       , #7                       \n\t"
        "sub   %w[tmp]       , %w[tmp]     , %w[r_b]    \n\t"
        "sub   %w[r_c]       , %w[r_a]     , %w[r_c]    \n\t"
        "lsl   %w[tmp]       , %w[tmp]     , %w[r_c]    \n\t"
        "add   %w[low]       , %w[low]     , %w[tmp]    \n\t"
        "2:                                             \n\t"
        :    [bit]"=&r"(bit),
             [low]"+&r"(c->low),
             [range]"+&r"(c->range),
             [r_a]"=&r"(reg_a),
             [r_b]"=&r"(reg_b),
             [r_c]"=&r"(reg_c),
             [tmp]"=&r"(tmp)
        :    [c]"r"(c),
             [state]"r"(state),
             [tables]"r"(ff_h264_cabac_tables),
             [byte]"i"(offsetof(CABACContext, bytestream)),
             [end]"i"(offsetof(CABACContext, bytestream_end)),
             [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
             [lps_off]"I"(H264_LPS_RANGE_OFFSET),
             [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
        :    "memory", "cc"
    );

    return bit & 1;
}

#endif /* HAVE_INLINE_ASM */

#endif /* AVCODEC_AARCH64_CABAC_H */
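The line `#define get_cabac_inline get_cabac_inline_aarch64` is what wires this header in: the generic CABAC code defines a portable fallback only if no architecture header has already claimed the name. A self-contained sketch of that override mechanism, with stand-in decoder bodies (illustrative names, not the real CABAC arithmetic):

    /* arch header, included first, claims the name */
    static inline int get_cabac_inline_fast(unsigned char *state)
    {
        return *state & 1;              /* stand-in for the asm version */
    }
    #define get_cabac_inline get_cabac_inline_fast

    /* generic header: fallback is defined only when nothing claimed it */
    #ifndef get_cabac_inline
    static inline int get_cabac_inline_c(unsigned char *state)
    {
        return *state & 1;              /* stand-in for the C version */
    }
    #define get_cabac_inline get_cabac_inline_c
    #endif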
externals/ffmpeg/libavcodec/aarch64/fft_init_aarch64.c (vendored, executable file, 50 lines added)
@@ -0,0 +1,50 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"

#include "libavcodec/fft.h"

void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);

void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);

av_cold void ff_fft_init_aarch64(FFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->fft_permute      = ff_fft_permute_neon;
        s->fft_calc         = ff_fft_calc_neon;
#if CONFIG_MDCT
        s->imdct_calc       = ff_imdct_calc_neon;
        s->imdct_half       = ff_imdct_half_neon;
        s->mdct_calc        = ff_mdct_calc_neon;
        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
    }
}
externals/ffmpeg/libavcodec/aarch64/fft_neon.S (vendored, executable file, 442 lines added)
@@ -0,0 +1,442 @@
/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define M_SQRT1_2 0.70710678118654752440

.macro  transpose d0, d1, s0, s1
        trn1        \d0, \s0, \s1
        trn2        \d1, \s0, \s1
.endm


function fft4_neon
        ld1         {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        fadd        v4.2s, v0.2s, v1.2s         // r0+r1,i0+i1
        fsub        v6.2s, v0.2s, v1.2s         // r0-r1,i0-i1

        ext         v16.8b, v2.8b, v3.8b, #4
        ext         v17.8b, v3.8b, v2.8b, #4

        fadd        v5.2s, v2.2s, v3.2s         // i2+i3,r2+r3
        fsub        v7.2s, v16.2s, v17.2s       // r3-r2,i2-i3

        fadd        v0.2s, v4.2s, v5.2s
        fsub        v2.2s, v4.2s, v5.2s
        fadd        v1.2s, v6.2s, v7.2s
        fsub        v3.2s, v6.2s, v7.2s

        st1         {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

function fft8_neon
        mov         x1, x0
        ld1         {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1         {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        ext         v22.8b, v2.8b, v3.8b, #4
        ext         v23.8b, v3.8b, v2.8b, #4
        fadd        v4.2s, v16.2s, v17.2s       // r4+r5,i4+i5
        fadd        v5.2s, v18.2s, v19.2s       // r6+r7,i6+i7
        fsub        v17.2s, v16.2s, v17.2s      // r4-r5,i4-i5
        fsub        v19.2s, v18.2s, v19.2s      // r6-r7,i6-i7
        rev64       v27.2s, v28.2s              // ???
        fadd        v20.2s, v0.2s, v1.2s        // r0+r1,i0+i1
        fadd        v21.2s, v2.2s, v3.2s        // r2+r3,i2+i3
        fmul        v26.2s, v17.2s, v28.2s      // -a2r*w,a2i*w
        ext         v6.8b, v4.8b, v5.8b, #4
        ext         v7.8b, v5.8b, v4.8b, #4
        fmul        v27.2s, v19.2s, v27.2s      // a3r*w,-a3i*w
        fsub        v23.2s, v22.2s, v23.2s      // i2-i3,r3-r2
        fsub        v22.2s, v0.2s, v1.2s        // r0-r1,i0-i1
        fmul        v24.2s, v17.2s, v28.s[1]    // a2r*w,a2i*w
        fmul        v25.2s, v19.2s, v28.s[1]    // a3r*w,a3i*w
        fadd        v0.2s, v20.2s, v21.2s
        fsub        v2.2s, v20.2s, v21.2s
        fadd        v1.2s, v22.2s, v23.2s
        rev64       v26.2s, v26.2s
        rev64       v27.2s, v27.2s
        fsub        v3.2s, v22.2s, v23.2s
        fsub        v6.2s, v6.2s, v7.2s
        fadd        v24.2s, v24.2s, v26.2s      // a2r+a2i,a2i-a2r t1,t2
        fadd        v25.2s, v25.2s, v27.2s      // a3r-a3i,a3i+a3r t5,t6
        fadd        v7.2s, v4.2s, v5.2s
        fsub        v18.2s, v2.2s, v6.2s
        ext         v26.8b, v24.8b, v25.8b, #4
        ext         v27.8b, v25.8b, v24.8b, #4
        fadd        v2.2s, v2.2s, v6.2s
        fsub        v16.2s, v0.2s, v7.2s
        fadd        v5.2s, v25.2s, v24.2s
        fsub        v4.2s, v26.2s, v27.2s
        fadd        v0.2s, v0.2s, v7.2s
        fsub        v17.2s, v1.2s, v5.2s
        fsub        v19.2s, v3.2s, v4.2s
        fadd        v3.2s, v3.2s, v4.2s
        fadd        v1.2s, v1.2s, v5.2s

        st1         {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        st1         {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]

        ret
endfunc

function fft16_neon
        mov         x1, x0
        ld1         {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1         {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
        ext         v22.8b, v2.8b, v3.8b, #4
        ext         v23.8b, v3.8b, v2.8b, #4
        fadd        v4.2s, v16.2s, v17.2s       // r4+r5,i4+i5
        fadd        v5.2s, v18.2s, v19.2s       // r6+r7,i6+i7
        fsub        v17.2s, v16.2s, v17.2s      // r4-r5,i4-i5
        fsub        v19.2s, v18.2s, v19.2s      // r6-r7,i6-i7
        rev64       v27.2s, v28.2s              // ???
        fadd        v20.2s, v0.2s, v1.2s        // r0+r1,i0+i1
        fadd        v21.2s, v2.2s, v3.2s        // r2+r3,i2+i3
        fmul        v26.2s, v17.2s, v28.2s      // -a2r*w,a2i*w
        ext         v6.8b, v4.8b, v5.8b, #4
        ext         v7.8b, v5.8b, v4.8b, #4
        fmul        v27.2s, v19.2s, v27.2s      // a3r*w,-a3i*w
        fsub        v23.2s, v22.2s, v23.2s      // i2-i3,r3-r2
        fsub        v22.2s, v0.2s, v1.2s        // r0-r1,i0-i1
        fmul        v24.2s, v17.2s, v28.s[1]    // a2r*w,a2i*w
        fmul        v25.2s, v19.2s, v28.s[1]    // a3r*w,a3i*w
        fadd        v0.2s, v20.2s, v21.2s
        fsub        v2.2s, v20.2s, v21.2s
        fadd        v1.2s, v22.2s, v23.2s
        rev64       v26.2s, v26.2s
        rev64       v27.2s, v27.2s
        fsub        v3.2s, v22.2s, v23.2s
        fsub        v6.2s, v6.2s, v7.2s
        fadd        v24.2s, v24.2s, v26.2s      // a2r+a2i,a2i-a2r t1,t2
        fadd        v25.2s, v25.2s, v27.2s      // a3r-a3i,a3i+a3r t5,t6
        fadd        v7.2s, v4.2s, v5.2s
        fsub        v18.2s, v2.2s, v6.2s
        ld1         {v20.4s,v21.4s}, [x0], #32
        ld1         {v22.4s,v23.4s}, [x0], #32
        ext         v26.8b, v24.8b, v25.8b, #4
        ext         v27.8b, v25.8b, v24.8b, #4
        fadd        v2.2s, v2.2s, v6.2s
        fsub        v16.2s, v0.2s, v7.2s
        fadd        v5.2s, v25.2s, v24.2s
        fsub        v4.2s, v26.2s, v27.2s
        transpose   v24.2d, v25.2d, v20.2d, v22.2d
        transpose   v26.2d, v27.2d, v21.2d, v23.2d
        fadd        v0.2s, v0.2s, v7.2s
        fsub        v17.2s, v1.2s, v5.2s
        fsub        v19.2s, v3.2s, v4.2s
        fadd        v3.2s, v3.2s, v4.2s
        fadd        v1.2s, v1.2s, v5.2s
        ext         v20.16b, v21.16b, v21.16b, #4
        ext         v21.16b, v23.16b, v23.16b, #4

        zip1        v0.2d, v0.2d, v1.2d         // {z[0],   z[1]}
        zip1        v1.2d, v2.2d, v3.2d         // {z[2],   z[3]}
        zip1        v2.2d, v16.2d, v17.2d       // {z[o1],  z[o1+1]}
        zip1        v3.2d, v18.2d, v19.2d       // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose   v22.2d, v23.2d, v20.2d, v21.2d

        fadd        v4.4s, v24.4s, v25.4s
        fadd        v5.4s, v26.4s, v27.4s
        fsub        v6.4s, v24.4s, v25.4s
        fsub        v7.4s, v22.4s, v23.4s

        ld1         {v23.4s}, [x14]

        fadd        v24.4s, v4.4s, v5.4s        // {z[o2+0],z[o2+1]}
        fsub        v26.4s, v4.4s, v5.4s        // {z[o2+2],z[o2+3]}
        fadd        v25.4s, v6.4s, v7.4s        // {z[o3+0],z[o3+1]}
        fsub        v27.4s, v6.4s, v7.4s        // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64       v7.4s, v25.4s
        fmul        v25.4s, v25.4s, v23.s[1]
        fmul        v7.4s, v7.4s, v29.4s
        fmla        v25.4s, v7.4s, v23.s[3]     // {t1a,t2a,t5a,t6a}

        zip1        v20.4s, v24.4s, v25.4s
        zip2        v21.4s, v24.4s, v25.4s
        fneg        v22.4s, v20.4s
        fadd        v4.4s, v21.4s, v20.4s
        fsub        v6.4s, v20.4s, v21.4s       // just the second half
        fadd        v5.4s, v21.4s, v22.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fsub        v20.4s, v0.4s, v4.4s        // {z[o2],z[o2+1]}
        fadd        v16.4s, v0.4s, v4.4s        // {z[0], z[1]}
        fsub        v22.4s, v2.4s, v5.4s        // {z[o3],z[o3+1]}
        fadd        v18.4s, v2.4s, v5.4s        // {z[o1],z[o1+1]}

        //second half
        rev64       v6.4s, v26.4s
        fmul        v26.4s, v26.4s, v23.s[2]
        rev64       v7.4s, v27.4s
        fmul        v27.4s, v27.4s, v23.s[3]
        fmul        v6.4s, v6.4s, v29.4s
        fmul        v7.4s, v7.4s, v29.4s
        fmla        v26.4s, v6.4s, v23.s[2]     // {t1,t2,t5,t6}
        fmla        v27.4s, v7.4s, v23.s[1]     // {t1a,t2a,t5a,t6a}

        zip1        v24.4s, v26.4s, v27.4s
        zip2        v25.4s, v26.4s, v27.4s
        fneg        v26.4s, v24.4s
        fadd        v4.4s, v25.4s, v24.4s
        fsub        v6.4s, v24.4s, v25.4s       // just the second half
        fadd        v5.4s, v25.4s, v26.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fadd        v17.4s, v1.4s, v4.4s        // {z[2],   z[3]}
        fsub        v21.4s, v1.4s, v4.4s        // {z[o2+2],z[o2+3]}
        fadd        v19.4s, v3.4s, v5.4s        // {z[o1+2],z[o1+3]}
        fsub        v23.4s, v3.4s, v5.4s        // {z[o3+2],z[o3+3]}

        st1         {v16.4s,v17.4s}, [x1], #32
        st1         {v18.4s,v19.4s}, [x1], #32
        st1         {v20.4s,v21.4s}, [x1], #32
        st1         {v22.4s,v23.4s}, [x1], #32

        ret
endfunc


const   trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

const   trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

function fft_pass_neon
        sub         x6, x2, #1                  // n - 1, loop counter
        lsl         x5, x2, #3                  // 2 * n * sizeof FFTSample
        lsl         x1, x2, #4                  // 2 * n * sizeof FFTComplex
        add         x5, x4, x5                  // wim
        add         x3, x1, x2, lsl #5          // 4 * n * sizeof FFTComplex
        add         x2, x0, x2, lsl #5          // &z[o2]
        add         x3, x0, x3                  // &z[o3]
        add         x1, x0, x1                  // &z[o1]
        ld1         {v20.4s}, [x2]              // {z[o2],z[o2+1]}
        ld1         {v22.4s}, [x3]              // {z[o3],z[o3+1]}
        ld1         {v4.2s},  [x4], #8          // {wre[0],wre[1]}
        trn2        v25.2d, v20.2d, v22.2d
        sub         x5, x5, #4                  // wim--
        trn1        v24.2d, v20.2d, v22.2d
        ld1         {v5.s}[0], [x5], x7         // d5[0] = wim[-1]
        rev64       v7.4s, v25.4s
        fmul        v25.4s, v25.4s, v4.s[1]
        ld1         {v16.4s}, [x0]              // {z[0],z[1]}
        fmul        v7.4s, v7.4s, v29.4s
        ld1         {v17.4s}, [x1]              // {z[o1],z[o1+1]}
        prfm        pldl1keep, [x2, #16]
        prfm        pldl1keep, [x3, #16]
        fmla        v25.4s, v7.4s, v5.s[0]      // {t1a,t2a,t5a,t6a}
        prfm        pldl1keep, [x0, #16]
        prfm        pldl1keep, [x1, #16]

        zip1        v20.4s, v24.4s, v25.4s
        zip2        v21.4s, v24.4s, v25.4s
        fneg        v22.4s, v20.4s
        fadd        v4.4s, v21.4s, v20.4s
        fsub        v6.4s, v20.4s, v21.4s       // just the second half
        fadd        v5.4s, v21.4s, v22.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fadd        v20.4s, v16.4s, v4.4s
        fsub        v22.4s, v16.4s, v4.4s
        fadd        v21.4s, v17.4s, v5.4s
        st1         {v20.4s}, [x0], #16         // {z[0], z[1]}
        fsub        v23.4s, v17.4s, v5.4s

        st1         {v21.4s}, [x1], #16         // {z[o1],z[o1+1]}
        st1         {v22.4s}, [x2], #16         // {z[o2],z[o2+1]}
        st1         {v23.4s}, [x3], #16         // {z[o3],z[o3+1]}
1:
        ld1         {v20.4s}, [x2]              // {z[o2],z[o2+1]}
        ld1         {v22.4s}, [x3]              // {z[o3],z[o3+1]}
        ld1         {v4.2s},  [x4], #8          // {wre[0],wre[1]}
        transpose   v26.2d, v27.2d, v20.2d, v22.2d
        ld1         {v5.2s},  [x5], x7          // {wim[-1],wim[0]}
        rev64       v6.4s, v26.4s
        fmul        v26.4s, v26.4s, v4.s[0]
        rev64       v7.4s, v27.4s
        fmul        v27.4s, v27.4s, v4.s[1]
        fmul        v6.4s, v6.4s, v29.4s
        fmul        v7.4s, v7.4s, v29.4s
        ld1         {v16.4s}, [x0]              // {z[0],z[1]}
        fmla        v26.4s, v6.4s, v5.s[1]      // {t1,t2,t5,t6}
        fmla        v27.4s, v7.4s, v5.s[0]      // {t1a,t2a,t5a,t6a}
        ld1         {v17.4s}, [x1]              // {z[o1],z[o1+1]}

        subs        x6, x6, #1                  // n--

        zip1        v20.4s, v26.4s, v27.4s
        zip2        v21.4s, v26.4s, v27.4s
        fneg        v22.4s, v20.4s
        fadd        v4.4s, v21.4s, v20.4s
        fsub        v6.4s, v20.4s, v21.4s       // just the second half
        fadd        v5.4s, v21.4s, v22.4s       // just the first half

        tbl         v4.16b, {v4.16b}, v30.16b           // trans4_float
        tbl         v5.16b, {v5.16b,v6.16b}, v31.16b    // trans8_float

        fadd        v20.4s, v16.4s, v4.4s
        fsub        v22.4s, v16.4s, v4.4s
        fadd        v21.4s, v17.4s, v5.4s
        st1         {v20.4s}, [x0], #16         // {z[0], z[1]}
        fsub        v23.4s, v17.4s, v5.4s

        st1         {v21.4s}, [x1], #16         // {z[o1],z[o1+1]}
        st1         {v22.4s}, [x2], #16         // {z[o2],z[o2+1]}
        st1         {v23.4s}, [x3], #16         // {z[o3],z[o3+1]}
        b.ne        1b

        ret
endfunc

.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        sub         sp, sp, #16
        stp         x28, x30, [sp]
        add         x28, x0, #\n4*2*8
        bl          fft\n2\()_neon
        mov         x0, x28
        bl          fft\n4\()_neon
        add         x0, x28, #\n4*1*8
        bl          fft\n4\()_neon
        sub         x0, x28, #\n4*2*8
        ldp         x28, x30, [sp], #16
        movrel      x4, X(ff_cos_\n)
        mov         x2, #\n4>>1
        b           fft_pass_neon
endfunc
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

function ff_fft_calc_neon, export=1
        prfm        pldl1keep, [x1]
        movrel      x10, trans4_float
        ldr         w2, [x0]
        movrel      x11, trans8_float
        sub         w2, w2, #2
        movrel      x3, fft_tab_neon
        ld1         {v30.16b}, [x10]
        mov         x7, #-8
        movrel      x12, pmmp
        ldr         x3, [x3, x2, lsl #3]
        movrel      x13, mppm
        movrel      x14, X(ff_cos_16)
        ld1         {v31.16b}, [x11]
        mov         x0, x1
        ld1         {v29.4s},  [x12]            // pmmp
        ld1         {v28.4s},  [x13]
        br          x3
endfunc

function ff_fft_permute_neon, export=1
        mov         x6, #1
        ldr         w2, [x0]                    // nbits
        ldr         x3, [x0, #16]               // tmp_buf
        ldr         x0, [x0, #8]                // revtab
        lsl         x6, x6, x2
        mov         x2, x6
1:
        ld1         {v0.2s,v1.2s}, [x1], #16
        ldr         w4, [x0], #4
        uxth        w5, w4
        lsr         w4, w4, #16
        add         x5, x3, x5, lsl #3
        add         x4, x3, x4, lsl #3
        st1         {v0.2s}, [x5]
        st1         {v1.2s}, [x4]
        subs        x6, x6, #2
        b.gt        1b

        sub         x1, x1, x2, lsl #3
1:
        ld1         {v0.4s,v1.4s}, [x3], #32
        st1         {v0.4s,v1.4s}, [x1], #32
        subs        x2, x2, #4
        b.gt        1b

        ret
endfunc

const   fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

const   pmmp, align=4
        .float  +1.0, -1.0, -1.0, +1.0
endconst

const   mppm, align=4
        .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst
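ff_fft_calc_neon never branches on size at runtime inside the transform: it reads nbits from the start of the FFTContext, subtracts 2, indexes fft_tab_neon, and tail-jumps (`br x3`) into a fully specialized kernel generated by `def_fft`. A hypothetical C rendering of that dispatch, with stand-in kernel bodies:

    typedef void (*fft_kernel)(float *z);

    static void fft4(float *z) { /* stand-in 1 << 2 point kernel */ }
    static void fft8(float *z) { /* stand-in 1 << 3 point kernel */ }
    /* ... the real table continues through fft65536 */

    static const fft_kernel fft_tab[] = { fft4, fft8 };

    static void fft_calc(int nbits, float *z)
    {
        fft_tab[nbits - 2](z);  /* smallest kernel handles 4 points */
    }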
externals/ffmpeg/libavcodec/aarch64/fmtconvert_init.c (vendored, executable file, 43 lines added)
@@ -0,0 +1,43 @@
/*
 * ARM optimized Format Conversion Utils
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fmtconvert.h"

void ff_int32_to_float_fmul_array8_neon(FmtConvertContext *c, float *dst,
                                        const int32_t *src, const float *mul,
                                        int len);
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
                                        float mul, int len);

av_cold void ff_fmt_convert_init_aarch64(FmtConvertContext *c,
                                         AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_neon;
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
    }
}
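The two NEON routines registered here have simple scalar definitions, which the assembly in the next file processes eight floats per iteration. A plain-C reference (semantics as in FFmpeg's fmtconvert: convert int32 samples to float and scale; the array8 variant applies one multiplier per 8-sample group, with the FmtConvertContext argument omitted here for brevity):

    #include <stdint.h>

    static void int32_to_float_fmul_scalar_ref(float *dst, const int32_t *src,
                                               float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }

    static void int32_to_float_fmul_array8_ref(float *dst, const int32_t *src,
                                               const float *mul, int len)
    {
        for (int i = 0; i < len; i += 8)          /* one scale per group */
            int32_to_float_fmul_scalar_ref(dst + i, src + i, mul[i / 8], 8);
    }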
externals/ffmpeg/libavcodec/aarch64/fmtconvert_neon.S (vendored, executable file, 76 lines added)
@@ -0,0 +1,76 @@
/*
 * ARM NEON optimised Format Conversion Utils
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/aarch64/asm.S"

function ff_int32_to_float_fmul_scalar_neon, export=1
        ld1         {v1.4s,v2.4s}, [x1], #32
        scvtf       v1.4s, v1.4s
        scvtf       v2.4s, v2.4s
1:
        subs        w2, w2, #8
        fmul        v3.4s, v1.4s, v0.s[0]
        fmul        v4.4s, v2.4s, v0.s[0]
        b.le        2f
        ld1         {v1.4s,v2.4s}, [x1], #32
        st1         {v3.4s,v4.4s}, [x0], #32
        scvtf       v1.4s, v1.4s
        scvtf       v2.4s, v2.4s
        b           1b
2:
        st1         {v3.4s,v4.4s}, [x0]
        ret
endfunc

function ff_int32_to_float_fmul_array8_neon, export=1
        lsr         w4, w4, #3
        subs        w5, w4, #1
        b.eq        1f
2:
        ld1         {v0.4s,v1.4s}, [x2], #32
        ld1         {v2.4s,v3.4s}, [x2], #32
        scvtf       v0.4s, v0.4s
        scvtf       v1.4s, v1.4s
        ld1         {v16.2s}, [x3], #8
        scvtf       v2.4s, v2.4s
        scvtf       v3.4s, v3.4s
        fmul        v4.4s, v0.4s, v16.s[0]
        fmul        v5.4s, v1.4s, v16.s[0]
        fmul        v6.4s, v2.4s, v16.s[1]
        fmul        v7.4s, v3.4s, v16.s[1]
        st1         {v4.4s,v5.4s}, [x1], #32
        st1         {v6.4s,v7.4s}, [x1], #32
        subs        w5, w5, #2
        b.gt        2b
        b.eq        1f
        ret
1:
        ld1         {v0.4s,v1.4s}, [x2]
        ld1         {v16.s}[0], [x3]
        scvtf       v0.4s, v0.4s
        scvtf       v1.4s, v1.4s
        fmul        v4.4s, v0.4s, v16.s[0]
        fmul        v5.4s, v1.4s, v16.s[0]
        st1         {v4.4s,v5.4s}, [x1]
        ret
endfunc
externals/ffmpeg/libavcodec/aarch64/h264chroma_init_aarch64.c (vendored, executable file, 59 lines added)
@@ -0,0 +1,59 @@
/*
 * ARM NEON optimised H.264 chroma functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264chroma.h"

#include "config.h"

void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
{
    const int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && !high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;

        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
    }
}
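All six mc functions implement H.264's eighth-pel bilinear chroma interpolation: the fractional offsets x and y (0..7) become four corner weights, which the assembly in the next file keeps in w4/w12/w6/w7. A scalar reference of the standard formula (the width parameter is added here for illustration; the real functions fix it at 8, 4, or 2 per table slot):

    #include <stdint.h>
    #include <stddef.h>

    static void put_chroma_mc_ref(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int w, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B = x * (8 - y);
        const int C = (8 - x) * y;
        const int D = x * y;                    /* A + B + C + D == 64 */

        for (int j = 0; j < h; j++) {
            for (int i = 0; i < w; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          32) >> 6;             /* round, divide by 64 */
            dst += stride;
            src += stride;
        }
    }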
externals/ffmpeg/libavcodec/aarch64/h264cmc_neon.S (vendored, executable file, 450 lines added)
@@ -0,0 +1,450 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov         x8, x0
  .endif
        prfm        pldl1strm, [x1]
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel      x6, rv40bias
        lsr         w9, w5, #1
        lsr         w10, w4, #1
        lsl         w9, w9, #3
        lsl         w10, w10, #1
        add         w9, w9, w10
        add         x6, x6, w9, UXTW
        ld1r        {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi        v22.8H, #28
  .endif
        mul         w7, w4, w5
        lsl         w14, w5, #3
        lsl         w13, w4, #3
        cmp         w7, #0
        sub         w6, w14, w7
        sub         w12, w13, w7
        sub         w4, w7, w13
        sub         w4, w4, w14
        add         w4, w4, #64
        b.eq        2f

        dup         v0.8B, w4
        dup         v1.8B, w12
        ld1         {v4.8B, v5.8B}, [x1], x2
        dup         v2.8B, w6
        dup         v3.8B, w7
        ext         v5.8B, v4.8B, v5.8B, #1
1:      ld1         {v6.8B, v7.8B}, [x1], x2
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v5.8B, v1.8B
        ext         v7.8B, v6.8B, v7.8B, #1
        ld1         {v4.8B, v5.8B}, [x1], x2
        umlal       v16.8H, v6.8B, v2.8B
        prfm        pldl1strm, [x1]
        ext         v5.8B, v4.8B, v5.8B, #1
        umlal       v16.8H, v7.8B, v3.8B
        umull       v17.8H, v6.8B, v0.8B
        subs        w3, w3, #2
        umlal       v17.8H, v7.8B, v1.8B
        umlal       v17.8H, v4.8B, v2.8B
        umlal       v17.8H, v5.8B, v3.8B
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        1b
        ret

2:      adds        w12, w12, w6
        dup         v0.8B, w4
        b.eq        5f
        tst         w6, w6
        dup         v1.8B, w12
        b.eq        4f

        ld1         {v4.8B}, [x1], x2
3:      ld1         {v6.8B}, [x1], x2
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v6.8B, v1.8B
        ld1         {v4.8B}, [x1], x2
        umull       v17.8H, v6.8B, v0.8B
        umlal       v17.8H, v4.8B, v1.8B
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
        prfm        pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        subs        w3, w3, #2
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        3b
        ret

4:      ld1         {v4.8B, v5.8B}, [x1], x2
        ld1         {v6.8B, v7.8B}, [x1], x2
        ext         v5.8B, v4.8B, v5.8B, #1
        ext         v7.8B, v6.8B, v7.8B, #1
        prfm        pldl1strm, [x1]
        subs        w3, w3, #2
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v5.8B, v1.8B
        umull       v17.8H, v6.8B, v0.8B
        umlal       v17.8H, v7.8B, v1.8B
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        4b
        ret

5:      ld1         {v4.8B}, [x1], x2
        ld1         {v5.8B}, [x1], x2
        prfm        pldl1strm, [x1]
        subs        w3, w3, #2
        umull       v16.8H, v4.8B, v0.8B
        umull       v17.8H, v5.8B, v0.8B
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn       v16.8B, v16.8H, #6
        rshrn       v17.8B, v17.8H, #6
  .else
        add         v16.8H, v16.8H, v22.8H
        add         v17.8H, v17.8H, v22.8H
        shrn        v16.8B, v16.8H, #6
        shrn        v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.8B}, [x8], x2
        ld1         {v21.8B}, [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
        urhadd      v17.8B, v17.8B, v21.8B
  .endif
        st1         {v16.8B}, [x0], x2
        st1         {v17.8B}, [x0], x2
        b.gt        5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov         x8, x0
  .endif
        prfm        pldl1strm, [x1]
        prfm        pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel      x6, rv40bias
        lsr         w9, w5, #1
        lsr         w10, w4, #1
        lsl         w9, w9, #3
        lsl         w10, w10, #1
        add         w9, w9, w10
        add         x6, x6, w9, UXTW
        ld1r        {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi        v22.8H, #28
  .endif
        mul         w7, w4, w5
        lsl         w14, w5, #3
        lsl         w13, w4, #3
        cmp         w7, #0
        sub         w6, w14, w7
        sub         w12, w13, w7
        sub         w4, w7, w13
        sub         w4, w4, w14
        add         w4, w4, #64
        b.eq        2f

        dup         v24.8B, w4
        dup         v25.8B, w12
        ld1         {v4.8B}, [x1], x2
        dup         v26.8B, w6
        dup         v27.8B, w7
        ext         v5.8B, v4.8B, v5.8B, #1
        trn1        v0.2S, v24.2S, v25.2S
        trn1        v2.2S, v26.2S, v27.2S
        trn1        v4.2S, v4.2S, v5.2S
1:      ld1         {v6.8B}, [x1], x2
        ext         v7.8B, v6.8B, v7.8B, #1
        trn1        v6.2S, v6.2S, v7.2S
        umull       v18.8H, v4.8B, v0.8B
        umlal       v18.8H, v6.8B, v2.8B
        ld1         {v4.8B}, [x1], x2
        ext         v5.8B, v4.8B, v5.8B, #1
        trn1        v4.2S, v4.2S, v5.2S
        prfm        pldl1strm, [x1]
        umull       v19.8H, v6.8B, v0.8B
        umlal       v19.8H, v4.8B, v2.8B
        trn1        v30.2D, v18.2D, v19.2D
        trn2        v31.2D, v18.2D, v19.2D
        add         v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
        subs        w3, w3, #2
        prfm        pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        1b
        ret

2:      adds        w12, w12, w6
        dup         v30.8B, w4
        b.eq        5f
        tst         w6, w6
        dup         v31.8B, w12
        trn1        v0.2S, v30.2S, v31.2S
        trn2        v1.2S, v30.2S, v31.2S
        b.eq        4f

        ext         v1.8B, v0.8B, v1.8B, #4
        ld1         {v4.S}[0], [x1], x2
3:      ld1         {v4.S}[1], [x1], x2
        umull       v18.8H, v4.8B, v0.8B
        ld1         {v4.S}[0], [x1], x2
        umull       v19.8H, v4.8B, v1.8B
        trn1        v30.2D, v18.2D, v19.2D
        trn2        v31.2D, v18.2D, v19.2D
        add         v18.8H, v30.8H, v31.8H
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        subs        w3, w3, #2
        prfm        pldl1strm, [x1, x2]
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        3b
        ret

4:      ld1         {v4.8B}, [x1], x2
        ld1         {v6.8B}, [x1], x2
        ext         v5.8B, v4.8B, v5.8B, #1
        ext         v7.8B, v6.8B, v7.8B, #1
        trn1        v4.2S, v4.2S, v5.2S
        trn1        v6.2S, v6.2S, v7.2S
        umull       v18.8H, v4.8B, v0.8B
        umull       v19.8H, v6.8B, v0.8B
        subs        w3, w3, #2
        trn1        v30.2D, v18.2D, v19.2D
        trn2        v31.2D, v18.2D, v19.2D
        add         v18.8H, v30.8H, v31.8H
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        prfm        pldl1strm, [x1]
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        4b
        ret

5:      ld1         {v4.S}[0], [x1], x2
        ld1         {v4.S}[1], [x1], x2
        umull       v18.8H, v4.8B, v30.8B
        subs        w3, w3, #2
        prfm        pldl1strm, [x1]
  .ifc \codec,h264
        rshrn       v16.8B, v18.8H, #6
  .else
        add         v18.8H, v18.8H, v22.8H
        shrn        v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1         {v20.S}[0], [x8], x2
        ld1         {v20.S}[1], [x8], x2
        urhadd      v16.8B, v16.8B, v20.8B
  .endif
        prfm        pldl1strm, [x1]
        st1         {v16.S}[0], [x0], x2
        st1         {v16.S}[1], [x0], x2
        b.gt        5b
        ret
endfunc
.endm

.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm        pldl1strm, [x1]
        prfm        pldl1strm, [x1, x2]
        orr         w7, w4, w5
        cbz         w7, 2f

        mul         w7, w4, w5
        lsl         w14, w5, #3
        lsl         w13, w4, #3
        sub         w6, w14, w7
        sub         w12, w13, w7
        sub         w4, w7, w13
        sub         w4, w4, w14
        add         w4, w4, #64
        dup         v0.8B, w4
        dup         v2.8B, w12
        dup         v1.8B, w6
        dup         v3.8B, w7
        trn1        v0.4H, v0.4H, v2.4H
        trn1        v1.4H, v1.4H, v3.4H
1:
        ld1         {v4.S}[0], [x1], x2
        ld1         {v4.S}[1], [x1], x2
        rev64       v5.2S, v4.2S
        ld1         {v5.S}[1], [x1]
        ext         v6.8B, v4.8B, v5.8B, #1
        ext         v7.8B, v5.8B, v4.8B, #1
        trn1        v4.4H, v4.4H, v6.4H
        trn1        v5.4H, v5.4H, v7.4H
        umull       v16.8H, v4.8B, v0.8B
        umlal       v16.8H, v5.8B, v1.8B
  .ifc \type,avg
        ld1         {v18.H}[0], [x0], x2
        ld1         {v18.H}[2], [x0]
        sub         x0, x0, x2
  .endif
        rev64       v17.4S, v16.4S
        add         v16.8H, v16.8H, v17.8H
        rshrn       v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd      v16.8B, v16.8B, v18.8B
  .endif
        st1         {v16.H}[0], [x0], x2
        st1         {v16.H}[2], [x0], x2
        subs        w3, w3, #2
        b.gt        1b
        ret

2:
        ld1         {v16.H}[0], [x1], x2
        ld1         {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1         {v18.H}[0], [x0], x2
        ld1         {v18.H}[1], [x0]
        sub         x0, x0, x2
        urhadd      v16.8B, v16.8B, v18.8B
  .endif
        st1         {v16.H}[0], [x0], x2
        st1         {v16.H}[1], [x0], x2
        subs        w3, w3, #2
        b.gt        2b
        ret
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif
externals/ffmpeg/libavcodec/aarch64/h264dsp_init_aarch64.c (vendored, executable file, 129 lines added)
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264dsp.h"

void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                     int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                     int beta, int8_t *tc0);
void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                           int beta);
void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                           int beta);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                       int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                       int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                          int beta, int8_t *tc0);
void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta);
void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta);
void ff_h264_h_loop_filter_chroma422_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta);
void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta);

void ff_weight_h264_pixels_16_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                   int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                  int log2_den, int weight, int offset);
void ff_weight_h264_pixels_4_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                  int log2_den, int weight, int offset);

void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);

void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
                             int16_t *block, int stride,
                             const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
                                  int16_t *block, int stride,
                                  const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
                            int16_t *block, int stride,
                            const uint8_t nnzc[6*8]);

void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
                             int16_t *block, int stride,
                             const uint8_t nnzc[6*8]);

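/*
 * Wires the NEON routines above into the H264DSPContext dispatch table.
 * Only 8-bit depth has NEON implementations here; chroma_format_idc
 * selects between the 4:2:0 and 4:2:2 horizontal chroma filters.
 */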
av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
                                     const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && bit_depth == 8) {
        c->h264_v_loop_filter_luma       = ff_h264_v_loop_filter_luma_neon;
        c->h264_h_loop_filter_luma       = ff_h264_h_loop_filter_luma_neon;
        c->h264_v_loop_filter_luma_intra = ff_h264_v_loop_filter_luma_intra_neon;
        c->h264_h_loop_filter_luma_intra = ff_h264_h_loop_filter_luma_intra_neon;

        c->h264_v_loop_filter_chroma       = ff_h264_v_loop_filter_chroma_neon;
        c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;

        if (chroma_format_idc <= 1) {
            c->h264_h_loop_filter_chroma             = ff_h264_h_loop_filter_chroma_neon;
            c->h264_h_loop_filter_chroma_intra       = ff_h264_h_loop_filter_chroma_intra_neon;
            c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
        } else {
            c->h264_h_loop_filter_chroma             = ff_h264_h_loop_filter_chroma422_neon;
            c->h264_h_loop_filter_chroma_mbaff       = ff_h264_h_loop_filter_chroma_neon;
            c->h264_h_loop_filter_chroma_intra       = ff_h264_h_loop_filter_chroma422_intra_neon;
            c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon;
        }

        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;

        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;

        c->h264_idct_add        = ff_h264_idct_add_neon;
        c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
        c->h264_idct_add16      = ff_h264_idct_add16_neon;
        c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
        if (chroma_format_idc <= 1)
            c->h264_idct_add8   = ff_h264_idct_add8_neon;
        c->h264_idct8_add       = ff_h264_idct8_add_neon;
        c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
        c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
    }
}
829
externals/ffmpeg/libavcodec/aarch64/h264dsp_neon.S
vendored
Executable file
@@ -0,0 +1,829 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

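// Common prologue for the tc0-based deblocking functions: w2/w3 hold
// alpha/beta and x4 points at the four int8_t tc0 values. Returns early
// when alpha or beta is zero, or when all tc0 values are negative
// (nothing to filter); otherwise falls through with tc0 loaded into v24.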
.macro h264_loop_filter_start
        cmp     w2,  #0
        ldr     w6,  [x4]
        ccmp    w3,  #0,  #0,  ne
        mov     v24.S[0], w6
        and     w8,  w6,  w6,  lsl #16
        b.eq    1f
        ands    w8,  w8,  w8,  lsl #8
        b.ge    2f
1:
        ret
2:
.endm

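// Normal (tc0-clipped) luma deblock on 16 columns at once: builds the
// |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta masks, clips the filter
// delta to the per-edge tc bounds expanded from v24, and leaves the
// filtered p1/p0/q0/q1 rows in v17/v16/v0/v19.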
.macro h264_loop_filter_luma
        dup     v22.16B, w2                     // alpha
        uxtl    v24.8H,  v24.8B
        uabd    v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl    v24.4S,  v24.4H
        uabd    v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli     v24.8H,  v24.8H,  #8
        uabd    v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli     v24.4S,  v24.4S,  #16
        cmhi    v21.16B, v22.16B, v21.16B       // < alpha
        dup     v22.16B, w3                     // beta
        cmlt    v23.16B, v24.16B, #0
        cmhi    v28.16B, v22.16B, v28.16B       // < beta
        cmhi    v30.16B, v22.16B, v30.16B       // < beta
        bic     v21.16B, v21.16B, v23.16B
        uabd    v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and     v21.16B, v21.16B, v28.16B
        uabd    v19.16B, v4.16B,  v0.16B        // abs(q2 - q0)
        and     v21.16B, v21.16B, v30.16B       // < beta
        shrn    v30.8b,  v21.8h,  #4
        mov     x7,  v30.d[0]
        cmhi    v17.16B, v22.16B, v17.16B       // < beta
        cmhi    v19.16B, v22.16B, v19.16B       // < beta
        cbz     x7,  9f
        and     v17.16B, v17.16B, v21.16B
        and     v19.16B, v19.16B, v21.16B
        and     v24.16B, v24.16B, v21.16B
        urhadd  v28.16B, v16.16B, v0.16B
        sub     v21.16B, v24.16B, v17.16B
        uqadd   v23.16B, v18.16B, v24.16B
        uhadd   v20.16B, v20.16B, v28.16B
        sub     v21.16B, v21.16B, v19.16B
        uhadd   v28.16B, v4.16B,  v28.16B
        umin    v23.16B, v23.16B, v20.16B
        uqsub   v22.16B, v18.16B, v24.16B
        uqadd   v4.16B,  v2.16B,  v24.16B
        umax    v23.16B, v23.16B, v22.16B
        uqsub   v22.16B, v2.16B,  v24.16B
        umin    v28.16B, v4.16B,  v28.16B
        uxtl    v4.8H,   v0.8B
        umax    v28.16B, v28.16B, v22.16B
        uxtl2   v20.8H,  v0.16B
        usubw   v4.8H,   v4.8H,   v16.8B
        usubw2  v20.8H,  v20.8H,  v16.16B
        shl     v4.8H,   v4.8H,   #2
        shl     v20.8H,  v20.8H,  #2
        uaddw   v4.8H,   v4.8H,   v18.8B
        uaddw2  v20.8H,  v20.8H,  v18.16B
        usubw   v4.8H,   v4.8H,   v2.8B
        usubw2  v20.8H,  v20.8H,  v2.16B
        rshrn   v4.8B,   v4.8H,   #3
        rshrn2  v4.16B,  v20.8H,  #3
        bsl     v17.16B, v23.16B, v18.16B
        bsl     v19.16B, v28.16B, v2.16B
        neg     v23.16B, v21.16B
        uxtl    v28.8H,  v16.8B
        smin    v4.16B,  v4.16B,  v21.16B
        uxtl2   v21.8H,  v16.16B
        smax    v4.16B,  v4.16B,  v23.16B
        uxtl    v22.8H,  v0.8B
        uxtl2   v24.8H,  v0.16B
        saddw   v28.8H,  v28.8H,  v4.8B
        saddw2  v21.8H,  v21.8H,  v4.16B
        ssubw   v22.8H,  v22.8H,  v4.8B
        ssubw2  v24.8H,  v24.8H,  v4.16B
        sqxtun  v16.8B,  v28.8H
        sqxtun2 v16.16B, v21.8H
        sqxtun  v0.8B,   v22.8H
        sqxtun2 v0.16B,  v24.8H
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        ld1     {v0.16B}, [x0], x1
        ld1     {v2.16B}, [x0], x1
        ld1     {v4.16B}, [x0], x1
        sub     x0,  x0,  x1, lsl #2
        sub     x0,  x0,  x1, lsl #1
        ld1     {v20.16B}, [x0], x1
        ld1     {v18.16B}, [x0], x1
        ld1     {v16.16B}, [x0], x1

        h264_loop_filter_luma

        sub     x0,  x0,  x1, lsl #1
        st1     {v17.16B}, [x0], x1
        st1     {v16.16B}, [x0], x1
        st1     {v0.16B},  [x0], x1
        st1     {v19.16B}, [x0]
9:
        ret
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        sub     x0,  x0,  #4
        ld1     {v6.8B},  [x0], x1
        ld1     {v20.8B}, [x0], x1
        ld1     {v18.8B}, [x0], x1
        ld1     {v16.8B}, [x0], x1
        ld1     {v0.8B},  [x0], x1
        ld1     {v2.8B},  [x0], x1
        ld1     {v4.8B},  [x0], x1
        ld1     {v26.8B}, [x0], x1
        ld1     {v6.D}[1],  [x0], x1
        ld1     {v20.D}[1], [x0], x1
        ld1     {v18.D}[1], [x0], x1
        ld1     {v16.D}[1], [x0], x1
        ld1     {v0.D}[1],  [x0], x1
        ld1     {v2.D}[1],  [x0], x1
        ld1     {v4.D}[1],  [x0], x1
        ld1     {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub     x0,  x0,  x1, lsl #4
        add     x0,  x0,  #2
        st1     {v17.S}[0], [x0], x1
        st1     {v16.S}[0], [x0], x1
        st1     {v0.S}[0],  [x0], x1
        st1     {v19.S}[0], [x0], x1
        st1     {v17.S}[1], [x0], x1
        st1     {v16.S}[1], [x0], x1
        st1     {v0.S}[1],  [x0], x1
        st1     {v19.S}[1], [x0], x1
        st1     {v17.S}[2], [x0], x1
        st1     {v16.S}[2], [x0], x1
        st1     {v0.S}[2],  [x0], x1
        st1     {v19.S}[2], [x0], x1
        st1     {v17.S}[3], [x0], x1
        st1     {v16.S}[3], [x0], x1
        st1     {v0.S}[3],  [x0], x1
        st1     {v19.S}[3], [x0], x1
9:
        ret
endfunc

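// Strong (intra) filter variants: no tc0 clipping; the alpha/beta masks
// plus the |p0-q0| < (alpha >> 2) + 2 test select between the 3-tap and
// the stronger 4/5-tap smoothing of p2..q2.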
.macro h264_loop_filter_start_intra
        orr     w4,  w2,  w3
        cbnz    w4,  1f
        ret
1:
        sxtw    x1,  w1
        dup     v30.16b, w2                // alpha
        dup     v31.16b, w3                // beta
.endm

.macro h264_loop_filter_luma_intra
        uabd    v16.16b, v7.16b, v0.16b    // abs(p0 - q0)
        uabd    v17.16b, v6.16b, v7.16b    // abs(p1 - p0)
        uabd    v18.16b, v1.16b, v0.16b    // abs(q1 - q0)
        cmhi    v19.16b, v30.16b, v16.16b  // < alpha
        cmhi    v17.16b, v31.16b, v17.16b  // < beta
        cmhi    v18.16b, v31.16b, v18.16b  // < beta

        movi    v29.16b, #2
        ushr    v30.16b, v30.16b, #2       // alpha >> 2
        add     v30.16b, v30.16b, v29.16b  // (alpha >> 2) + 2
        cmhi    v16.16b, v30.16b, v16.16b  // < (alpha >> 2) + 2

        and     v19.16b, v19.16b, v17.16b
        and     v19.16b, v19.16b, v18.16b
        shrn    v20.8b,  v19.8h,  #4
        mov     x4,  v20.d[0]
        cbz     x4,  9f

        ushll   v20.8h,  v6.8b,  #1
        ushll   v22.8h,  v1.8b,  #1
        ushll2  v21.8h,  v6.16b, #1
        ushll2  v23.8h,  v1.16b, #1
        uaddw   v20.8h,  v20.8h, v7.8b
        uaddw   v22.8h,  v22.8h, v0.8b
        uaddw2  v21.8h,  v21.8h, v7.16b
        uaddw2  v23.8h,  v23.8h, v0.16b
        uaddw   v20.8h,  v20.8h, v1.8b
        uaddw   v22.8h,  v22.8h, v6.8b
        uaddw2  v21.8h,  v21.8h, v1.16b
        uaddw2  v23.8h,  v23.8h, v6.16b

        rshrn   v24.8b,  v20.8h, #2        // p0'_1
        rshrn   v25.8b,  v22.8h, #2        // q0'_1
        rshrn2  v24.16b, v21.8h, #2        // p0'_1
        rshrn2  v25.16b, v23.8h, #2        // q0'_1

        uabd    v17.16b, v5.16b, v7.16b    // abs(p2 - p0)
        uabd    v18.16b, v2.16b, v0.16b    // abs(q2 - q0)
        cmhi    v17.16b, v31.16b, v17.16b  // < beta
        cmhi    v18.16b, v31.16b, v18.16b  // < beta

        and     v17.16b, v16.16b, v17.16b  // if_2 && if_3
        and     v18.16b, v16.16b, v18.16b  // if_2 && if_4

        not     v30.16b, v17.16b
        not     v31.16b, v18.16b

        and     v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
        and     v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

        and     v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
        and     v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

        // calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
        uaddl   v26.8h,  v5.8b,  v7.8b
        uaddl2  v27.8h,  v5.16b, v7.16b
        uaddw   v26.8h,  v26.8h, v0.8b
        uaddw2  v27.8h,  v27.8h, v0.16b
        add     v20.8h,  v20.8h, v26.8h
        add     v21.8h,  v21.8h, v27.8h
        uaddw   v20.8h,  v20.8h, v0.8b
        uaddw2  v21.8h,  v21.8h, v0.16b
        rshrn   v20.8b,  v20.8h, #3        // p0'_2
        rshrn2  v20.16b, v21.8h, #3        // p0'_2
        uaddw   v26.8h,  v26.8h, v6.8b
        uaddw2  v27.8h,  v27.8h, v6.16b
        rshrn   v21.8b,  v26.8h, #2        // p1'_2
        rshrn2  v21.16b, v27.8h, #2        // p1'_2
        uaddl   v28.8h,  v4.8b,  v5.8b
        uaddl2  v29.8h,  v4.16b, v5.16b
        shl     v28.8h,  v28.8h, #1
        shl     v29.8h,  v29.8h, #1
        add     v28.8h,  v28.8h, v26.8h
        add     v29.8h,  v29.8h, v27.8h
        rshrn   v19.8b,  v28.8h, #3        // p2'_2
        rshrn2  v19.16b, v29.8h, #3        // p2'_2

        // calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
        uaddl   v26.8h,  v2.8b,  v0.8b
        uaddl2  v27.8h,  v2.16b, v0.16b
        uaddw   v26.8h,  v26.8h, v7.8b
        uaddw2  v27.8h,  v27.8h, v7.16b
        add     v22.8h,  v22.8h, v26.8h
        add     v23.8h,  v23.8h, v27.8h
        uaddw   v22.8h,  v22.8h, v7.8b
        uaddw2  v23.8h,  v23.8h, v7.16b
        rshrn   v22.8b,  v22.8h, #3        // q0'_2
        rshrn2  v22.16b, v23.8h, #3        // q0'_2
        uaddw   v26.8h,  v26.8h, v1.8b
        uaddw2  v27.8h,  v27.8h, v1.16b
        rshrn   v23.8b,  v26.8h, #2        // q1'_2
        rshrn2  v23.16b, v27.8h, #2        // q1'_2
        uaddl   v28.8h,  v2.8b,  v3.8b
        uaddl2  v29.8h,  v2.16b, v3.16b
        shl     v28.8h,  v28.8h, #1
        shl     v29.8h,  v29.8h, #1
        add     v28.8h,  v28.8h, v26.8h
        add     v29.8h,  v29.8h, v27.8h
        rshrn   v26.8b,  v28.8h, #3        // q2'_2
        rshrn2  v26.16b, v29.8h, #3        // q2'_2

        bit     v7.16b,  v24.16b, v30.16b  // p0'_1
        bit     v0.16b,  v25.16b, v31.16b  // q0'_1
        bit     v7.16b,  v20.16b, v17.16b  // p0'_2
        bit     v6.16b,  v21.16b, v17.16b  // p1'_2
        bit     v5.16b,  v19.16b, v17.16b  // p2'_2
        bit     v0.16b,  v22.16b, v18.16b  // q0'_2
        bit     v1.16b,  v23.16b, v18.16b  // q1'_2
        bit     v2.16b,  v26.16b, v18.16b  // q2'_2
.endm

function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1     {v0.16b}, [x0], x1         // q0
        ld1     {v1.16b}, [x0], x1         // q1
        ld1     {v2.16b}, [x0], x1         // q2
        ld1     {v3.16b}, [x0], x1         // q3
        sub     x0,  x0,  x1, lsl #3
        ld1     {v4.16b}, [x0], x1         // p3
        ld1     {v5.16b}, [x0], x1         // p2
        ld1     {v6.16b}, [x0], x1         // p1
        ld1     {v7.16b}, [x0]             // p0

        h264_loop_filter_luma_intra

        sub     x0,  x0,  x1, lsl #1
        st1     {v5.16b}, [x0], x1         // p2
        st1     {v6.16b}, [x0], x1         // p1
        st1     {v7.16b}, [x0], x1         // p0
        st1     {v0.16b}, [x0], x1         // q0
        st1     {v1.16b}, [x0], x1         // q1
        st1     {v2.16b}, [x0]             // q2
9:
        ret
endfunc

function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x0,  x0,  #4
        ld1     {v4.8b}, [x0], x1
        ld1     {v5.8b}, [x0], x1
        ld1     {v6.8b}, [x0], x1
        ld1     {v7.8b}, [x0], x1
        ld1     {v0.8b}, [x0], x1
        ld1     {v1.8b}, [x0], x1
        ld1     {v2.8b}, [x0], x1
        ld1     {v3.8b}, [x0], x1
        ld1     {v4.d}[1], [x0], x1
        ld1     {v5.d}[1], [x0], x1
        ld1     {v6.d}[1], [x0], x1
        ld1     {v7.d}[1], [x0], x1
        ld1     {v0.d}[1], [x0], x1
        ld1     {v1.d}[1], [x0], x1
        ld1     {v2.d}[1], [x0], x1
        ld1     {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub     x0,  x0,  x1, lsl #4
        st1     {v4.8b}, [x0], x1
        st1     {v5.8b}, [x0], x1
        st1     {v6.8b}, [x0], x1
        st1     {v7.8b}, [x0], x1
        st1     {v0.8b}, [x0], x1
        st1     {v1.8b}, [x0], x1
        st1     {v2.8b}, [x0], x1
        st1     {v3.8b}, [x0], x1
        st1     {v4.d}[1], [x0], x1
        st1     {v5.d}[1], [x0], x1
        st1     {v6.d}[1], [x0], x1
        st1     {v7.d}[1], [x0], x1
        st1     {v0.d}[1], [x0], x1
        st1     {v1.d}[1], [x0], x1
        st1     {v2.d}[1], [x0], x1
        st1     {v3.d}[1], [x0], x1
9:
        ret
endfunc

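// Chroma deblock: only p0/q0 are modified, so this is the 8-lane
// analogue of the luma filter with the p1/q1 updates dropped.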
.macro h264_loop_filter_chroma
        dup     v22.8B, w2              // alpha
        dup     v23.8B, w3              // beta
        uxtl    v24.8H, v24.8B
        uabd    v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
        uabd    v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
        uabd    v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
        cmhi    v26.8B, v22.8B, v26.8B  // < alpha
        cmhi    v28.8B, v23.8B, v28.8B  // < beta
        cmhi    v30.8B, v23.8B, v30.8B  // < beta
        uxtl    v4.8H,  v0.8B
        and     v26.8B, v26.8B, v28.8B
        usubw   v4.8H,  v4.8H,  v16.8B
        and     v26.8B, v26.8B, v30.8B
        shl     v4.8H,  v4.8H,  #2
        mov     x8,  v26.d[0]
        sli     v24.8H, v24.8H, #8
        uaddw   v4.8H,  v4.8H,  v18.8B
        cbz     x8,  9f
        usubw   v4.8H,  v4.8H,  v2.8B
        rshrn   v4.8B,  v4.8H,  #3
        smin    v4.8B,  v4.8B,  v24.8B
        neg     v25.8B, v24.8B
        smax    v4.8B,  v4.8B,  v25.8B
        uxtl    v22.8H, v0.8B
        and     v4.8B,  v4.8B,  v26.8B
        uxtl    v28.8H, v16.8B
        saddw   v28.8H, v28.8H, v4.8B
        ssubw   v22.8H, v22.8H, v4.8B
        sqxtun  v16.8B, v28.8H
        sqxtun  v0.8B,  v22.8H
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        sub     x0,  x0,  x1, lsl #1
        ld1     {v18.8B}, [x0], x1
        ld1     {v16.8B}, [x0], x1
        ld1     {v0.8B},  [x0], x1
        ld1     {v2.8B},  [x0]

        h264_loop_filter_chroma

        sub     x0,  x0,  x1, lsl #1
        st1     {v16.8B}, [x0], x1
        st1     {v0.8B},  [x0], x1
9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw    x1,  w1

        sub     x0,  x0,  #2
h_loop_filter_chroma420:
        ld1     {v18.S}[0], [x0], x1
        ld1     {v16.S}[0], [x0], x1
        ld1     {v0.S}[0],  [x0], x1
        ld1     {v2.S}[0],  [x0], x1
        ld1     {v18.S}[1], [x0], x1
        ld1     {v16.S}[1], [x0], x1
        ld1     {v0.S}[1],  [x0], x1
        ld1     {v2.S}[1],  [x0], x1

        transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

        sub     x0,  x0,  x1, lsl #3
        st1     {v18.S}[0], [x0], x1
        st1     {v16.S}[0], [x0], x1
        st1     {v0.S}[0],  [x0], x1
        st1     {v2.S}[0],  [x0], x1
        st1     {v18.S}[1], [x0], x1
        st1     {v16.S}[1], [x0], x1
        st1     {v0.S}[1],  [x0], x1
        st1     {v2.S}[1],  [x0], x1
9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma422_neon, export=1
        sxtw    x1,  w1
        h264_loop_filter_start
        add     x5,  x0,  x1
        sub     x0,  x0,  #2
        add     x1,  x1,  x1
        mov     x7,  x30
        bl      h_loop_filter_chroma420
        mov     x30, x7
        sub     x0,  x5,  #2
        mov     v24.s[0], w6
        b       h_loop_filter_chroma420
endfunc

.macro h264_loop_filter_chroma_intra
        uabd    v26.8b, v16.8b, v17.8b  // abs(p0 - q0)
        uabd    v27.8b, v18.8b, v16.8b  // abs(p1 - p0)
        uabd    v28.8b, v19.8b, v17.8b  // abs(q1 - q0)
        cmhi    v26.8b, v30.8b, v26.8b  // < alpha
        cmhi    v27.8b, v31.8b, v27.8b  // < beta
        cmhi    v28.8b, v31.8b, v28.8b  // < beta
        and     v26.8b, v26.8b, v27.8b
        and     v26.8b, v26.8b, v28.8b
        mov     x2,  v26.d[0]

        ushll   v4.8h,  v18.8b, #1
        ushll   v6.8h,  v19.8b, #1
        cbz     x2,  9f
        uaddl   v20.8h, v16.8b, v19.8b
        uaddl   v22.8h, v17.8b, v18.8b
        add     v20.8h, v20.8h, v4.8h
        add     v22.8h, v22.8h, v6.8h
        uqrshrn v24.8b, v20.8h, #2
        uqrshrn v25.8b, v22.8h, #2
        bit     v16.8b, v24.8b, v26.8b
        bit     v17.8b, v25.8b, v26.8b
.endm

function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x0,  x0,  x1, lsl #1
        ld1     {v18.8b}, [x0], x1
        ld1     {v16.8b}, [x0], x1
        ld1     {v17.8b}, [x0], x1
        ld1     {v19.8b}, [x0]

        h264_loop_filter_chroma_intra

        sub     x0,  x0,  x1, lsl #1
        st1     {v16.8b}, [x0], x1
        st1     {v17.8b}, [x0], x1

9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x4,  x0,  #2
        sub     x0,  x0,  #1
        ld1     {v18.8b}, [x4], x1
        ld1     {v16.8b}, [x4], x1
        ld1     {v17.8b}, [x4], x1
        ld1     {v19.8b}, [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2     {v16.b,v17.b}[0], [x0], x1
        st2     {v16.b,v17.b}[1], [x0], x1
        st2     {v16.b,v17.b}[2], [x0], x1
        st2     {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub     x4,  x0,  #2
        sub     x0,  x0,  #1
h_loop_filter_chroma420_intra:
        ld1     {v18.8b}, [x4], x1
        ld1     {v16.8b}, [x4], x1
        ld1     {v17.8b}, [x4], x1
        ld1     {v19.8b}, [x4], x1
        ld1     {v18.s}[1], [x4], x1
        ld1     {v16.s}[1], [x4], x1
        ld1     {v17.s}[1], [x4], x1
        ld1     {v19.s}[1], [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2     {v16.b,v17.b}[0], [x0], x1
        st2     {v16.b,v17.b}[1], [x0], x1
        st2     {v16.b,v17.b}[2], [x0], x1
        st2     {v16.b,v17.b}[3], [x0], x1
        st2     {v16.b,v17.b}[4], [x0], x1
        st2     {v16.b,v17.b}[5], [x0], x1
        st2     {v16.b,v17.b}[6], [x0], x1
        st2     {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc

function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub     x4,  x0,  #2
        add     x5,  x0,  x1, lsl #3
        sub     x0,  x0,  #1
        mov     x7,  x30
        bl      h_loop_filter_chroma420_intra
        sub     x0,  x5,  #1
        mov     x30, x7
        b       h_loop_filter_chroma420_intra
endfunc

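// Bi-directional weighted prediction. The macs/macd arguments are
// umlal/umlsl so one macro body covers all four sign combinations of
// the two weights; biweight_func pre-computes the rounding term
// ((offset + 1) | 1) << log2_den in v16 and the right-shift amount
// -(log2_den + 1) in v18 (applied via sshl).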
.macro biweight_16 macs, macd
        dup     v0.16B, w5
        dup     v1.16B, w6
        mov     v4.16B, v16.16B
        mov     v6.16B, v16.16B
1:      subs    w3,  w3,  #2
        ld1     {v20.16B}, [x0], x2
        \macd   v4.8H,  v0.8B,  v20.8B
        \macd\()2 v6.8H, v0.16B, v20.16B
        ld1     {v22.16B}, [x1], x2
        \macs   v4.8H,  v1.8B,  v22.8B
        \macs\()2 v6.8H, v1.16B, v22.16B
        mov     v24.16B, v16.16B
        ld1     {v28.16B}, [x0], x2
        mov     v26.16B, v16.16B
        \macd   v24.8H, v0.8B,  v28.8B
        \macd\()2 v26.8H, v0.16B, v28.16B
        ld1     {v30.16B}, [x1], x2
        \macs   v24.8H, v1.8B,  v30.8B
        \macs\()2 v26.8H, v1.16B, v30.16B
        sshl    v4.8H,  v4.8H,  v18.8H
        sshl    v6.8H,  v6.8H,  v18.8H
        sqxtun  v4.8B,  v4.8H
        sqxtun2 v4.16B, v6.8H
        sshl    v24.8H, v24.8H, v18.8H
        sshl    v26.8H, v26.8H, v18.8H
        sqxtun  v24.8B, v24.8H
        sqxtun2 v24.16B, v26.8H
        mov     v6.16B, v16.16B
        st1     {v4.16B},  [x7], x2
        mov     v4.16B, v16.16B
        st1     {v24.16B}, [x7], x2
        b.ne    1b
        ret
.endm

.macro biweight_8 macs, macd
        dup     v0.8B,  w5
        dup     v1.8B,  w6
        mov     v2.16B, v16.16B
        mov     v20.16B, v16.16B
1:      subs    w3,  w3,  #2
        ld1     {v4.8B}, [x0], x2
        \macd   v2.8H,  v0.8B,  v4.8B
        ld1     {v5.8B}, [x1], x2
        \macs   v2.8H,  v1.8B,  v5.8B
        ld1     {v6.8B}, [x0], x2
        \macd   v20.8H, v0.8B,  v6.8B
        ld1     {v7.8B}, [x1], x2
        \macs   v20.8H, v1.8B,  v7.8B
        sshl    v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        sshl    v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        mov     v20.16B, v16.16B
        st1     {v2.8B}, [x7], x2
        mov     v2.16B, v16.16B
        st1     {v4.8B}, [x7], x2
        b.ne    1b
        ret
.endm

.macro biweight_4 macs, macd
        dup     v0.8B,  w5
        dup     v1.8B,  w6
        mov     v2.16B, v16.16B
        mov     v20.16B, v16.16B
1:      subs    w3,  w3,  #4
        ld1     {v4.S}[0], [x0], x2
        ld1     {v4.S}[1], [x0], x2
        \macd   v2.8H,  v0.8B,  v4.8B
        ld1     {v5.S}[0], [x1], x2
        ld1     {v5.S}[1], [x1], x2
        \macs   v2.8H,  v1.8B,  v5.8B
        b.lt    2f
        ld1     {v6.S}[0], [x0], x2
        ld1     {v6.S}[1], [x0], x2
        \macd   v20.8H, v0.8B,  v6.8B
        ld1     {v7.S}[0], [x1], x2
        ld1     {v7.S}[1], [x1], x2
        \macs   v20.8H, v1.8B,  v7.8B
        sshl    v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        sshl    v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        mov     v20.16B, v16.16B
        st1     {v2.S}[0], [x7], x2
        st1     {v2.S}[1], [x7], x2
        mov     v2.16B, v16.16B
        st1     {v4.S}[0], [x7], x2
        st1     {v4.S}[1], [x7], x2
        b.ne    1b
        ret
2:      sshl    v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        st1     {v2.S}[0], [x7], x2
        st1     {v2.S}[1], [x7], x2
        ret
.endm

.macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw    x2,  w2
        lsr     w8,  w5,  #31
        add     w7,  w7,  #1
        eor     w8,  w8,  w6,  lsr #30
        orr     w7,  w7,  #1
        dup     v18.8H, w4
        lsl     w7,  w7,  w4
        not     v18.16B, v18.16B
        dup     v16.8H, w7
        mov     x7,  x0
        cbz     w8,  10f
        subs    w8,  w8,  #1
        b.eq    20f
        subs    w8,  w8,  #1
        b.eq    30f
        b       40f
10:     biweight_\w umlal, umlal
20:     neg     w5,  w5
        biweight_\w umlal, umlsl
30:     neg     w5,  w5
        neg     w6,  w6
        biweight_\w umlsl, umlsl
40:     neg     w6,  w6
        biweight_\w umlsl, umlal
endfunc
.endm

biweight_func 16
biweight_func 8
biweight_func 4

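// Uni-directional weighted prediction. weight_func specializes on
// whether log2_den > 1 and on the sign of the weight; the \add argument
// picks the add/sub (or halving shadd/shsub) combination of the
// pre-shifted offset used before the final rounding shift.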
.macro weight_16 add
        dup     v0.16B, w4
1:      subs    w2,  w2,  #2
        ld1     {v20.16B}, [x0], x1
        umull   v4.8H,  v0.8B,  v20.8B
        umull2  v6.8H,  v0.16B, v20.16B
        ld1     {v28.16B}, [x0], x1
        umull   v24.8H, v0.8B,  v28.8B
        umull2  v26.8H, v0.16B, v28.16B
        \add    v4.8H,  v16.8H, v4.8H
        srshl   v4.8H,  v4.8H,  v18.8H
        \add    v6.8H,  v16.8H, v6.8H
        srshl   v6.8H,  v6.8H,  v18.8H
        sqxtun  v4.8B,  v4.8H
        sqxtun2 v4.16B, v6.8H
        \add    v24.8H, v16.8H, v24.8H
        srshl   v24.8H, v24.8H, v18.8H
        \add    v26.8H, v16.8H, v26.8H
        srshl   v26.8H, v26.8H, v18.8H
        sqxtun  v24.8B, v24.8H
        sqxtun2 v24.16B, v26.8H
        st1     {v4.16B},  [x5], x1
        st1     {v24.16B}, [x5], x1
        b.ne    1b
        ret
.endm

.macro weight_8 add
        dup     v0.8B,  w4
1:      subs    w2,  w2,  #2
        ld1     {v4.8B}, [x0], x1
        umull   v2.8H,  v0.8B,  v4.8B
        ld1     {v6.8B}, [x0], x1
        umull   v20.8H, v0.8B,  v6.8B
        \add    v2.8H,  v16.8H, v2.8H
        srshl   v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        \add    v20.8H, v16.8H, v20.8H
        srshl   v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        st1     {v2.8B}, [x5], x1
        st1     {v4.8B}, [x5], x1
        b.ne    1b
        ret
.endm

.macro weight_4 add
        dup     v0.8B,  w4
1:      subs    w2,  w2,  #4
        ld1     {v4.S}[0], [x0], x1
        ld1     {v4.S}[1], [x0], x1
        umull   v2.8H,  v0.8B,  v4.8B
        b.lt    2f
        ld1     {v6.S}[0], [x0], x1
        ld1     {v6.S}[1], [x0], x1
        umull   v20.8H, v0.8B,  v6.8B
        \add    v2.8H,  v16.8H, v2.8H
        srshl   v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        \add    v20.8H, v16.8H, v20.8H
        srshl   v20.8H, v20.8H, v18.8H
        sqxtun  v4.8B,  v20.8H
        st1     {v2.S}[0], [x5], x1
        st1     {v2.S}[1], [x5], x1
        st1     {v4.S}[0], [x5], x1
        st1     {v4.S}[1], [x5], x1
        b.ne    1b
        ret
2:      \add    v2.8H,  v16.8H, v2.8H
        srshl   v2.8H,  v2.8H,  v18.8H
        sqxtun  v2.8B,  v2.8H
        st1     {v2.S}[0], [x5], x1
        st1     {v2.S}[1], [x5], x1
        ret
.endm

.macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw    x1,  w1
        cmp     w3,  #1
        mov     w6,  #1
        lsl     w5,  w5,  w3
        dup     v16.8H, w5
        mov     x5,  x0
        b.le    20f
        sub     w6,  w6,  w3
        dup     v18.8H, w6
        cmp     w4,  #0
        b.lt    10f
        weight_\w shadd
10:     neg     w4,  w4
        weight_\w shsub
20:     neg     w6,  w3
        dup     v18.8H, w6
        cmp     w4,  #0
        b.lt    10f
        weight_\w add
10:     neg     w4,  w4
        weight_\w sub
endfunc
.endm

weight_func 16
weight_func 8
weight_func 4
413
externals/ffmpeg/libavcodec/aarch64/h264idct_neon.S
vendored
Executable file
@@ -0,0 +1,413 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

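// 4x4 residual IDCT and add: row pass, 4x4 transpose, column pass, then
// a rounding >>6 and a saturating add into the destination pixels. The
// coefficients are zeroed (v30 stores) as they are read, as the decoder
// requires.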
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        ld1     {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
        sxtw    x2,  w2
        movi    v30.8H, #0

        add     v4.4H,  v0.4H,  v2.4H
        sshr    v16.4H, v1.4H,  #1
        st1     {v30.8H}, [x1], #16
        sshr    v17.4H, v3.4H,  #1
        st1     {v30.8H}, [x1], #16
        sub     v5.4H,  v0.4H,  v2.4H
        sub     v6.4H,  v16.4H, v3.4H
        add     v7.4H,  v1.4H,  v17.4H
        add     v0.4H,  v4.4H,  v7.4H
        add     v1.4H,  v5.4H,  v6.4H
        sub     v2.4H,  v5.4H,  v6.4H
        sub     v3.4H,  v4.4H,  v7.4H

        transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7

        add     v4.4H,  v0.4H,  v2.4H
        ld1     {v18.S}[0], [x0], x2
        sshr    v16.4H, v3.4H,  #1
        sshr    v17.4H, v1.4H,  #1
        ld1     {v18.S}[1], [x0], x2
        sub     v5.4H,  v0.4H,  v2.4H
        ld1     {v19.S}[1], [x0], x2
        add     v6.4H,  v16.4H, v1.4H
        ins     v4.D[1], v5.D[0]
        sub     v7.4H,  v17.4H, v3.4H
        ld1     {v19.S}[0], [x0], x2
        ins     v6.D[1], v7.D[0]
        sub     x0,  x0,  x2, lsl #2
        add     v0.8H,  v4.8H,  v6.8H
        sub     v1.8H,  v4.8H,  v6.8H

        srshr   v0.8H,  v0.8H,  #6
        srshr   v1.8H,  v1.8H,  #6

        uaddw   v0.8H,  v0.8H,  v18.8B
        uaddw   v1.8H,  v1.8H,  v19.8B

        sqxtun  v0.8B,  v0.8H
        sqxtun  v1.8B,  v1.8H

        st1     {v0.S}[0], [x0], x2
        st1     {v0.S}[1], [x0], x2
        st1     {v1.S}[1], [x0], x2
        st1     {v1.S}[0], [x0], x2

        sub     x1,  x1,  #32
        ret
endfunc

function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        sxtw    x2,  w2
        mov     w3,  #0
        ld1r    {v2.8H}, [x1]
        strh    w3,  [x1]
        srshr   v2.8H,  v2.8H,  #6
        ld1     {v0.S}[0], [x0], x2
        ld1     {v0.S}[1], [x0], x2
        uaddw   v3.8H,  v2.8H,  v0.8B
        ld1     {v1.S}[0], [x0], x2
        ld1     {v1.S}[1], [x0], x2
        uaddw   v4.8H,  v2.8H,  v1.8B
        sqxtun  v0.8B,  v3.8H
        sqxtun  v1.8B,  v4.8H
        sub     x0,  x0,  x2, lsl #2
        st1     {v0.S}[0], [x0], x2
        st1     {v0.S}[1], [x0], x2
        st1     {v1.S}[0], [x0], x2
        st1     {v1.S}[1], [x0], x2
        ret
endfunc

function ff_h264_idct_add16_neon, export=1
        mov     x12, x30
        mov     x6,  x0         // dest
        mov     x5,  x1         // block_offset
        mov     x1,  x2         // block
        mov     w9,  w3         // stride
        movrel  x7,  scan8
        mov     x10, #16
        movrel  x13, .L_ff_h264_idct_dc_add_neon
        movrel  x14, .L_ff_h264_idct_add_neon
1:      mov     w2,  w9
        ldrb    w3,  [x7], #1
        ldrsw   x0,  [x5], #4
        ldrb    w3,  [x4, w3, uxtw]
        subs    w3,  w3,  #1
        b.lt    2f
        ldrsh   w3,  [x1]
        add     x0,  x0,  x6
        ccmp    w3,  #0,  #4,  eq
        csel    x15, x13, x14, ne
        blr     x15
2:      subs    x10, x10, #1
        add     x1,  x1,  #32
        b.ne    1b
        ret     x12
endfunc

function ff_h264_idct_add16intra_neon, export=1
        mov     x12, x30
        mov     x6,  x0         // dest
        mov     x5,  x1         // block_offset
        mov     x1,  x2         // block
        mov     w9,  w3         // stride
        movrel  x7,  scan8
        mov     x10, #16
        movrel  x13, .L_ff_h264_idct_dc_add_neon
        movrel  x14, .L_ff_h264_idct_add_neon
1:      mov     w2,  w9
        ldrb    w3,  [x7], #1
        ldrsw   x0,  [x5], #4
        ldrb    w3,  [x4, w3, uxtw]
        add     x0,  x0,  x6
        cmp     w3,  #0
        ldrsh   w3,  [x1]
        csel    x15, x13, x14, eq
        ccmp    w3,  #0,  #0,  eq
        b.eq    2f
        blr     x15
2:      subs    x10, x10, #1
        add     x1,  x1,  #32
        b.ne    1b
        ret     x12
endfunc

function ff_h264_idct_add8_neon, export=1
        sub     sp,  sp,  #0x40
        stp     x19, x20, [sp]
        mov     x12, x30
        ldp     x6,  x15, [x0]          // dest[0], dest[1]
        add     x5,  x1,  #16*4         // block_offset
        add     x9,  x2,  #16*32        // block
        mov     w19, w3                 // stride
        movrel  x13, .L_ff_h264_idct_dc_add_neon
        movrel  x14, .L_ff_h264_idct_add_neon
        movrel  x7,  scan8, 16
        mov     x10, #0
        mov     x11, #16
1:      mov     w2,  w19
        ldrb    w3,  [x7, x10]          // scan8[i]
        ldrsw   x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb    w3,  [x4, w3, uxtw]     // nnzc[ scan8[i] ]
        add     x0,  x0,  x6            // block_offset[i] + dst[j-1]
        add     x1,  x9,  x10, lsl #5   // block + i * 16
        cmp     w3,  #0
        ldrsh   w3,  [x1]               // block[i*16]
        csel    x20, x13, x14, eq
        ccmp    w3,  #0,  #0,  eq
        b.eq    2f
        blr     x20
2:      add     x10, x10, #1
        cmp     x10, #4
        csel    x10, x11, x10, eq       // mov x10, #16
        csel    x6,  x15, x6,  eq
        cmp     x10, #20
        b.lt    1b
        ldp     x19, x20, [sp]
        add     sp,  sp,  #0x40
        ret     x12
endfunc

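// One pass of the 8x8 transform. The pass argument only swaps which
// registers alias the va/vb temporaries, so the same butterfly code
// serves both before and after the 8x8 transpose.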
.macro idct8x8_cols pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr    v18.8H, v26.8H, #1
        add     v16.8H, v24.8H, v28.8H
        ld1     {v30.8H, v31.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16
        sub     v17.8H, v24.8H, v28.8H
        sshr    v19.8H, v30.8H, #1
        sub     v18.8H, v18.8H, v30.8H
        add     v19.8H, v19.8H, v26.8H
  .else
        va      .req    v30
        vb      .req    v18
        sshr    v30.8H, v26.8H, #1
        sshr    v19.8H, v18.8H, #1
        add     v16.8H, v24.8H, v28.8H
        sub     v17.8H, v24.8H, v28.8H
        sub     v30.8H, v30.8H, v18.8H
        add     v19.8H, v19.8H, v26.8H
  .endif
        add     v26.8H, v17.8H, va.8H
        sub     v28.8H, v17.8H, va.8H
        add     v24.8H, v16.8H, v19.8H
        sub     vb.8H,  v16.8H, v19.8H
        sub     v16.8H, v29.8H, v27.8H
        add     v17.8H, v31.8H, v25.8H
        sub     va.8H,  v31.8H, v25.8H
        add     v19.8H, v29.8H, v27.8H
        sub     v16.8H, v16.8H, v31.8H
        sub     v17.8H, v17.8H, v27.8H
        add     va.8H,  va.8H,  v29.8H
        add     v19.8H, v19.8H, v25.8H
        sshr    v25.8H, v25.8H, #1
        sshr    v27.8H, v27.8H, #1
        sshr    v29.8H, v29.8H, #1
        sshr    v31.8H, v31.8H, #1
        sub     v16.8H, v16.8H, v31.8H
        sub     v17.8H, v17.8H, v27.8H
        add     va.8H,  va.8H,  v29.8H
        add     v19.8H, v19.8H, v25.8H
        sshr    v25.8H, v16.8H, #2
        sshr    v27.8H, v17.8H, #2
        sshr    v29.8H, va.8H,  #2
        sshr    v31.8H, v19.8H, #2
        sub     v19.8H, v19.8H, v25.8H
        sub     va.8H,  v27.8H, va.8H
        add     v17.8H, v17.8H, v29.8H
        add     v16.8H, v16.8H, v31.8H
  .if \pass == 0
        sub     v31.8H, v24.8H, v19.8H
        add     v24.8H, v24.8H, v19.8H
        add     v25.8H, v26.8H, v18.8H
        sub     v18.8H, v26.8H, v18.8H
        add     v26.8H, v28.8H, v17.8H
        add     v27.8H, v30.8H, v16.8H
        sub     v29.8H, v28.8H, v17.8H
        sub     v28.8H, v30.8H, v16.8H
  .else
        sub     v31.8H, v24.8H, v19.8H
        add     v24.8H, v24.8H, v19.8H
        add     v25.8H, v26.8H, v30.8H
        sub     v30.8H, v26.8H, v30.8H
        add     v26.8H, v28.8H, v17.8H
        sub     v29.8H, v28.8H, v17.8H
        add     v27.8H, v18.8H, v16.8H
        sub     v28.8H, v18.8H, v16.8H
  .endif
        .unreq  va
        .unreq  vb
.endm

function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        movi    v19.8H, #0
        sxtw    x2,  w2
        ld1     {v24.8H, v25.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16
        ld1     {v26.8H, v27.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16
        ld1     {v28.8H, v29.8H}, [x1]
        st1     {v19.8H}, [x1], #16
        st1     {v19.8H}, [x1], #16

        idct8x8_cols 0
        transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols 1

        mov     x3,  x0
        srshr   v24.8H, v24.8H, #6
        ld1     {v0.8B}, [x0], x2
        srshr   v25.8H, v25.8H, #6
        ld1     {v1.8B}, [x0], x2
        srshr   v26.8H, v26.8H, #6
        ld1     {v2.8B}, [x0], x2
        srshr   v27.8H, v27.8H, #6
        ld1     {v3.8B}, [x0], x2
        srshr   v28.8H, v28.8H, #6
        ld1     {v4.8B}, [x0], x2
        srshr   v29.8H, v29.8H, #6
        ld1     {v5.8B}, [x0], x2
        srshr   v30.8H, v30.8H, #6
        ld1     {v6.8B}, [x0], x2
        srshr   v31.8H, v31.8H, #6
        ld1     {v7.8B}, [x0], x2
        uaddw   v24.8H, v24.8H, v0.8B
        uaddw   v25.8H, v25.8H, v1.8B
        uaddw   v26.8H, v26.8H, v2.8B
        sqxtun  v0.8B,  v24.8H
        uaddw   v27.8H, v27.8H, v3.8B
        sqxtun  v1.8B,  v25.8H
        uaddw   v28.8H, v28.8H, v4.8B
        sqxtun  v2.8B,  v26.8H
        st1     {v0.8B}, [x3], x2
        uaddw   v29.8H, v29.8H, v5.8B
        sqxtun  v3.8B,  v27.8H
        st1     {v1.8B}, [x3], x2
        uaddw   v30.8H, v30.8H, v6.8B
        sqxtun  v4.8B,  v28.8H
        st1     {v2.8B}, [x3], x2
        uaddw   v31.8H, v31.8H, v7.8B
        sqxtun  v5.8B,  v29.8H
        st1     {v3.8B}, [x3], x2
        sqxtun  v6.8B,  v30.8H
        sqxtun  v7.8B,  v31.8H
        st1     {v4.8B}, [x3], x2
        st1     {v5.8B}, [x3], x2
        st1     {v6.8B}, [x3], x2
        st1     {v7.8B}, [x3], x2

        sub     x1,  x1,  #128
        ret
endfunc

function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        mov     w3,  #0
        sxtw    x2,  w2
        ld1r    {v31.8H}, [x1]
        strh    w3,  [x1]
        ld1     {v0.8B}, [x0], x2
        srshr   v31.8H, v31.8H, #6
        ld1     {v1.8B}, [x0], x2
        ld1     {v2.8B}, [x0], x2
        uaddw   v24.8H, v31.8H, v0.8B
        ld1     {v3.8B}, [x0], x2
        uaddw   v25.8H, v31.8H, v1.8B
        ld1     {v4.8B}, [x0], x2
        uaddw   v26.8H, v31.8H, v2.8B
        ld1     {v5.8B}, [x0], x2
        uaddw   v27.8H, v31.8H, v3.8B
        ld1     {v6.8B}, [x0], x2
        uaddw   v28.8H, v31.8H, v4.8B
        ld1     {v7.8B}, [x0], x2
        uaddw   v29.8H, v31.8H, v5.8B
        uaddw   v30.8H, v31.8H, v6.8B
        uaddw   v31.8H, v31.8H, v7.8B
        sqxtun  v0.8B,  v24.8H
        sqxtun  v1.8B,  v25.8H
        sqxtun  v2.8B,  v26.8H
        sqxtun  v3.8B,  v27.8H
        sub     x0,  x0,  x2, lsl #3
        st1     {v0.8B}, [x0], x2
        sqxtun  v4.8B,  v28.8H
        st1     {v1.8B}, [x0], x2
        sqxtun  v5.8B,  v29.8H
        st1     {v2.8B}, [x0], x2
        sqxtun  v6.8B,  v30.8H
        st1     {v3.8B}, [x0], x2
        sqxtun  v7.8B,  v31.8H
        st1     {v4.8B}, [x0], x2
        st1     {v5.8B}, [x0], x2
        st1     {v6.8B}, [x0], x2
        st1     {v7.8B}, [x0], x2
        ret
endfunc

function ff_h264_idct8_add4_neon, export=1
        mov     x12, x30
        mov     x6,  x0
        mov     x5,  x1
        mov     x1,  x2
        mov     w2,  w3
        movrel  x7,  scan8
        mov     w10, #16
        movrel  x13, .L_ff_h264_idct8_dc_add_neon
        movrel  x14, .L_ff_h264_idct8_add_neon
1:      ldrb    w9,  [x7], #4
        ldrsw   x0,  [x5], #16
        ldrb    w9,  [x4, w9, UXTW]
        subs    w9,  w9,  #1
        b.lt    2f
        ldrsh   w11, [x1]
        add     x0,  x6,  x0
        ccmp    w11, #0,  #4,  eq
        csel    x15, x13, x14, ne
        blr     x15
2:      subs    w10, w10, #4
        add     x1,  x1,  #128
        b.ne    1b
        ret     x12
endfunc

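// Copy of the decoder's scan8[] table: maps block indices to positions
// in the non_zero_count cache so the add16/add16intra/add8/add4 loops
// above can skip empty blocks.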
const scan8
        .byte   4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte   6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte   4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte   6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte   4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte   6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte   4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte   6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte   4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte   6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte   4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte   6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst
93
externals/ffmpeg/libavcodec/aarch64/h264pred_init.c
vendored
Executable file
@@ -0,0 +1,93 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"

void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);

void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);

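/*
 * Only 8-bit content is accelerated. The codec_id checks leave the C
 * versions in place for SVQ3/RV40/VP7/VP8 modes whose semantics differ
 * from H.264 (notably the plane and DC predictors).
 */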
static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                        const int bit_depth,
                                        const int chroma_format_idc)
{
    const int high_depth = bit_depth > 8;

    if (high_depth)
        return;

    if (chroma_format_idc <= 1) {
        h->pred8x8[VERT_PRED8x8   ] = ff_pred8x8_vert_neon;
        h->pred8x8[HOR_PRED8x8    ] = ff_pred8x8_hor_neon;
        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
        h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
            codec_id != AV_CODEC_ID_VP8) {
            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
        }
    }

    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
}

av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
                                       int bit_depth, const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags))
        h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
}
361
externals/ffmpeg/libavcodec/aarch64/h264pred_neon.S
vendored
Executable file
@@ -0,0 +1,361 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

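// Loads a column of n bytes, stride \rt apart, into the lanes of \rd:
// the gather used to read the left-neighbour pixels that the DC and
// plane predictors need.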
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n >= 8 || \hi == 0
        ld1     {\rd\().b}[0], [\rs], \rt
        ld1     {\rd\().b}[1], [\rs], \rt
        ld1     {\rd\().b}[2], [\rs], \rt
        ld1     {\rd\().b}[3], [\rs], \rt
.endif
.if \n >= 8 || \hi == 1
        ld1     {\rd\().b}[4], [\rs], \rt
        ld1     {\rd\().b}[5], [\rs], \rt
        ld1     {\rd\().b}[6], [\rs], \rt
        ld1     {\rd\().b}[7], [\rs], \rt
.endif
.if \n == 16
        ld1     {\rd\().b}[8],  [\rs], \rt
        ld1     {\rd\().b}[9],  [\rs], \rt
        ld1     {\rd\().b}[10], [\rs], \rt
        ld1     {\rd\().b}[11], [\rs], \rt
        ld1     {\rd\().b}[12], [\rs], \rt
        ld1     {\rd\().b}[13], [\rs], \rt
        ld1     {\rd\().b}[14], [\rs], \rt
        ld1     {\rd\().b}[15], [\rs], \rt
.endif
.endm

function ff_pred16x16_128_dc_neon, export=1
        movi    v0.16b, #128
        b       .L_pred16x16_dc_end
endfunc

function ff_pred16x16_top_dc_neon, export=1
        sub     x2,  x0,  x1
        ld1     {v0.16b}, [x2]
        uaddlv  h0,  v0.16b
        rshrn   v0.8b,  v0.8h,  #4
        dup     v0.16b, v0.b[0]
        b       .L_pred16x16_dc_end
endfunc

function ff_pred16x16_left_dc_neon, export=1
        sub     x2,  x0,  #1
        ldcol.8 v0,  x2,  x1,  16
        uaddlv  h0,  v0.16b
        rshrn   v0.8b,  v0.8h,  #4
        dup     v0.16b, v0.b[0]
        b       .L_pred16x16_dc_end
endfunc

function ff_pred16x16_dc_neon, export=1
        sub     x2,  x0,  x1
        sub     x3,  x0,  #1
        ld1     {v0.16b}, [x2]
        ldcol.8 v1,  x3,  x1,  16
        uaddlv  h0,  v0.16b
        uaddlv  h1,  v1.16b
        add     v0.4h,  v0.4h,  v1.4h
        rshrn   v0.8b,  v0.8h,  #5
        dup     v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov     w3,  #8
6:      st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    6b
        ret
endfunc

function ff_pred16x16_hor_neon, export=1
        sub     x2,  x0,  #1
        mov     w3,  #16
1:      ld1r    {v0.16b}, [x2], x1
        st1     {v0.16b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred16x16_vert_neon, export=1
        sub     x2,  x0,  x1
        add     x1,  x1,  x1
        ld1     {v0.16b}, [x2], x1
        mov     w3,  #8
1:      st1     {v0.16b}, [x0], x1
        st1     {v0.16b}, [x2], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

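// Plane prediction: the p16weight multiplies form the weighted gradient
// sums H and V over the top and left borders, from which the a/b/c
// plane parameters are derived; each 16-pixel row is then produced with
// one add and a saturating narrow per half.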
function ff_pred16x16_plane_neon, export=1
        sub     x3,  x0,  x1
        movrel  x4,  p16weight
        add     x2,  x3,  #8
        sub     x3,  x3,  #1
        ld1     {v0.8b}, [x3]
        ld1     {v2.8b}, [x2], x1
        ldcol.8 v1,  x3,  x1
        add     x3,  x3,  x1
        ldcol.8 v3,  x3,  x1
        rev64   v0.8b,  v0.8b
        rev64   v1.8b,  v1.8b
        uaddl   v7.8h,  v2.8b,  v3.8b
        usubl   v2.8h,  v2.8b,  v0.8b
        usubl   v3.8h,  v3.8b,  v1.8b
        ld1     {v0.8h}, [x4]
        mul     v2.8h,  v2.8h,  v0.8h
        mul     v3.8h,  v3.8h,  v0.8h
        addp    v2.8h,  v2.8h,  v3.8h
        addp    v2.8h,  v2.8h,  v2.8h
        addp    v2.4h,  v2.4h,  v2.4h
        sshll   v3.4s,  v2.4h,  #2
        saddw   v2.4s,  v3.4s,  v2.4h
        rshrn   v4.4h,  v2.4s,  #6
        trn2    v5.4h,  v4.4h,  v4.4h
        add     v2.4h,  v4.4h,  v5.4h
        shl     v3.4h,  v2.4h,  #3
        ext     v7.16b, v7.16b, v7.16b, #14
        sub     v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        add     v7.4h,  v7.4h,  v0.4h
        shl     v2.4h,  v7.4h,  #4
        sub     v2.4h,  v2.4h,  v3.4h
        shl     v3.4h,  v4.4h,  #4
        ext     v0.16b, v0.16b, v0.16b, #14
        sub     v6.4h,  v5.4h,  v3.4h
        mov     v0.h[0], wzr
        mul     v0.8h,  v0.8h,  v4.h[0]
        dup     v1.8h,  v2.h[0]
        dup     v2.8h,  v4.h[0]
        dup     v3.8h,  v6.h[0]
        shl     v2.8h,  v2.8h,  #3
        add     v1.8h,  v1.8h,  v0.8h
        add     v3.8h,  v3.8h,  v2.8h
        mov     w3,  #16
1:
        sqshrun v0.8b,  v1.8h,  #5
        add     v1.8h,  v1.8h,  v2.8h
        sqshrun2 v0.16b, v1.8h, #5
        add     v1.8h,  v1.8h,  v3.8h
        st1     {v0.16b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

const p16weight, align=4
        .short  1,2,3,4,5,6,7,8
endconst
const p8weight, align=4
        .short  1,2,3,4,1,2,3,4
endconst

function ff_pred8x8_hor_neon, export=1
        sub     x2,  x0,  #1
        mov     w3,  #8
1:      ld1r    {v0.8b}, [x2], x1
        st1     {v0.8b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred8x8_vert_neon, export=1
        sub     x2,  x0,  x1
        lsl     x1,  x1,  #1
        ld1     {v0.8b}, [x2], x1
        mov     w3,  #4
1:      st1     {v0.8b}, [x0], x1
        st1     {v0.8b}, [x2], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred8x8_plane_neon, export=1
        sub     x3,  x0,  x1
        movrel  x4,  p8weight
        movrel  x5,  p16weight
        add     x2,  x3,  #4
        sub     x3,  x3,  #1
        ld1     {v0.s}[0], [x3]
        ld1     {v2.s}[0], [x2], x1
        ldcol.8 v0,  x3,  x1,  4,  hi=1
        add     x3,  x3,  x1
        ldcol.8 v3,  x3,  x1,  4
        uaddl   v7.8h,  v2.8b,  v3.8b
        rev32   v0.8b,  v0.8b
        trn1    v2.2s,  v2.2s,  v3.2s
        usubl   v2.8h,  v2.8b,  v0.8b
        ld1     {v6.8h}, [x4]
        mul     v2.8h,  v2.8h,  v6.8h
        ld1     {v0.8h}, [x5]
        saddlp  v2.4s,  v2.8h
        addp    v2.4s,  v2.4s,  v2.4s
        shl     v3.4s,  v2.4s,  #4
        add     v2.4s,  v3.4s,  v2.4s
        rshrn   v5.4h,  v2.4s,  #5
        addp    v2.4h,  v5.4h,  v5.4h
        shl     v3.4h,  v2.4h,  #1
        add     v3.4h,  v3.4h,  v2.4h
        rev64   v7.4h,  v7.4h
        add     v7.4h,  v7.4h,  v0.4h
        shl     v2.4h,  v7.4h,  #4
        sub     v2.4h,  v2.4h,  v3.4h
        ext     v0.16b, v0.16b, v0.16b, #14
        mov     v0.h[0], wzr
        mul     v0.8h,  v0.8h,  v5.h[0]
        dup     v1.8h,  v2.h[0]
        dup     v2.8h,  v5.h[1]
        add     v1.8h,  v1.8h,  v0.8h
        mov     w3,  #8
1:
        sqshrun v0.8b,  v1.8h,  #5
        add     v1.8h,  v1.8h,  v2.8h
        st1     {v0.8b}, [x0], x1
        subs    w3,  w3,  #1
        b.ne    1b
        ret
endfunc

function ff_pred8x8_128_dc_neon, export=1
        movi    v0.8b,  #128
        movi    v1.8b,  #128
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_top_dc_neon, export=1
        sub     x2,  x0,  x1
        ld1     {v0.8b}, [x2]
        uaddlp  v0.4h,  v0.8b
        addp    v0.4h,  v0.4h,  v0.4h
        zip1    v0.8h,  v0.8h,  v0.8h
        rshrn   v2.8b,  v0.8h,  #2
        zip1    v0.8b,  v2.8b,  v2.8b
        zip1    v1.8b,  v2.8b,  v2.8b
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_left_dc_neon, export=1
        sub     x2,  x0,  #1
        ldcol.8 v0,  x2,  x1
        uaddlp  v0.4h,  v0.8b
        addp    v0.4h,  v0.4h,  v0.4h
        rshrn   v2.8b,  v0.8h,  #2
        dup     v1.8b,  v2.b[1]
        dup     v0.8b,  v2.b[0]
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_dc_neon, export=1
        sub     x2,  x0,  x1
        sub     x3,  x0,  #1
        ld1     {v0.8b}, [x2]
        ldcol.8 v1,  x3,  x1
        uaddlp  v0.4h,  v0.8b
        uaddlp  v1.4h,  v1.8b
        trn1    v2.2s,  v0.2s,  v1.2s
        trn2    v3.2s,  v0.2s,  v1.2s
        addp    v4.4h,  v2.4h,  v3.4h
        addp    v5.4h,  v4.4h,  v4.4h
        rshrn   v6.8b,  v5.8h,  #3
        rshrn   v7.8b,  v4.8h,  #2
        dup     v0.8b,  v6.b[0]
        dup     v2.8b,  v7.b[2]
        dup     v1.8b,  v7.b[3]
        dup     v3.8b,  v6.b[1]
        zip1    v0.2s,  v0.2s,  v2.2s
        zip1    v1.2s,  v1.2s,  v3.2s
.L_pred8x8_dc_end:
        mov     w3,  #4
        add     x2,  x0,  x1,  lsl #2
6:      st1     {v0.8b}, [x0], x1
        st1     {v1.8b}, [x2], x1
        subs    w3,  w3,  #1
        b.ne    6b
        ret
endfunc

function ff_pred8x8_l0t_dc_neon, export=1
        sub     x2,  x0,  x1
        sub     x3,  x0,  #1
        ld1     {v0.8b}, [x2]
        ldcol.8 v1,  x3,  x1,  4
        zip1    v0.4s,  v0.4s,  v1.4s
        uaddlp  v0.8h,  v0.16b
        addp    v0.8h,  v0.8h,  v0.8h
        addp    v1.4h,  v0.4h,  v0.4h
        rshrn   v2.8b,  v0.8h,  #2
        rshrn   v3.8b,  v1.8h,  #3
        dup     v4.8b,  v3.b[0]
        dup     v6.8b,  v2.b[2]
        dup     v5.8b,  v2.b[0]
        zip1    v0.2s,  v4.2s,  v6.2s
        zip1    v1.2s,  v5.2s,  v6.2s
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_l00_dc_neon, export=1
        sub     x2,  x0,  #1
        ldcol.8 v0,  x2,  x1,  4
        uaddlp  v0.4h,  v0.8b
        addp    v0.4h,  v0.4h,  v0.4h
        rshrn   v0.8b,  v0.8h,  #2
        movi    v1.8b,  #128
        dup     v0.8b,  v0.b[0]
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_0lt_dc_neon, export=1
        add     x3,  x0,  x1,  lsl #2
        sub     x2,  x0,  x1
        sub     x3,  x3,  #1
        ld1     {v0.8b}, [x2]
        ldcol.8 v1,  x3,  x1,  4,  hi=1
        zip1    v0.4s,  v0.4s,  v1.4s
        uaddlp  v0.8h,  v0.16b
        addp    v0.8h,  v0.8h,  v0.8h
        addp    v1.4h,  v0.4h,  v0.4h
        rshrn   v2.8b,  v0.8h,  #2
        rshrn   v3.8b,  v1.8h,  #3
        dup     v4.8b,  v2.b[0]
        dup     v5.8b,  v2.b[3]
        dup     v6.8b,  v2.b[2]
        dup     v7.8b,  v3.b[1]
        zip1    v0.2s,  v4.2s,  v6.2s
        zip1    v1.2s,  v5.2s,  v7.2s
        b       .L_pred8x8_dc_end
endfunc

function ff_pred8x8_0l0_dc_neon, export=1
        add     x2,  x0,  x1,  lsl #2
        sub     x2,  x2,  #1
        ldcol.8 v1,  x2,  x1,  4
        uaddlp  v2.4h,  v1.8b
        addp    v2.4h,  v2.4h,  v2.4h
        rshrn   v1.8b,  v2.8h,  #2
        movi    v0.8b,  #128
        dup     v1.8b,  v1.b[0]
        b       .L_pred8x8_dc_end
endfunc
172
externals/ffmpeg/libavcodec/aarch64/h264qpel_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,172 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264qpel.h"

void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
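
/*
 * The tables below are indexed as [size][x + 4 * y]: row 0 holds the 16x16
 * functions and row 1 the 8x8 ones, and the mcXY suffix encodes the
 * quarter-pel phase (x, y), so e.g. mc01 lands at index 4.
 */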
av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
{
    const int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && !high_bit_depth) {
        c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
        c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
        c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
        c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
        c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
        c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
        c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
        c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
        c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
        c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;

        c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
        c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
        c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
        c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
        c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
        c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
        c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
        c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
        c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
        c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;

        c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
        c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
        c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
        c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
        c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
        c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
        c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
        c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
        c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
        c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;

        c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
        c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
        c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
        c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
        c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
        c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
        c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
        c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
        c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
        c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
    }
}

934
externals/ffmpeg/libavcodec/aarch64/h264qpel_neon.S
vendored
Executable file
934
externals/ffmpeg/libavcodec/aarch64/h264qpel_neon.S
vendored
Executable file
@@ -0,0 +1,934 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

/* H.264 qpel MC */

.macro  lowpass_const r
        movz    \r, #20, lsl #16
        movk    \r, #5
        mov     v6.S[0], \r
.endm
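
// lowpass_const packs the six-tap filter weights into v6: movz/movk build
// (20 << 16) | 5 in a GPR, so v6.H[0] = 5 and v6.H[1] = 20, the two non-unit
// magnitudes of the H.264 (1, -5, 20, 20, -5, 1) kernel applied by the
// mla/mls pairs in the lowpass macros below.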
//trashes v0-v5
.macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
        ext     v2.8B, \r0\().8B, \r1\().8B, #2
        ext     v3.8B, \r0\().8B, \r1\().8B, #3
        uaddl   v2.8H, v2.8B, v3.8B
        ext     v4.8B, \r0\().8B, \r1\().8B, #1
        ext     v5.8B, \r0\().8B, \r1\().8B, #4
        uaddl   v4.8H, v4.8B, v5.8B
        ext     v1.8B, \r0\().8B, \r1\().8B, #5
        uaddl   \d0\().8H, \r0\().8B, v1.8B
        ext     v0.8B, \r2\().8B, \r3\().8B, #2
        mla     \d0\().8H, v2.8H, v6.H[1]
        ext     v1.8B, \r2\().8B, \r3\().8B, #3
        uaddl   v0.8H, v0.8B, v1.8B
        ext     v1.8B, \r2\().8B, \r3\().8B, #1
        mls     \d0\().8H, v4.8H, v6.H[0]
        ext     v3.8B, \r2\().8B, \r3\().8B, #4
        uaddl   v1.8H, v1.8B, v3.8B
        ext     v2.8B, \r2\().8B, \r3\().8B, #5
        uaddl   \d1\().8H, \r2\().8B, v2.8B
        mla     \d1\().8H, v0.8H, v6.H[1]
        mls     \d1\().8H, v1.8H, v6.H[0]
.if \narrow
        sqrshrun \d0\().8B, \d0\().8H, #5
        sqrshrun \d1\().8B, \d1\().8H, #5
.endif
.endm

//trashes v0-v5, v7, v30-v31
.macro  lowpass_8H r0, r1
        ext     v0.16B, \r0\().16B, \r0\().16B, #2
        ext     v1.16B, \r0\().16B, \r0\().16B, #3
        uaddl   v0.8H, v0.8B, v1.8B
        ext     v2.16B, \r0\().16B, \r0\().16B, #1
        ext     v3.16B, \r0\().16B, \r0\().16B, #4
        uaddl   v2.8H, v2.8B, v3.8B
        ext     v30.16B, \r0\().16B, \r0\().16B, #5
        uaddl   \r0\().8H, \r0\().8B, v30.8B
        ext     v4.16B, \r1\().16B, \r1\().16B, #2
        mla     \r0\().8H, v0.8H, v6.H[1]
        ext     v5.16B, \r1\().16B, \r1\().16B, #3
        uaddl   v4.8H, v4.8B, v5.8B
        ext     v7.16B, \r1\().16B, \r1\().16B, #1
        mls     \r0\().8H, v2.8H, v6.H[0]
        ext     v0.16B, \r1\().16B, \r1\().16B, #4
        uaddl   v7.8H, v7.8B, v0.8B
        ext     v31.16B, \r1\().16B, \r1\().16B, #5
        uaddl   \r1\().8H, \r1\().8B, v31.8B
        mla     \r1\().8H, v4.8H, v6.H[1]
        mls     \r1\().8H, v7.8H, v6.H[0]
.endm

// trashes v2-v5, v30
.macro  lowpass_8_1 r0, r1, d0, narrow=1
        ext     v2.8B, \r0\().8B, \r1\().8B, #2
        ext     v3.8B, \r0\().8B, \r1\().8B, #3
        uaddl   v2.8H, v2.8B, v3.8B
        ext     v4.8B, \r0\().8B, \r1\().8B, #1
        ext     v5.8B, \r0\().8B, \r1\().8B, #4
        uaddl   v4.8H, v4.8B, v5.8B
        ext     v30.8B, \r0\().8B, \r1\().8B, #5
        uaddl   \d0\().8H, \r0\().8B, v30.8B
        mla     \d0\().8H, v2.8H, v6.H[1]
        mls     \d0\().8H, v4.8H, v6.H[0]
.if \narrow
        sqrshrun \d0\().8B, \d0\().8H, #5
.endif
.endm

// trashes v0-v7
.macro  lowpass_8.16 r0, r1, r2
        ext     v1.16B, \r0\().16B, \r1\().16B, #4
        ext     v0.16B, \r0\().16B, \r1\().16B, #6
        saddl   v5.4S, v1.4H, v0.4H
        ext     v2.16B, \r0\().16B, \r1\().16B, #2
        saddl2  v1.4S, v1.8H, v0.8H
        ext     v3.16B, \r0\().16B, \r1\().16B, #8
        saddl   v6.4S, v2.4H, v3.4H
        ext     \r1\().16B, \r0\().16B, \r1\().16B, #10
        saddl2  v2.4S, v2.8H, v3.8H
        saddl   v0.4S, \r0\().4H, \r1\().4H
        saddl2  v4.4S, \r0\().8H, \r1\().8H

        shl     v3.4S, v5.4S, #4
        shl     v5.4S, v5.4S, #2
        shl     v7.4S, v6.4S, #2
        add     v5.4S, v5.4S, v3.4S
        add     v6.4S, v6.4S, v7.4S

        shl     v3.4S, v1.4S, #4
        shl     v1.4S, v1.4S, #2
        shl     v7.4S, v2.4S, #2
        add     v1.4S, v1.4S, v3.4S
        add     v2.4S, v2.4S, v7.4S

        add     v5.4S, v5.4S, v0.4S
        sub     v5.4S, v5.4S, v6.4S

        add     v1.4S, v1.4S, v4.4S
        sub     v1.4S, v1.4S, v2.4S

        rshrn   v5.4H, v5.4S, #10
        rshrn2  v5.8H, v1.4S, #10

        sqxtun  \r2\().8B, v5.8H
.endm
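
// lowpass_8.16 runs the second (vertical) filter pass at 32-bit precision
// on already-filtered 16-bit rows: the shl #4 / shl #2 sums rebuild the *20
// and *5 weights without multiplies, and the rounding shift by #10 divides
// out both passes (32 * 32) before narrowing back to bytes.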
function put_h264_qpel16_h_lowpass_neon_packed
        mov     x4,  x30
        mov     x12, #16
        mov     x3,  #8
        bl      put_h264_qpel8_h_lowpass_neon
        sub     x1,  x1,  x2, lsl #4
        add     x1,  x1,  #8
        mov     x12, #16
        mov     x30, x4
        b       put_h264_qpel8_h_lowpass_neon
endfunc

.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        mov     x13, x30
        mov     x12, #16
        bl      \type\()_h264_qpel8_h_lowpass_neon
        sub     x0,  x0,  x3, lsl #4
        sub     x1,  x1,  x2, lsl #4
        add     x0,  x0,  #8
        add     x1,  x1,  #8
        mov     x12, #16
        mov     x30, x13
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1     {v28.8B, v29.8B}, [x1], x2
        ld1     {v16.8B, v17.8B}, [x1], x2
        subs    x12, x12, #2
        lowpass_8 v28, v29, v16, v17, v28, v16
.ifc \type,avg
        ld1     {v2.8B}, [x0], x3
        urhadd  v28.8B, v28.8B, v2.8B
        ld1     {v3.8B}, [x0]
        urhadd  v16.8B, v16.8B, v3.8B
        sub     x0,  x0,  x3
.endif
        st1     {v28.8B}, [x0], x3
        st1     {v16.8B}, [x0], x3
        b.ne    1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov     x13, x30
        mov     x12, #16
        bl      \type\()_h264_qpel8_h_lowpass_l2_neon
        sub     x0,  x0,  x2, lsl #4
        sub     x1,  x1,  x2, lsl #4
        sub     x3,  x3,  x2, lsl #4
        add     x0,  x0,  #8
        add     x1,  x1,  #8
        add     x3,  x3,  #8
        mov     x12, #16
        mov     x30, x13
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1     {v26.8B, v27.8B}, [x1], x2
        ld1     {v16.8B, v17.8B}, [x1], x2
        ld1     {v28.8B}, [x3], x2
        ld1     {v29.8B}, [x3], x2
        subs    x12, x12, #2
        lowpass_8 v26, v27, v16, v17, v26, v27
        urhadd  v26.8B, v26.8B, v28.8B
        urhadd  v27.8B, v27.8B, v29.8B
.ifc \type,avg
        ld1     {v2.8B}, [x0], x2
        urhadd  v26.8B, v26.8B, v2.8B
        ld1     {v3.8B}, [x0]
        urhadd  v27.8B, v27.8B, v3.8B
        sub     x0,  x0,  x2
.endif
        st1     {v26.8B}, [x0], x2
        st1     {v27.8B}, [x0], x2
        b.ne    1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
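
// The _l2 variants blend the six-tap result with a second prediction read
// through a separate pointer (urhadd, i.e. rounded halving); this is how the
// quarter-pel positions between a half-pel and a neighbouring sample are
// formed.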
function put_h264_qpel16_v_lowpass_neon_packed
        mov     x4,  x30
        mov     x2,  #8
        bl      put_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        bl      put_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        bl      put_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x4
        b       put_h264_qpel8_v_lowpass_neon
endfunc

.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov     x4,  x30
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     x0,  x0,  x2, lsl #4
        add     x0,  x0,  #8
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        ld1     {v16.8B}, [x1], x3
        ld1     {v18.8B}, [x1], x3
        ld1     {v20.8B}, [x1], x3
        ld1     {v22.8B}, [x1], x3
        ld1     {v24.8B}, [x1], x3
        ld1     {v26.8B}, [x1], x3
        ld1     {v28.8B}, [x1], x3
        ld1     {v30.8B}, [x1], x3
        ld1     {v17.8B}, [x1], x3
        ld1     {v19.8B}, [x1], x3
        ld1     {v21.8B}, [x1], x3
        ld1     {v23.8B}, [x1], x3
        ld1     {v25.8B}, [x1]

        transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
        transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
        lowpass_8 v16, v17, v18, v19, v16, v17
        lowpass_8 v20, v21, v22, v23, v18, v19
        lowpass_8 v24, v25, v26, v27, v20, v21
        lowpass_8 v28, v29, v30, v31, v22, v23
        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1

.ifc \type,avg
        ld1     {v24.8B}, [x0], x2
        urhadd  v16.8B, v16.8B, v24.8B
        ld1     {v25.8B}, [x0], x2
        urhadd  v17.8B, v17.8B, v25.8B
        ld1     {v26.8B}, [x0], x2
        urhadd  v18.8B, v18.8B, v26.8B
        ld1     {v27.8B}, [x0], x2
        urhadd  v19.8B, v19.8B, v27.8B
        ld1     {v28.8B}, [x0], x2
        urhadd  v20.8B, v20.8B, v28.8B
        ld1     {v29.8B}, [x0], x2
        urhadd  v21.8B, v21.8B, v29.8B
        ld1     {v30.8B}, [x0], x2
        urhadd  v22.8B, v22.8B, v30.8B
        ld1     {v31.8B}, [x0], x2
        urhadd  v23.8B, v23.8B, v31.8B
        sub     x0,  x0,  x2, lsl #3
.endif

        st1     {v16.8B}, [x0], x2
        st1     {v17.8B}, [x0], x2
        st1     {v18.8B}, [x0], x2
        st1     {v19.8B}, [x0], x2
        st1     {v20.8B}, [x0], x2
        st1     {v21.8B}, [x0], x2
        st1     {v22.8B}, [x0], x2
        st1     {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov     x4,  x30
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     x0,  x0,  x3, lsl #4
        sub     x12, x12, x2, lsl #4
        add     x0,  x0,  #8
        add     x12, x12, #8
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1     {v16.8B}, [x1], x3
        ld1     {v18.8B}, [x1], x3
        ld1     {v20.8B}, [x1], x3
        ld1     {v22.8B}, [x1], x3
        ld1     {v24.8B}, [x1], x3
        ld1     {v26.8B}, [x1], x3
        ld1     {v28.8B}, [x1], x3
        ld1     {v30.8B}, [x1], x3
        ld1     {v17.8B}, [x1], x3
        ld1     {v19.8B}, [x1], x3
        ld1     {v21.8B}, [x1], x3
        ld1     {v23.8B}, [x1], x3
        ld1     {v25.8B}, [x1]

        transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
        transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
        lowpass_8 v16, v17, v18, v19, v16, v17
        lowpass_8 v20, v21, v22, v23, v18, v19
        lowpass_8 v24, v25, v26, v27, v20, v21
        lowpass_8 v28, v29, v30, v31, v22, v23
        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1

        ld1     {v24.8B}, [x12], x2
        ld1     {v25.8B}, [x12], x2
        ld1     {v26.8B}, [x12], x2
        ld1     {v27.8B}, [x12], x2
        ld1     {v28.8B}, [x12], x2
        urhadd  v16.8B, v24.8B, v16.8B
        urhadd  v17.8B, v25.8B, v17.8B
        ld1     {v29.8B}, [x12], x2
        urhadd  v18.8B, v26.8B, v18.8B
        urhadd  v19.8B, v27.8B, v19.8B
        ld1     {v30.8B}, [x12], x2
        urhadd  v20.8B, v28.8B, v20.8B
        urhadd  v21.8B, v29.8B, v21.8B
        ld1     {v31.8B}, [x12], x2
        urhadd  v22.8B, v30.8B, v22.8B
        urhadd  v23.8B, v31.8B, v23.8B

.ifc \type,avg
        ld1     {v24.8B}, [x0], x3
        urhadd  v16.8B, v16.8B, v24.8B
        ld1     {v25.8B}, [x0], x3
        urhadd  v17.8B, v17.8B, v25.8B
        ld1     {v26.8B}, [x0], x3
        urhadd  v18.8B, v18.8B, v26.8B
        ld1     {v27.8B}, [x0], x3
        urhadd  v19.8B, v19.8B, v27.8B
        ld1     {v28.8B}, [x0], x3
        urhadd  v20.8B, v20.8B, v28.8B
        ld1     {v29.8B}, [x0], x3
        urhadd  v21.8B, v21.8B, v29.8B
        ld1     {v30.8B}, [x0], x3
        urhadd  v22.8B, v22.8B, v30.8B
        ld1     {v31.8B}, [x0], x3
        urhadd  v23.8B, v23.8B, v31.8B
        sub     x0,  x0,  x3, lsl #3
.endif

        st1     {v16.8B}, [x0], x3
        st1     {v17.8B}, [x0], x3
        st1     {v18.8B}, [x0], x3
        st1     {v19.8B}, [x0], x3
        st1     {v20.8B}, [x0], x3
        st1     {v21.8B}, [x0], x3
        st1     {v22.8B}, [x0], x3
        st1     {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
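
// The vertical filters above reuse the horizontal lowpass_8 macro: each 8x8
// tile is transposed with transpose_8x8B, filtered along what are then rows,
// and transposed back before the stores.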
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const w12
        ld1     {v16.8H}, [x1], x3
        ld1     {v17.8H}, [x1], x3
        ld1     {v18.8H}, [x1], x3
        ld1     {v19.8H}, [x1], x3
        ld1     {v20.8H}, [x1], x3
        ld1     {v21.8H}, [x1], x3
        ld1     {v22.8H}, [x1], x3
        ld1     {v23.8H}, [x1], x3
        ld1     {v24.8H}, [x1], x3
        ld1     {v25.8H}, [x1], x3
        ld1     {v26.8H}, [x1], x3
        ld1     {v27.8H}, [x1], x3
        ld1     {v28.8H}, [x1]
        lowpass_8H v16, v17
        lowpass_8H v18, v19
        lowpass_8H v20, v21
        lowpass_8H v22, v23
        lowpass_8H v24, v25
        lowpass_8H v26, v27
        lowpass_8H v28, v29

        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        lowpass_8.16 v16, v24, v16
        lowpass_8.16 v17, v25, v17

        lowpass_8.16 v18, v26, v18
        lowpass_8.16 v19, v27, v19

        lowpass_8.16 v20, v28, v20
        lowpass_8.16 v21, v29, v21

        lowpass_8.16 v22, v30, v22
        lowpass_8.16 v23, v31, v23

        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1

        ret
endfunc

.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov     x10, x30
        bl      put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        ld1     {v0.8B}, [x0], x2
        urhadd  v16.8B, v16.8B, v0.8B
        ld1     {v1.8B}, [x0], x2
        urhadd  v17.8B, v17.8B, v1.8B
        ld1     {v2.8B}, [x0], x2
        urhadd  v18.8B, v18.8B, v2.8B
        ld1     {v3.8B}, [x0], x2
        urhadd  v19.8B, v19.8B, v3.8B
        ld1     {v4.8B}, [x0], x2
        urhadd  v20.8B, v20.8B, v4.8B
        ld1     {v5.8B}, [x0], x2
        urhadd  v21.8B, v21.8B, v5.8B
        ld1     {v6.8B}, [x0], x2
        urhadd  v22.8B, v22.8B, v6.8B
        ld1     {v7.8B}, [x0], x2
        urhadd  v23.8B, v23.8B, v7.8B
        sub     x0,  x0,  x2, lsl #3
.endif

        st1     {v16.8B}, [x0], x2
        st1     {v17.8B}, [x0], x2
        st1     {v18.8B}, [x0], x2
        st1     {v19.8B}, [x0], x2
        st1     {v20.8B}, [x0], x2
        st1     {v21.8B}, [x0], x2
        st1     {v22.8B}, [x0], x2
        st1     {v23.8B}, [x0], x2

        ret     x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov     x10, x30
        bl      put_h264_qpel8_hv_lowpass_neon_top

        ld1     {v0.8B, v1.8B}, [x2], #16
        ld1     {v2.8B, v3.8B}, [x2], #16
        urhadd  v0.8B, v0.8B, v16.8B
        urhadd  v1.8B, v1.8B, v17.8B
        ld1     {v4.8B, v5.8B}, [x2], #16
        urhadd  v2.8B, v2.8B, v18.8B
        urhadd  v3.8B, v3.8B, v19.8B
        ld1     {v6.8B, v7.8B}, [x2], #16
        urhadd  v4.8B, v4.8B, v20.8B
        urhadd  v5.8B, v5.8B, v21.8B
        urhadd  v6.8B, v6.8B, v22.8B
        urhadd  v7.8B, v7.8B, v23.8B
.ifc \type,avg
        ld1     {v16.8B}, [x0], x3
        urhadd  v0.8B, v0.8B, v16.8B
        ld1     {v17.8B}, [x0], x3
        urhadd  v1.8B, v1.8B, v17.8B
        ld1     {v18.8B}, [x0], x3
        urhadd  v2.8B, v2.8B, v18.8B
        ld1     {v19.8B}, [x0], x3
        urhadd  v3.8B, v3.8B, v19.8B
        ld1     {v20.8B}, [x0], x3
        urhadd  v4.8B, v4.8B, v20.8B
        ld1     {v21.8B}, [x0], x3
        urhadd  v5.8B, v5.8B, v21.8B
        ld1     {v22.8B}, [x0], x3
        urhadd  v6.8B, v6.8B, v22.8B
        ld1     {v23.8B}, [x0], x3
        urhadd  v7.8B, v7.8B, v23.8B
        sub     x0,  x0,  x3, lsl #3
.endif
        st1     {v0.8B}, [x0], x3
        st1     {v1.8B}, [x0], x3
        st1     {v2.8B}, [x0], x3
        st1     {v3.8B}, [x0], x3
        st1     {v4.8B}, [x0], x3
        st1     {v5.8B}, [x0], x3
        st1     {v6.8B}, [x0], x3
        st1     {v7.8B}, [x0], x3

        ret     x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

.macro  h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov     x13, x30
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        sub     x0,  x0,  x2, lsl #4
        add     x0,  x0,  #8
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x13
        b       \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov     x13, x30
        sub     x2,  x4,  #256
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #4
        sub     x1,  x1,  x3, lsl #2
        add     x1,  x1,  #8
        sub     x0,  x0,  x3, lsl #4
        add     x0,  x0,  #8
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     x1,  x1,  x3, lsl #2
        mov     x30, x13
        b       \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
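
// Each ff_{put,avg}_h264_qpel{8,16}_mcXY entry point below maps one
// quarter-pel phase to a combination of the h/v/hv lowpass helpers, carving
// scratch buffers from the stack for the two-pass cases.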
.macro  h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const w3
        mov     x3,  x1
        sub     x1,  x1,  #2
        mov     x12, #8
        b       \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const w3
        sub     x1,  x1,  #2
        mov     x3,  x2
        mov     x12, #8
        b       \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const w3
        add     x3,  x1,  #1
        sub     x1,  x1,  #2
        mov     x12, #8
        b       \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov     x14, x30
        mov     x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const w3
        mov     x3,  x2
        sub     x1,  x1,  x2, lsl #1
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #64
        mov     x0,  sp
        sub     x1,  x1,  #2
        mov     x3,  #8
        mov     x12, #8
        bl      put_h264_qpel8_h_lowpass_neon
        mov     x0,  x8
        mov     x3,  x2
        mov     x12, sp
        sub     x1,  x9,  x2, lsl #1
        mov     x2,  #8
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(8*8+16*12)
        sub     x1,  x1,  #2
        mov     x3,  #8
        mov     x0,  sp
        mov     x12, #8
        bl      put_h264_qpel8_h_lowpass_neon
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        sub     x2,  x4,  #64
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov     x14, x30
        lowpass_const w3
        sub     x1,  x1,  x2, lsl #1
        mov     x3,  x2
        bl      \type\()_h264_qpel8_v_lowpass_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(8*8+16*12)
        sub     x1,  x1,  x2, lsl #1
        mov     x3,  x2
        mov     x2,  #8
        mov     x0,  sp
        bl      put_h264_qpel8_v_lowpass_neon
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x3, lsl #1
        sub     x1,  x1,  #2
        sub     x2,  x4,  #64
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov     x14, x30
        mov     x11, sp
        sub     x1,  x1,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  #1
        b       \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov     x14, x30
        add     x12, x1,  x2
        b       \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

.macro  h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const w3
        mov     x3,  x1
        sub     x1,  x1,  #2
        b       \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const w3
        sub     x1,  x1,  #2
        mov     x3,  x2
        b       \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const w3
        add     x3,  x1,  #1
        sub     x1,  x1,  #2
        b       \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov     x14, x30
        mov     x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const w3
        mov     x3,  x2
        sub     x1,  x1,  x2, lsl #1
        bl      \type\()_h264_qpel16_v_lowpass_l2_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #256
        mov     x0,  sp
        sub     x1,  x1,  #2
        mov     x3,  #16
        bl      put_h264_qpel16_h_lowpass_neon
        mov     x0,  x8
        mov     x3,  x2
        mov     x12, sp
        sub     x1,  x9,  x2, lsl #1
        mov     x2,  #16
        bl      \type\()_h264_qpel16_v_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(16*16+16*12)
        sub     x1,  x1,  #2
        mov     x0,  sp
        bl      put_h264_qpel16_h_lowpass_neon_packed
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        bl      \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov     x14, x30
        lowpass_const w3
        sub     x1,  x1,  x2, lsl #1
        mov     x3,  x2
        bl      \type\()_h264_qpel16_v_lowpass_neon
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const w3
        mov     x11, sp
        sub     sp,  sp,  #(16*16+16*12)
        sub     x1,  x1,  x2, lsl #1
        mov     x0,  sp
        mov     x3,  x2
        bl      put_h264_qpel16_v_lowpass_neon_packed
        mov     x4,  x0
        mov     x0,  x8
        sub     x1,  x9,  x3, lsl #1
        sub     x1,  x1,  #2
        mov     x2,  x3
        bl      \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov     sp,  x11
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov     x14, x30
        lowpass_const w3
        mov     x11, sp
        sub     x1,  x1,  x2, lsl #1
        sub     x1,  x1,  #2
        mov     x3,  x2
        bl      \type\()_h264_qpel16_hv_lowpass_neon
        mov     sp,  x11                // restore stack
        ret     x14
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  #1
        b       \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov     x14, x30
        add     x12, x1,  x2
        b       \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        b       \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add     x1,  x1,  #1
        mov     x14, x30
        mov     x8,  x0
        mov     x9,  x1
        add     x1,  x1,  x2
        sub     x1,  x1,  #1
        b       \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg

123
externals/ffmpeg/libavcodec/aarch64/hpeldsp_init_aarch64.c
vendored
Executable file
123
externals/ffmpeg/libavcodec/aarch64/hpeldsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,123 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/hpeldsp.h"

void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

void ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);

void ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
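
/*
 * put/avg_pixels_tab[i][j]: i selects the block width (0 = 16, 1 = 8) and
 * j the half-pel case (0 = aligned copy, 1 = horizontal, 2 = vertical,
 * 3 = both), mirroring the _x2/_y2/_xy2 suffixes above.
 */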
av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;

        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;

        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
    }
}

397
externals/ffmpeg/libavcodec/aarch64/hpeldsp_neon.S
vendored
Executable file
397
externals/ffmpeg/libavcodec/aarch64/hpeldsp_neon.S
vendored
Executable file
@@ -0,0 +1,397 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
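
// All helpers below follow the hpeldsp C prototype: x0 = block (dst),
// x1 = pixels (src), x2 = line_size, and w3 = h, used as the loop counter.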
.macro  pixels16 rnd=1, avg=0
.if \avg
        mov     x12, x0
.endif
1:      ld1     {v0.16B},  [x1], x2
        ld1     {v1.16B},  [x1], x2
        ld1     {v2.16B},  [x1], x2
        ld1     {v3.16B},  [x1], x2
.if \avg
        ld1     {v4.16B},  [x12], x2
        urhadd  v0.16B,  v0.16B,  v4.16B
        ld1     {v5.16B},  [x12], x2
        urhadd  v1.16B,  v1.16B,  v5.16B
        ld1     {v6.16B},  [x12], x2
        urhadd  v2.16B,  v2.16B,  v6.16B
        ld1     {v7.16B},  [x12], x2
        urhadd  v3.16B,  v3.16B,  v7.16B
.endif
        subs    w3,  w3,  #4
        st1     {v0.16B},  [x0], x2
        st1     {v1.16B},  [x0], x2
        st1     {v2.16B},  [x0], x2
        st1     {v3.16B},  [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels16_x2 rnd=1, avg=0
1:      ld1     {v0.16B, v1.16B}, [x1], x2
        ld1     {v2.16B, v3.16B}, [x1], x2
        subs    w3,  w3,  #2
        ext     v1.16B,  v0.16B,  v1.16B,  #1
        avg     v0.16B,  v0.16B,  v1.16B
        ext     v3.16B,  v2.16B,  v3.16B,  #1
        avg     v2.16B,  v2.16B,  v3.16B
.if \avg
        ld1     {v1.16B}, [x0], x2
        ld1     {v3.16B}, [x0]
        urhadd  v0.16B,  v0.16B,  v1.16B
        urhadd  v2.16B,  v2.16B,  v3.16B
        sub     x0,  x0,  x2
.endif
        st1     {v0.16B}, [x0], x2
        st1     {v2.16B}, [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels16_y2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.16B}, [x1], x2
        ld1     {v1.16B}, [x1], x2
1:      subs    w3,  w3,  #2
        avg     v2.16B,  v0.16B,  v1.16B
        ld1     {v0.16B}, [x1], x2
        avg     v3.16B,  v0.16B,  v1.16B
        ld1     {v1.16B}, [x1], x2
.if \avg
        ld1     {v4.16B}, [x0], x2
        ld1     {v5.16B}, [x0]
        urhadd  v2.16B,  v2.16B,  v4.16B
        urhadd  v3.16B,  v3.16B,  v5.16B
        sub     x0,  x0,  x2
.endif
        st1     {v2.16B}, [x0], x2
        st1     {v3.16B}, [x0], x2
        b.ne    1b

        avg     v2.16B,  v0.16B,  v1.16B
        ld1     {v0.16B}, [x1], x2
        avg     v3.16B,  v0.16B,  v1.16B
.if \avg
        ld1     {v4.16B}, [x0], x2
        ld1     {v5.16B}, [x0]
        urhadd  v2.16B,  v2.16B,  v4.16B
        urhadd  v3.16B,  v3.16B,  v5.16B
        sub     x0,  x0,  x2
.endif
        st1     {v2.16B}, [x0], x2
        st1     {v3.16B}, [x0], x2

        ret
.endm

.macro  pixels16_xy2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.16B, v1.16B}, [x1], x2
        ld1     {v4.16B, v5.16B}, [x1], x2
NRND    movi    v26.8H,  #1
        ext     v1.16B,  v0.16B,  v1.16B,  #1
        ext     v5.16B,  v4.16B,  v5.16B,  #1
        uaddl   v16.8H,  v0.8B,   v1.8B
        uaddl2  v20.8H,  v0.16B,  v1.16B
        uaddl   v18.8H,  v4.8B,   v5.8B
        uaddl2  v22.8H,  v4.16B,  v5.16B
1:      subs    w3,  w3,  #2
        ld1     {v0.16B, v1.16B}, [x1], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        ext     v30.16B, v0.16B,  v1.16B,  #1
        add     v1.8H,   v20.8H,  v22.8H
        mshrn   v28.8B,  v24.8H,  #2
NRND    add     v1.8H,   v1.8H,   v26.8H
        mshrn2  v28.16B, v1.8H,   #2
.if \avg
        ld1     {v16.16B}, [x0]
        urhadd  v28.16B, v28.16B, v16.16B
.endif
        uaddl   v16.8H,  v0.8B,   v30.8B
        ld1     {v2.16B, v3.16B}, [x1], x2
        uaddl2  v20.8H,  v0.16B,  v30.16B
        st1     {v28.16B}, [x0], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        ext     v3.16B,  v2.16B,  v3.16B,  #1
        add     v0.8H,   v20.8H,  v22.8H
        mshrn   v30.8B,  v24.8H,  #2
NRND    add     v0.8H,   v0.8H,   v26.8H
        mshrn2  v30.16B, v0.8H,   #2
.if \avg
        ld1     {v18.16B}, [x0]
        urhadd  v30.16B, v30.16B, v18.16B
.endif
        uaddl   v18.8H,  v2.8B,   v3.8B
        uaddl2  v22.8H,  v2.16B,  v3.16B
        st1     {v30.16B}, [x0], x2
        b.gt    1b

        ld1     {v0.16B, v1.16B}, [x1], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        ext     v30.16B, v0.16B,  v1.16B,  #1
        add     v1.8H,   v20.8H,  v22.8H
        mshrn   v28.8B,  v24.8H,  #2
NRND    add     v1.8H,   v1.8H,   v26.8H
        mshrn2  v28.16B, v1.8H,   #2
.if \avg
        ld1     {v16.16B}, [x0]
        urhadd  v28.16B, v28.16B, v16.16B
.endif
        uaddl   v16.8H,  v0.8B,   v30.8B
        uaddl2  v20.8H,  v0.16B,  v30.16B
        st1     {v28.16B}, [x0], x2
        add     v24.8H,  v16.8H,  v18.8H
NRND    add     v24.8H,  v24.8H,  v26.8H
        add     v0.8H,   v20.8H,  v22.8H
        mshrn   v30.8B,  v24.8H,  #2
NRND    add     v0.8H,   v0.8H,   v26.8H
        mshrn2  v30.16B, v0.8H,   #2
.if \avg
        ld1     {v18.16B}, [x0]
        urhadd  v30.16B, v30.16B, v18.16B
.endif
        st1     {v30.16B}, [x0], x2

        ret
.endm
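
// In the _xy2 kernels the four-neighbour sum is kept in 16-bit lanes; the
// rnd=1 variant rounds in the final rshrn (via mshrn), while the rnd=0
// variant instead adds the NRND-guarded bias of 1 and narrows with a plain
// shrn.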
.macro  pixels8 rnd=1, avg=0
1:      ld1     {v0.8B}, [x1], x2
        ld1     {v1.8B}, [x1], x2
        ld1     {v2.8B}, [x1], x2
        ld1     {v3.8B}, [x1], x2
.if \avg
        ld1     {v4.8B}, [x0], x2
        urhadd  v0.8B,  v0.8B,  v4.8B
        ld1     {v5.8B}, [x0], x2
        urhadd  v1.8B,  v1.8B,  v5.8B
        ld1     {v6.8B}, [x0], x2
        urhadd  v2.8B,  v2.8B,  v6.8B
        ld1     {v7.8B}, [x0], x2
        urhadd  v3.8B,  v3.8B,  v7.8B
        sub     x0,  x0,  x2,  lsl #2
.endif
        subs    w3,  w3,  #4
        st1     {v0.8B}, [x0], x2
        st1     {v1.8B}, [x0], x2
        st1     {v2.8B}, [x0], x2
        st1     {v3.8B}, [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels8_x2 rnd=1, avg=0
1:      ld1     {v0.8B, v1.8B}, [x1], x2
        ext     v1.8B,  v0.8B,  v1.8B,  #1
        ld1     {v2.8B, v3.8B}, [x1], x2
        ext     v3.8B,  v2.8B,  v3.8B,  #1
        subs    w3,  w3,  #2
        avg     v0.8B,  v0.8B,  v1.8B
        avg     v2.8B,  v2.8B,  v3.8B
.if \avg
        ld1     {v4.8B}, [x0], x2
        ld1     {v5.8B}, [x0]
        urhadd  v0.8B,  v0.8B,  v4.8B
        urhadd  v2.8B,  v2.8B,  v5.8B
        sub     x0,  x0,  x2
.endif
        st1     {v0.8B}, [x0], x2
        st1     {v2.8B}, [x0], x2
        b.ne    1b
        ret
.endm

.macro  pixels8_y2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.8B}, [x1], x2
        ld1     {v1.8B}, [x1], x2
1:      subs    w3,  w3,  #2
        avg     v4.8B,  v0.8B,  v1.8B
        ld1     {v0.8B}, [x1], x2
        avg     v5.8B,  v0.8B,  v1.8B
        ld1     {v1.8B}, [x1], x2
.if \avg
        ld1     {v2.8B}, [x0], x2
        ld1     {v3.8B}, [x0]
        urhadd  v4.8B,  v4.8B,  v2.8B
        urhadd  v5.8B,  v5.8B,  v3.8B
        sub     x0,  x0,  x2
.endif
        st1     {v4.8B}, [x0], x2
        st1     {v5.8B}, [x0], x2
        b.ne    1b

        avg     v4.8B,  v0.8B,  v1.8B
        ld1     {v0.8B}, [x1], x2
        avg     v5.8B,  v0.8B,  v1.8B
.if \avg
        ld1     {v2.8B}, [x0], x2
        ld1     {v3.8B}, [x0]
        urhadd  v4.8B,  v4.8B,  v2.8B
        urhadd  v5.8B,  v5.8B,  v3.8B
        sub     x0,  x0,  x2
.endif
        st1     {v4.8B}, [x0], x2
        st1     {v5.8B}, [x0], x2

        ret
.endm

.macro  pixels8_xy2 rnd=1, avg=0
        sub     w3,  w3,  #2
        ld1     {v0.16B}, [x1], x2
        ld1     {v1.16B}, [x1], x2
NRND    movi    v19.8H, #1
        ext     v4.16B, v0.16B, v4.16B, #1
        ext     v6.16B, v1.16B, v6.16B, #1
        uaddl   v16.8H, v0.8B,  v4.8B
        uaddl   v17.8H, v1.8B,  v6.8B
1:      subs    w3,  w3,  #2
        ld1     {v0.16B}, [x1], x2
        add     v18.8H, v16.8H, v17.8H
        ext     v4.16B, v0.16B, v4.16B, #1
NRND    add     v18.8H, v18.8H, v19.8H
        uaddl   v16.8H, v0.8B,  v4.8B
        mshrn   v5.8B,  v18.8H, #2
        ld1     {v1.16B}, [x1], x2
        add     v18.8H, v16.8H, v17.8H
.if \avg
        ld1     {v7.8B}, [x0]
        urhadd  v5.8B,  v5.8B,  v7.8B
.endif
NRND    add     v18.8H, v18.8H, v19.8H
        st1     {v5.8B}, [x0], x2
        mshrn   v7.8B,  v18.8H, #2
.if \avg
        ld1     {v5.8B}, [x0]
        urhadd  v7.8B,  v7.8B,  v5.8B
.endif
        ext     v6.16B, v1.16B, v6.16B, #1
        uaddl   v17.8H, v1.8B,  v6.8B
        st1     {v7.8B}, [x0], x2
        b.gt    1b

        ld1     {v0.16B}, [x1], x2
        add     v18.8H, v16.8H, v17.8H
        ext     v4.16B, v0.16B, v4.16B, #1
NRND    add     v18.8H, v18.8H, v19.8H
        uaddl   v16.8H, v0.8B,  v4.8B
        mshrn   v5.8B,  v18.8H, #2
        add     v18.8H, v16.8H, v17.8H
.if \avg
        ld1     {v7.8B}, [x0]
        urhadd  v5.8B,  v5.8B,  v7.8B
.endif
NRND    add     v18.8H, v18.8H, v19.8H
        st1     {v5.8B}, [x0], x2
        mshrn   v7.8B,  v18.8H, #2
.if \avg
        ld1     {v5.8B}, [x0]
        urhadd  v7.8B,  v7.8B,  v5.8B
.endif
        st1     {v7.8B}, [x0], x2

        ret
.endm

.macro  pixfunc pfx, name, suf, rnd=1, avg=0
.if \rnd
.macro avg rd, rn, rm
        urhadd  \rd, \rn, \rm
.endm
.macro mshrn rd, rn, rm
        rshrn   \rd, \rn, \rm
.endm
.macro mshrn2 rd, rn, rm
        rshrn2  \rd, \rn, \rm
.endm
.macro NRND insn:vararg
.endm
.else
.macro avg rd, rn, rm
        uhadd   \rd, \rn, \rm
.endm
.macro mshrn rd, rn, rm
        shrn    \rd, \rn, \rm
.endm
.macro mshrn2 rd, rn, rm
        shrn2   \rd, \rn, \rm
.endm
.macro NRND insn:vararg
        \insn
.endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
        \name   \rnd, \avg
endfunc
.purgem avg
.purgem mshrn
.purgem mshrn2
.purgem NRND
.endm

.macro  pixfunc2 pfx, name, avg=0
        pixfunc \pfx, \name, rnd=1, avg=\avg
        pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
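
// pixfunc emits one exported function per variant by temporarily defining
// avg/mshrn/mshrn2/NRND for the requested rounding mode and purging them
// afterwards; the qpel16/8 mc00 stubs below only set w3 (the height) and
// fall through into the pixels function instantiated right after them.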
function ff_put_h264_qpel16_mc00_neon, export=1
        mov     w3,  #16
endfunc

        pixfunc  put_, pixels16,     avg=0
        pixfunc2 put_, pixels16_x2,  avg=0
        pixfunc2 put_, pixels16_y2,  avg=0
        pixfunc2 put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov     w3,  #16
endfunc

        pixfunc  avg_, pixels16,     avg=1
        pixfunc2 avg_, pixels16_x2,  avg=1
        pixfunc2 avg_, pixels16_y2,  avg=1
        pixfunc2 avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov     w3,  #8
endfunc

        pixfunc  put_, pixels8,     avg=0
        pixfunc2 put_, pixels8_x2,  avg=0
        pixfunc2 put_, pixels8_y2,  avg=0
        pixfunc2 put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov     w3,  #8
endfunc

        pixfunc avg_, pixels8,     avg=1
        pixfunc avg_, pixels8_x2,  avg=1
        pixfunc avg_, pixels8_y2,  avg=1
        pixfunc avg_, pixels8_xy2, avg=1

28
externals/ffmpeg/libavcodec/aarch64/idct.h
vendored
Executable file
28
externals/ffmpeg/libavcodec/aarch64/idct.h
vendored
Executable file
@@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_IDCT_H
#define AVCODEC_AARCH64_IDCT_H

#include <stdint.h>

void ff_simple_idct_neon(int16_t *data);
void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);

#endif /* AVCODEC_AARCH64_IDCT_H */

45
externals/ffmpeg/libavcodec/aarch64/idctdsp_init_aarch64.c
vendored
Executable file
45
externals/ffmpeg/libavcodec/aarch64/idctdsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,45 @@
/*
 * ARM-NEON-optimized IDCT functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"
av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
|
||||
if (avctx->idct_algo == FF_IDCT_AUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLENEON) {
|
||||
c->idct_put = ff_simple_idct_put_neon;
|
||||
c->idct_add = ff_simple_idct_add_neon;
|
||||
c->idct = ff_simple_idct_neon;
|
||||
c->perm_type = FF_IDCT_PERM_PARTTRANS;
|
||||
}
|
||||
}
|
||||
}
|
||||
323
externals/ffmpeg/libavcodec/aarch64/mdct_neon.S
vendored
Executable file
@@ -0,0 +1,323 @@
/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

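// ff_imdct_half_neon works in three stages: pre-rotate the input by the
// twiddles in tcos while scattering the results through revtab, run the
// complex FFT via ff_fft_calc_neon, then post-rotate the FFT output
// in place, walking the buffer from both ends at once.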
function ff_imdct_half_neon, export=1
        sub             sp, sp, #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4, [x0, #32]           // tcos
        ldr             x3, [x0, #8]            // revtab
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #2            // n4 = n >> 2
        add             x7, x2, x12, lsl #1
        mov             x12, #-16
        sub             x7, x7, #16

        ld2             {v16.2s,v17.2s}, [x7], x12  // d16=x,n1 d17=x,n0
        ld2             {v0.2s,v1.2s}, [x2], #16    // d0 =m0,x d1 =m1,x
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s}, [x4], #16    // d2=c0,c1 d3=s0,s2
        fmul            v6.2s, v17.2s, v2.2s
        fmul            v7.2s, v0.2s, v2.2s
1:
        subs            x14, x14, #2
        ldr             w6, [x3], #4
        fmul            v4.2s, v0.2s, v3.2s
        fmul            v5.2s, v17.2s, v3.2s
        fsub            v4.2s, v6.2s, v4.2s
        fadd            v5.2s, v5.2s, v7.2s
        ubfm            x8, x6, #16, #31
        ubfm            x6, x6, #0, #15
        add             x8, x1, x8, lsl #3
        add             x6, x1, x6, lsl #3
        b.eq            2f
        ld2             {v16.2s,v17.2s}, [x7], x12
        ld2             {v0.2s,v1.2s}, [x2], #16
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s}, [x4], #16    // d2=c0,c1 d3=s0,s2
        fmul            v6.2s, v17.2s, v2.2s
        fmul            v7.2s, v0.2s, v2.2s
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]
        b               1b
2:
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]

        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4, [x19, #32]          // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3

        add             x4, x4, x14, lsl #3
        add             x6, x20, x14, lsl #3
        sub             x1, x4, #16
        sub             x3, x6, #16

        mov             x7, #-16
        mov             x8, x6
        mov             x0, x3

        ld2             {v0.2s,v1.2s}, [x3], x7     // d0 =i1,r1 d1 =i0,r0
        ld2             {v20.2s,v21.2s}, [x6], #16  // d20=i2,r2 d21=i3,r3
        ld2             {v16.2s,v17.2s}, [x1], x7   // d16=c1,c0 d18=s1,s0
3:
        subs            x14, x14, #2
        fmul            v7.2s, v0.2s, v17.2s
        ld2             {v18.2s,v19.2s}, [x4], #16  // d17=c2,c3 d19=s2,s3
        fmul            v4.2s, v1.2s, v17.2s
        fmul            v6.2s, v21.2s, v19.2s
        fmul            v5.2s, v20.2s, v19.2s
        fmul            v22.2s, v1.2s, v16.2s
        fmul            v23.2s, v21.2s, v18.2s
        fmul            v24.2s, v0.2s, v16.2s
        fmul            v25.2s, v20.2s, v18.2s
        fadd            v7.2s, v7.2s, v22.2s
        fadd            v5.2s, v5.2s, v23.2s
        fsub            v4.2s, v4.2s, v24.2s
        fsub            v6.2s, v6.2s, v25.2s
        b.eq            4f
        ld2             {v0.2s,v1.2s}, [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7   // d16=c1,c0 d18=s1,s0
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0], x7
        st2             {v6.2s,v7.2s}, [x8], #16
        b               3b
4:
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0]
        st2             {v6.2s,v7.2s}, [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        add             sp, sp, #32

        ret
endfunc

function ff_imdct_calc_neon, export=1
        sub             sp, sp, #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        ldr             w3, [x0, #28]           // mdct_bits
        mov             x19, #1
        mov             x20, x1
        lsl             x19, x19, x3
        add             x1, x1, x19

        bl              X(ff_imdct_half_neon)

        add             x0, x20, x19, lsl #2
        add             x1, x20, x19, lsl #1
        sub             x0, x0, #8
        sub             x2, x1, #16
        mov             x3, #-16
        mov             x6, #-8
1:
        ld1             {v0.4s}, [x2], x3
        prfum           pldl1keep, [x0, #-16]
        rev64           v0.4s, v0.4s
        ld1             {v2.2s,v3.2s}, [x1], #16
        fneg            v4.4s, v0.4s
        prfum           pldl1keep, [x2, #-16]
        rev64           v2.2s, v2.2s
        rev64           v3.2s, v3.2s
        ext             v4.16b, v4.16b, v4.16b, #8
        st1             {v2.2s}, [x0], x6
        st1             {v3.2s}, [x0], x6
        st1             {v4.4s}, [x20], #16
        subs            x19, x19, #16
        b.gt            1b

        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16

        ret
endfunc


function ff_mdct_calc_neon, export=1
        sub             sp, sp, #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]

        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4, [x0, #32]           // tcos
        ldr             x3, [x0, #8]            // revtab
        lsl             x14, x12, x14           // n = 1 << nbits
        add             x7, x2, x14             // in4u
        sub             x9, x7, #16             // in4d
        add             x2, x7, x14, lsl #1     // in3u
        add             x8, x9, x14, lsl #1     // in3d
        add             x5, x4, x14, lsl #1
        sub             x5, x5, #16
        sub             x3, x3, #4
        mov             x12, #-16
        lsr             x13, x14, #1

        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s, v17.2s, v0.2s        // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s, v1.2s                // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s, v3.2s                // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s, v2.2s, v19.2s        // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
1:
        fmul            v7.2s, v0.2s, v21.2s        //  I*s
        ldr             w10, [x3, x13]
        fmul            v6.2s, v2.2s, v20.2s        // -R*c
        ldr             w6, [x3, #4]!
        fmul            v4.2s, v2.2s, v21.2s        // -R*s
        fmul            v5.2s, v0.2s, v20.2s        //  I*c
        fmul            v24.2s, v16.2s, v30.2s      //  R*c
        fmul            v25.2s, v18.2s, v31.2s      // -I*s
        fmul            v22.2s, v16.2s, v31.2s      //  R*s
        fmul            v23.2s, v18.2s, v30.2s      //  I*c
        subs            x14, x14, #16
        subs            x13, x13, #8
        fsub            v6.2s, v6.2s, v7.2s         // -R*c-I*s
        fadd            v7.2s, v4.2s, v5.2s         // -R*s+I*c
        fsub            v24.2s, v25.2s, v24.2s      //  I*s-R*c
        fadd            v25.2s, v22.2s, v23.2s      //  R*s-I*c
        b.eq            1f
        mov             x12, #-16
        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        fneg            v7.2s, v7.2s                //  R*s-I*c
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s, v17.2s, v0.2s        // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s, v1.2s                // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s, v3.2s                // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s, v2.2s, v19.2s        // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
        ubfm            x12, x6, #16, #31
        ubfm            x6, x6, #0, #15
        add             x12, x1, x12, lsl #3
        add             x6, x1, x6, lsl #3
        st2             {v6.s,v7.s}[0], [x6]
        st2             {v6.s,v7.s}[1], [x12]
        ubfm            x6, x10, #16, #31
        ubfm            x10, x10, #0, #15
        add             x6, x1, x6, lsl #3
        add             x10, x1, x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]
        b               1b
1:
        fneg            v7.2s, v7.2s                //  R*s-I*c
        ubfm            x12, x6, #16, #31
        ubfm            x6, x6, #0, #15
        add             x12, x1, x12, lsl #3
        add             x6, x1, x6, lsl #3
        st2             {v6.s,v7.s}[0], [x6]
        st2             {v6.s,v7.s}[1], [x12]
        ubfm            x6, x10, #16, #31
        ubfm            x10, x10, #0, #15
        add             x6, x1, x6, lsl #3
        add             x10, x1, x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]

        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4, [x19, #32]          // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3

        add             x4, x4, x14, lsl #3
        add             x6, x20, x14, lsl #3
        sub             x1, x4, #16
        sub             x3, x6, #16

        mov             x7, #-16
        mov             x8, x6
        mov             x0, x3

        ld2             {v0.2s,v1.2s}, [x3], x7     // d0 =r1,i1 d1 =r0,i0
        ld2             {v20.2s,v21.2s}, [x6], #16  // d20=r2,i2 d21=r3,i3
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
1:
        subs            x14, x14, #2
        fmul            v7.2s, v0.2s, v17.2s        // r1*s1,r0*s0
        ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
        fmul            v4.2s, v1.2s, v17.2s        // i1*s1,i0*s0
        fmul            v6.2s, v21.2s, v19.2s       // i2*s2,i3*s3
        fmul            v5.2s, v20.2s, v19.2s       // r2*s2,r3*s3
        fmul            v24.2s, v0.2s, v16.2s       // r1*c1,r0*c0
        fmul            v25.2s, v20.2s, v18.2s      // r2*c2,r3*c3
        fmul            v22.2s, v21.2s, v18.2s      // i2*c2,i3*c3
        fmul            v23.2s, v1.2s, v16.2s       // i1*c1,i0*c0
        fadd            v4.2s, v4.2s, v24.2s        // i1*s1+r1*c1,i0*s0+r0*c0
        fadd            v6.2s, v6.2s, v25.2s        // i2*s2+r2*c2,i3*s3+r3*c3
        fsub            v5.2s, v22.2s, v5.2s        // i2*c2-r2*s2,i3*c3-r3*s3
        fsub            v7.2s, v23.2s, v7.2s        // i1*c1-r1*s1,i0*c0-r0*s0
        fneg            v4.2s, v4.2s
        fneg            v6.2s, v6.2s
        b.eq            1f
        ld2             {v0.2s, v1.2s}, [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0], x7
        st2             {v6.2s,v7.2s}, [x8], #16
        b               1b
1:
        rev64           v5.2s, v5.2s
        rev64           v7.2s, v7.2s
        st2             {v4.2s,v5.2s}, [x0]
        st2             {v6.2s,v7.2s}, [x8]

        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16
        ret
endfunc
40
externals/ffmpeg/libavcodec/aarch64/mpegaudiodsp_init.c
vendored
Executable file
@@ -0,0 +1,40 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
#include "config.h"

void ff_mpadsp_apply_window_fixed_neon(int32_t *synth_buf, int32_t *window,
                                       int *dither, int16_t *samples, ptrdiff_t incr);
void ff_mpadsp_apply_window_float_neon(float *synth_buf, float *window,
                                       int *dither, float *samples, ptrdiff_t incr);

av_cold void ff_mpadsp_init_aarch64(MPADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->apply_window_fixed = ff_mpadsp_apply_window_fixed_neon;
        s->apply_window_float = ff_mpadsp_apply_window_float_neon;
    }
}
225
externals/ffmpeg/libavcodec/aarch64/mpegaudiodsp_neon.S
vendored
Executable file
@@ -0,0 +1,225 @@
/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define FRAC_BITS   23   // fractional bits for sb_samples and dct
#define WFRAC_BITS  16   // fractional bits for window
#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)

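// Byte-shuffle table for TBL: indices 12..15, 8..11, 4..7, 0..3 reverse the
// order of the four 32-bit lanes of a 128-bit vector in a single tbl
// instruction (used below to mirror the window coefficients).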
const   tbl_rev128_s, align=4
        .byte           12, 13, 14, 15
        .byte            8,  9, 10, 11
        .byte            4,  5,  6,  7
        .byte            0,  1,  2,  3
endconst

.macro  apply_window type, st
function ff_mpadsp_apply_window_\type\()_neon, export=1
        mov             x7, x0
        add             x8, x0, #512<<2
        ld1             {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64
        st1             {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64
        movrel          x15, tbl_rev128_s
        ld1             {v27.4s}, [x15]
.ifc \type, fixed
        lsl             x4, x4, #1
.else
        lsl             x4, x4, #2
.endif
        add             x10, x0, #45<<2
        add             x0, x0, #16<<2
        add             x1, x1, #16<<2
        add             x5, x3, x4, lsl #5
        sub             x5, x5, x4              // samples2
        neg             x13, x4                 // -incr
        mov             x9, #64<<2
.ifc \type, fixed
        ld1r            {v16.2s}, [x2]          // dither_state
        sxtl            v16.2d, v16.2s
        movi            v29.2d, #0
        movi            v30.2d, #(1<<OUT_SHIFT)-1
        trn1            v31.2d, v29.2d, v30.2d
        trn2            v30.2d, v30.2d, v29.2d
        trn1            v16.2d, v16.2d, v29.2d
.else
        movi            v16.4s, #0
        movi            v28.4s, #0
.endif
        mov             x14, #4
1:
        mov             x8, x0
        sub             x7, x1, #3<<2
        sub             x6, x1, x14, lsl #4
        add             x7, x7, x14, lsl #4
        add             x11, x6, #(32)<<2       // w  + 32
        add             x12, x7, #(32)<<2       // w2 + 32
        mov             x15, #8
        movi            v17.2d, #0
        movi            v18.2d, #0
        movi            v19.2d, #0
2:
        subs            x15, x15, #1
        ld1             {v0.4s}, [x8], x9
        ld1             {v1.4s}, [x10], x9
        ld1             {v2.4s}, [x6], x9
        ld1             {v3.4s}, [x7], x9
        tbl             v6.16b, {v0.16b}, v27.16b
        tbl             v7.16b, {v1.16b}, v27.16b
        ld1             {v4.4s}, [x11], x9
        ld1             {v5.4s}, [x12], x9
        MLA             v16, v2, v0
        MLA2            v17, v2, v0
        MLS             v18, v3, v6
        MLS2            v19, v3, v6
        MLS             v16, v4, v7
        MLS2            v17, v4, v7
        MLS             v18, v5, v1
        MLS2            v19, v5, v1
        b.gt            2b

        cmp             x14, #4
        sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)

.ifc \type, fixed
        and             v28.16b, v16.16b, v30.16b
        ext             v28.16b, v29.16b, v28.16b, #8

        b.eq            4f
        round_sample    v19, 1, 1
4:
        round_sample    v16, 1, 0
        shrn            v16.2s, v16.2d, #OUT_SHIFT
        round_sample    v19, 0, 0
        shrn            v19.2s, v19.2d, #OUT_SHIFT
        round_sample    v17, 0, 1
        round_sample    v18, 1, 1
        round_sample    v17, 1, 0
        shrn2           v16.4s, v17.2d, #OUT_SHIFT
        round_sample    v18, 0, 0
        shrn2           v19.4s, v18.2d, #OUT_SHIFT
        sqxtn           v16.4h, v16.4s
        sqxtn           v18.4h, v19.4s
.else
        ext             v18.16b, v18.16b, v18.16b, #8
.endif

        st1             {v16.\st\()}[0], [x3], x4
        b.eq            4f
        st1             {v18.\st\()}[1], [x5], x13
4:
        st1             {v16.\st\()}[1], [x3], x4
        st1             {v18.\st\()}[0], [x5], x13
        st1             {v16.\st\()}[2], [x3], x4
        st1             {v18.\st\()}[3], [x5], x13
        st1             {v16.\st\()}[3], [x3], x4
        st1             {v18.\st\()}[2], [x5], x13

        mov             v16.16b, v28.16b

        subs            x14, x14, #1
        add             x0, x0, #4<<2
        sub             x10, x10, #4<<2
        b.gt            1b

// computing samples[16]
        add             x6, x1, #32<<2
        ld1             {v0.2s}, [x6], x9
        ld1             {v1.2s}, [x0], x9
.rept 3
        ld1             {v2.2s}, [x6], x9
        ld1             {v3.2s}, [x0], x9
        MLS             v16, v0, v1
        ld1             {v0.2s}, [x6], x9
        ld1             {v1.2s}, [x0], x9
        MLS             v16, v2, v3
.endr
        ld1             {v2.2s}, [x6], x9
        ld1             {v3.2s}, [x0], x9
        MLS             v16, v0, v1
        MLS             v16, v2, v3

.ifc \type, fixed
        and             v28.16b, v16.16b, v30.16b
        shrn            v20.2s, v16.2d, #OUT_SHIFT
        xtn             v28.2s, v28.2d
        sqxtn           v20.4h, v20.4s
        st1             {v28.s}[0], [x2]        // save dither_state
        st1             {v20.h}[0], [x3]
.else
        st1             {v16.s}[0], [x3]
.endif

        ret
endfunc
.purgem round_sample
.purgem MLA
.purgem MLA2
.purgem MLS
.purgem MLS2
.endm

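// round_sample carries the fixed-point rounding/dither remainder across
// samples in v28: it adds the previous remainder to the 64-bit accumulator
// \r, masks out the low OUT_SHIFT bits of the lane just consumed (v30/v31
// select lane 0 or lane 1), and ext-rotates the remainder into the lane the
// next sample will use when \idx != \next.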
.macro  round_sample r, idx, next
        add             \r\().2d, \r\().2d, v28.2d
.if \idx == 0
        and             v28.16b, \r\().16b, v30.16b
.else // \idx == 1
        and             v28.16b, \r\().16b, v31.16b
.endif
.if \idx != \next
.if \next == 0
        ext             v28.16b, v28.16b, v29.16b, #8
.else
        ext             v28.16b, v29.16b, v28.16b, #8
.endif
.endif
.endm
.macro  MLA d, s1, s2
        smlal           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLA2 d, s1, s2
        smlal2          \d\().2d, \s1\().4s, \s2\().4s
.endm
.macro  MLS d, s1, s2
        smlsl           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLS2 d, s1, s2
        smlsl2          \d\().2d, \s1\().4s, \s2\().4s
.endm
apply_window fixed, h


// nothing to do for round_sample and ML{A,S}2
.macro  round_sample r, idx, next
.endm
.macro  MLA2 d, s1, s2
.endm
.macro  MLS2 d, s1, s2
.endm
.macro  MLA d, s1, s2
        fmla            \d\().4s, \s1\().4s, \s2\().4s
.endm
.macro  MLS d, s1, s2
        fmls            \d\().4s, \s1\().4s, \s2\().4s
.endm
apply_window float, s
149
externals/ffmpeg/libavcodec/aarch64/neon.S
vendored
Executable file
@@ -0,0 +1,149 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

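// The transpose_* macros below implement in-register matrix transposes
// entirely with trn1/trn2: element pairs are interleaved at byte, then
// halfword, then word (and doubleword) granularity, doubling the swapped
// block size at each stage.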
.macro  transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B

        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H

        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S

        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S

        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S

        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
.endm

.macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
        trn1            \t0\().16B, \r0\().16B, \r1\().16B
        trn2            \t1\().16B, \r0\().16B, \r1\().16B
        trn1            \r1\().16B, \r2\().16B, \r3\().16B
        trn2            \r3\().16B, \r2\().16B, \r3\().16B
        trn1            \r0\().16B, \r4\().16B, \r5\().16B
        trn2            \r5\().16B, \r4\().16B, \r5\().16B
        trn1            \r2\().16B, \r6\().16B, \r7\().16B
        trn2            \r7\().16B, \r6\().16B, \r7\().16B

        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H

        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S

        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S

        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S

        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
.endm

.macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().16B, \r0\().16B, \r1\().16B
        trn2            \t5\().16B, \r0\().16B, \r1\().16B
        trn1            \t6\().16B, \r2\().16B, \r3\().16B
        trn2            \t7\().16B, \r2\().16B, \r3\().16B

        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
.endm

.macro  transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B

        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
.endm

.macro  transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
.endm

.macro  transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H

        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S

        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D

        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D

        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D

        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D
.endm
99
externals/ffmpeg/libavcodec/aarch64/neontest.c
vendored
Executable file
@@ -0,0 +1,99 @@
/*
 * check NEON registers for clobbers
 * Copyright (c) 2013 Martin Storsjo
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/avcodec.h"
#include "libavutil/aarch64/neontest.h"

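/*
 * wrap() and testneonclobbers() come from libavutil/aarch64/neontest.h:
 * each wrapper below shadows the real libavcodec entry point, fills the
 * callee-saved NEON registers with a known pattern before calling the real
 * function, and aborts if that pattern was clobbered on return.
 */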
wrap(avcodec_open2(AVCodecContext *avctx,
                   const AVCodec *codec,
                   AVDictionary **options))
{
    testneonclobbers(avcodec_open2, avctx, codec, options);
}

wrap(avcodec_decode_audio4(AVCodecContext *avctx,
                           AVFrame *frame,
                           int *got_frame_ptr,
                           AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_audio4, avctx, frame,
                     got_frame_ptr, avpkt);
}

wrap(avcodec_decode_video2(AVCodecContext *avctx,
                           AVFrame *picture,
                           int *got_picture_ptr,
                           AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_video2, avctx, picture,
                     got_picture_ptr, avpkt);
}

wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
                              AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt))
{
    testneonclobbers(avcodec_decode_subtitle2, avctx, sub,
                     got_sub_ptr, avpkt);
}

wrap(avcodec_encode_audio2(AVCodecContext *avctx,
                           AVPacket *avpkt,
                           const AVFrame *frame,
                           int *got_packet_ptr))
{
    testneonclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
                     got_packet_ptr);
}

wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
                             uint8_t *buf, int buf_size,
                             const AVSubtitle *sub))
{
    testneonclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
}

wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
                           const AVFrame *frame, int *got_packet_ptr))
{
    testneonclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
}

wrap(avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt))
{
    testneonclobbers(avcodec_send_packet, avctx, avpkt);
}

wrap(avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt))
{
    testneonclobbers(avcodec_receive_packet, avctx, avpkt);
}

wrap(avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame))
{
    testneonclobbers(avcodec_send_frame, avctx, frame);
}

wrap(avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame))
{
    testneonclobbers(avcodec_receive_frame, avctx, frame);
}
35
externals/ffmpeg/libavcodec/aarch64/opusdsp_init.c
vendored
Executable file
@@ -0,0 +1,35 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/opusdsp.h"

void ff_opus_postfilter_neon(float *data, int period, float *gains, int len);
float ff_opus_deemphasis_neon(float *out, float *in, float coeff, int len);

av_cold void ff_opus_dsp_init_aarch64(OpusDSP *ctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        ctx->postfilter = ff_opus_postfilter_neon;
        ctx->deemphasis = ff_opus_deemphasis_neon;
    }
}
113
externals/ffmpeg/libavcodec/aarch64/opusdsp_neon.S
vendored
Executable file
@@ -0,0 +1,113 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

//        0.85..^1    0.85..^2    0.85..^3    0.85..^4
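// The .word values are the IEEE-754 bit patterns of those powers of the
// Opus deemphasis coefficient (0x3f599a00 ~= 0.85); tab_x0..tab_x2 are the
// same coefficients shifted by one lane each, so the recursive deemphasis
// filter can be evaluated four samples at a time.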
const   tab_st, align=4
        .word 0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f
endconst
const   tab_x0, align=4
        .word 0x0, 0x3f599a00, 0x3f38f671, 0x3f1d382a
endconst
const   tab_x1, align=4
        .word 0x0, 0x0, 0x3f599a00, 0x3f38f671
endconst
const   tab_x2, align=4
        .word 0x0, 0x0, 0x0, 0x3f599a00
endconst

function ff_opus_deemphasis_neon, export=1
        movrel          x4, tab_st
        ld1             {v4.4s}, [x4]
        movrel          x4, tab_x0
        ld1             {v5.4s}, [x4]
        movrel          x4, tab_x1
        ld1             {v6.4s}, [x4]
        movrel          x4, tab_x2
        ld1             {v7.4s}, [x4]

        fmul            v0.4s, v4.4s, v0.s[0]

1:      ld1             {v1.4s, v2.4s}, [x1], #32

        fmla            v0.4s, v5.4s, v1.s[0]
        fmul            v3.4s, v7.4s, v2.s[2]

        fmla            v0.4s, v6.4s, v1.s[1]
        fmla            v3.4s, v6.4s, v2.s[1]

        fmla            v0.4s, v7.4s, v1.s[2]
        fmla            v3.4s, v5.4s, v2.s[0]

        fadd            v1.4s, v1.4s, v0.4s
        fadd            v2.4s, v2.4s, v3.4s

        fmla            v2.4s, v4.4s, v1.s[3]

        st1             {v1.4s, v2.4s}, [x0], #32
        fmul            v0.4s, v4.4s, v2.s[3]

        subs            w2, w2, #8
        b.gt            1b

        mov             s0, v2.s[3]

        ret
endfunc

function ff_opus_postfilter_neon, export=1
        ld1             {v0.4s}, [x2]
        dup             v1.4s, v0.s[1]
        dup             v2.4s, v0.s[2]
        dup             v0.4s, v0.s[0]

        add             w1, w1, #2
        sub             x1, x0, x1, lsl #2

        ld1             {v3.4s}, [x1]
        fmul            v3.4s, v3.4s, v2.4s

1:      add             x1, x1, #4
        ld1             {v4.4s}, [x1]
        add             x1, x1, #4
        ld1             {v5.4s}, [x1]
        add             x1, x1, #4
        ld1             {v6.4s}, [x1]
        add             x1, x1, #4
        ld1             {v7.4s}, [x1]

        fmla            v3.4s, v7.4s, v2.4s
        fadd            v6.4s, v6.4s, v4.4s

        ld1             {v4.4s}, [x0]
        fmla            v4.4s, v5.4s, v0.4s

        fmul            v6.4s, v6.4s, v1.4s
        fadd            v6.4s, v6.4s, v3.4s

        fadd            v4.4s, v4.4s, v6.4s
        fmul            v3.4s, v7.4s, v2.4s

        st1             {v4.4s}, [x0], #16

        subs            w3, w3, #4
        b.gt            1b

        ret
endfunc
46
externals/ffmpeg/libavcodec/aarch64/pixblockdsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,46 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"

void ff_get_pixels_neon(int16_t *block, const uint8_t *pixels,
                        ptrdiff_t stride);
void ff_diff_pixels_neon(int16_t *block, const uint8_t *s1,
                         const uint8_t *s2, ptrdiff_t stride);

av_cold void ff_pixblockdsp_init_aarch64(PixblockDSPContext *c,
                                         AVCodecContext *avctx,
                                         unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        if (!high_bit_depth) {
            c->get_pixels_unaligned =
            c->get_pixels = ff_get_pixels_neon;
        }
        c->diff_pixels_unaligned =
        c->diff_pixels = ff_diff_pixels_neon;
    }
}
51
externals/ffmpeg/libavcodec/aarch64/pixblockdsp_neon.S
vendored
Executable file
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2020 Martin Storsjo
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

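// ff_get_pixels_neon widens one 8x8 block of 8-bit pixels to int16, two
// rows per iteration; ff_diff_pixels_neon instead stores the widened
// per-pixel difference s1 - s2, likewise two rows at a time.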
function ff_get_pixels_neon, export=1
        mov             w3, #8
1:
        ld1             {v0.8b}, [x1], x2
        subs            w3, w3, #2
        ld1             {v1.8b}, [x1], x2
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b

        ret
endfunc

function ff_diff_pixels_neon, export=1
        mov             w4, #8
1:
        ld1             {v0.8b}, [x1], x3
        ld1             {v1.8b}, [x2], x3
        subs            w4, w4, #2
        ld1             {v2.8b}, [x1], x3
        usubl           v0.8h, v0.8b, v1.8b
        ld1             {v3.8b}, [x2], x3
        usubl           v1.8h, v2.8b, v3.8b
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b

        ret
endfunc
48
externals/ffmpeg/libavcodec/aarch64/rv40dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,48 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/rv34dsp.h"

#include "config.h"

void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

av_cold void ff_rv40dsp_init_aarch64(RV34DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
        c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
    }
}
70
externals/ffmpeg/libavcodec/aarch64/sbrdsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,70 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/sbrdsp.h"

void ff_sbr_sum64x5_neon(float *z);
float ff_sbr_sum_square_neon(float (*x)[2], int n);
void ff_sbr_neg_odd_64_neon(float *x);
void ff_sbr_qmf_pre_shuffle_neon(float *z);
void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
                           const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
                        const float alpha0[2], const float alpha1[2],
                        float bw, int start, int end);
void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);

av_cold void ff_sbrdsp_init_aarch64(SBRDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->sum64x5 = ff_sbr_sum64x5_neon;
        s->sum_square = ff_sbr_sum_square_neon;
        s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
        s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
        s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
        s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
        s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
        s->hf_g_filt = ff_sbr_hf_g_filt_neon;
        s->hf_gen = ff_sbr_hf_gen_neon;
        s->autocorrelate = ff_sbr_autocorrelate_neon;
        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
    }
}
327
externals/ffmpeg/libavcodec/aarch64/sbrdsp_neon.S
vendored
Executable file
@@ -0,0 +1,327 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

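// factors is a {+1,-1,+1,-1} sign mask for alternating complex components;
// the phi_noise_* tables hold the four SBR noise phase rotations
// (multiplication by 1, +/-i, -1, -/+i), with the second row of the
// two-row tables selected by the parity of the kx argument.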
const   factors, align=4
        .float 1.0, -1.0, 1.0, -1.0
endconst

const   phi_noise_0, align=4
        .float 1.0, 0.0, 1.0, 0.0
endconst

const   phi_noise_1, align=4
        .float 0.0,  1.0, 0.0, -1.0
        .float 0.0, -1.0, 0.0,  1.0
endconst

const   phi_noise_2, align=4
        .float -1.0, 0.0, -1.0, 0.0
endconst

const   phi_noise_3, align=4
        .float 0.0, -1.0, 0.0,  1.0
        .float 0.0,  1.0, 0.0, -1.0
endconst

function ff_sbr_sum64x5_neon, export=1
        add             x1, x0, #64*4
        add             x2, x0, #128*4
        add             x3, x0, #192*4
        add             x4, x0, #256*4
        mov             x5, #64
1:      ld1             {v0.4S}, [x0]
        ld1             {v1.4S}, [x1], #16
        fadd            v0.4S, v0.4S, v1.4S
        ld1             {v2.4S}, [x2], #16
        fadd            v0.4S, v0.4S, v2.4S
        ld1             {v3.4S}, [x3], #16
        fadd            v0.4S, v0.4S, v3.4S
        ld1             {v4.4S}, [x4], #16
        fadd            v0.4S, v0.4S, v4.4S
        st1             {v0.4S}, [x0], #16
        subs            x5, x5, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_sum_square_neon, export=1
        movi            v0.4S, #0
1:      ld1             {v1.4S}, [x0], #16
        fmla            v0.4S, v1.4S, v1.4S
        subs            w1, w1, #2
        b.gt            1b
        faddp           v0.4S, v0.4S, v0.4S
        faddp           v0.4S, v0.4S, v0.4S
        ret
endfunc

function ff_sbr_neg_odd_64_neon, export=1
        mov             x1, x0
        movi            v5.4S, #1<<7, lsl #24
        ld2             {v0.4S, v1.4S}, [x0], #32
        eor             v1.16B, v1.16B, v5.16B
        ld2             {v2.4S, v3.4S}, [x0], #32
.rept 3
        st2             {v0.4S, v1.4S}, [x1], #32
        eor             v3.16B, v3.16B, v5.16B
        ld2             {v0.4S, v1.4S}, [x0], #32
        st2             {v2.4S, v3.4S}, [x1], #32
        eor             v1.16B, v1.16B, v5.16B
        ld2             {v2.4S, v3.4S}, [x0], #32
.endr
        eor             v3.16B, v3.16B, v5.16B
        st2             {v0.4S, v1.4S}, [x1], #32
        st2             {v2.4S, v3.4S}, [x1], #32
        ret
endfunc

function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             x1, x0, #60*4
        add             x2, x0, #64*4
        mov             x3, #-16
        mov             x4, #-4
        movi            v6.4S, #1<<7, lsl #24
        ld1             {v0.2S}, [x0], #8
        st1             {v0.2S}, [x2], #8
.rept 7
        ld1             {v1.4S}, [x1], x3
        ld1             {v2.4S}, [x0], #16
        eor             v1.16B, v1.16B, v6.16B
        rev64           v1.4S, v1.4S
        ext             v1.16B, v1.16B, v1.16B, #8
        st2             {v1.4S, v2.4S}, [x2], #32
.endr
        add             x1, x1, #8
        ld1             {v1.2S}, [x1], x4
        ld1             {v2.2S}, [x0], #8
        ld1             {v1.S}[3], [x1]
        ld1             {v2.S}[2], [x0]
        eor             v1.16B, v1.16B, v6.16B
        rev64           v1.4S, v1.4S
        st2             {v1.2S, v2.2S}, [x2], #16
        st2             {v1.S, v2.S}[2], [x2]
        ret
endfunc

function ff_sbr_qmf_post_shuffle_neon, export=1
        add             x2, x1, #60*4
        mov             x3, #-16
        mov             x4, #32
        movi            v6.4S, #1<<7, lsl #24
1:      ld1             {v0.4S}, [x2], x3
        ld1             {v1.4S}, [x1], #16
        eor             v0.16B, v0.16B, v6.16B
        rev64           v0.4S, v0.4S
        ext             v0.16B, v0.16B, v0.16B, #8
        st2             {v0.4S, v1.4S}, [x0], #32
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_qmf_deint_neg_neon, export=1
        add             x1, x1, #56*4
        add             x2, x0, #60*4
        mov             x3, #-32
        mov             x4, #32
        movi            v2.4S, #1<<7, lsl #24
1:      ld2             {v0.4S, v1.4S}, [x1], x3
        eor             v0.16B, v0.16B, v2.16B
        rev64           v1.4S, v1.4S
        ext             v1.16B, v1.16B, v1.16B, #8
        st1             {v0.4S}, [x2]
        st1             {v1.4S}, [x0], #16
        sub             x2, x2, #16
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_qmf_deint_bfly_neon, export=1
        add             x2, x2, #60*4
        add             x3, x0, #124*4
        mov             x4, #64
        mov             x5, #-16
1:      ld1             {v0.4S}, [x1], #16
        ld1             {v1.4S}, [x2], x5
        rev64           v2.4S, v0.4S
        ext             v2.16B, v2.16B, v2.16B, #8
        rev64           v3.4S, v1.4S
        ext             v3.16B, v3.16B, v3.16B, #8
        fadd            v1.4S, v1.4S, v2.4S
        fsub            v0.4S, v0.4S, v3.4S
        st1             {v0.4S}, [x0], #16
        st1             {v1.4S}, [x3], x5
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

function ff_sbr_hf_gen_neon, export=1
        sxtw            x4, w4
        sxtw            x5, w5
        movrel          x6, factors
        ld1             {v7.4S}, [x6]
        dup             v1.4S, v0.S[0]
        mov             v2.8B, v1.8B
        mov             v2.S[2], v7.S[0]
        mov             v2.S[3], v7.S[0]
        fmul            v1.4S, v1.4S, v2.4S
        ld1             {v0.D}[0], [x3]
        ld1             {v0.D}[1], [x2]
        fmul            v0.4S, v0.4S, v1.4S
        fmul            v1.4S, v0.4S, v7.4S
        rev64           v0.4S, v0.4S
        sub             x7, x5, x4
        add             x0, x0, x4, lsl #3
        add             x1, x1, x4, lsl #3
        sub             x1, x1, #16
1:      ld1             {v2.4S}, [x1], #16
        ld1             {v3.2S}, [x1]
        fmul            v4.4S, v2.4S, v1.4S
        fmul            v5.4S, v2.4S, v0.4S
        faddp           v4.4S, v4.4S, v4.4S
        faddp           v5.4S, v5.4S, v5.4S
        faddp           v4.4S, v4.4S, v4.4S
        faddp           v5.4S, v5.4S, v5.4S
        mov             v4.S[1], v5.S[0]
        fadd            v4.2S, v4.2S, v3.2S
        st1             {v4.2S}, [x0], #8
        sub             x1, x1, #8
        subs            x7, x7, #1
        b.gt            1b
        ret
endfunc

function ff_sbr_hf_g_filt_neon, export=1
        sxtw            x3, w3
        sxtw            x4, w4
        mov             x5, #40*2*4
        add             x1, x1, x4, lsl #3
1:      ld1             {v0.2S}, [x1], x5
        ld1             {v1.S}[0], [x2], #4
        fmul            v2.4S, v0.4S, v1.S[0]
        st1             {v2.2S}, [x0], #8
        subs            x3, x3, #1
        b.gt            1b
        ret
endfunc

function ff_sbr_autocorrelate_neon, export=1
        mov             x2, #38
        movrel          x3, factors
        ld1             {v0.4S}, [x3]
        movi            v1.4S, #0
        movi            v2.4S, #0
        movi            v3.4S, #0
        ld1             {v4.2S}, [x0], #8
        ld1             {v5.2S}, [x0], #8
        fmul            v16.2S, v4.2S, v4.2S
        fmul            v17.2S, v5.2S, v4.S[0]
        fmul            v18.2S, v5.2S, v4.S[1]
1:      ld1             {v5.D}[1], [x0], #8
        fmla            v1.2S, v4.2S, v4.2S
        fmla            v2.4S, v5.4S, v4.S[0]
        fmla            v3.4S, v5.4S, v4.S[1]
        mov             v4.D[0], v5.D[0]
        mov             v5.D[0], v5.D[1]
        subs            x2, x2, #1
        b.gt            1b
        fmul            v19.2S, v4.2S, v4.2S
        fmul            v20.2S, v5.2S, v4.S[0]
        fmul            v21.2S, v5.2S, v4.S[1]
        fadd            v22.4S, v2.4S, v20.4S
        fsub            v22.4S, v22.4S, v17.4S
        fadd            v23.4S, v3.4S, v21.4S
        fsub            v23.4S, v23.4S, v18.4S
        rev64           v23.4S, v23.4S
        fmul            v23.4S, v23.4S, v0.4S
        fadd            v22.4S, v22.4S, v23.4S
        st1             {v22.4S}, [x1], #16
        fadd            v23.2S, v1.2S, v19.2S
        fsub            v23.2S, v23.2S, v16.2S
        faddp           v23.2S, v23.2S, v23.2S
        st1             {v23.S}[0], [x1]
        add             x1, x1, #8
        rev64           v3.2S, v3.2S
        fmul            v3.2S, v3.2S, v0.2S
        fadd            v2.2S, v2.2S, v3.2S
        st1             {v2.2S}, [x1]
        add             x1, x1, #16
        faddp           v1.2S, v1.2S, v1.2S
        st1             {v1.S}[0], [x1]
        ret
endfunc

.macro  apply_noise_common
        sxtw            x3, w3
        sxtw            x5, w5
        movrel          x7, X(ff_sbr_noise_table)
        add             x3, x3, #1
1:      and             x3, x3, #0x1ff
        add             x8, x7, x3, lsl #3
        add             x3, x3, #2
        ld1             {v2.4S}, [x0]
        ld1             {v3.2S}, [x1], #8
        ld1             {v4.2S}, [x2], #8
        ld1             {v5.4S}, [x8]
        mov             v6.16B, v2.16B
        zip1            v3.4S, v3.4S, v3.4S
        zip1            v4.4S, v4.4S, v4.4S
        fmla            v6.4S, v1.4S, v3.4S
        fmla            v2.4S, v5.4S, v4.4S
        fcmeq           v7.4S, v3.4S, #0
        bif             v2.16B, v6.16B, v7.16B
        st1             {v2.4S}, [x0], #16
        subs            x5, x5, #2
        b.gt            1b
.endm

function ff_sbr_hf_apply_noise_0_neon, export=1
        movrel          x9, phi_noise_0
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

function ff_sbr_hf_apply_noise_1_neon, export=1
        movrel          x9, phi_noise_1
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

function ff_sbr_hf_apply_noise_2_neon, export=1
        movrel          x9, phi_noise_2
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

function ff_sbr_hf_apply_noise_3_neon, export=1
        movrel          x9, phi_noise_3
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc
362
externals/ffmpeg/libavcodec/aarch64/simple_idct_neon.S
vendored
Executable file
@@ -0,0 +1,362 @@
/*
 * ARM NEON IDCT
 *
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * Based on Simple IDCT
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define Z1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z6   8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z7   4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define z1 v0.H[0]
#define z2 v0.H[1]
#define z3 v0.H[2]
#define z4 v0.H[3]
#define z5 v0.H[4]
#define z6 v0.H[5]
#define z7 v0.H[6]
#define z4c v0.H[7]

const   idct_coeff_neon, align=4
        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
endconst

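// idct_start/idct_end: the column passes below are reached with bl, which
// clobbers x30, so the outer return address is parked in x10
// (mov x10, x30) and idct_end returns with br x10 instead of ret.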
.macro  idct_start data
        prfm            pldl1keep, [\data]
        mov             x10, x30
        movrel          x3, idct_coeff_neon
        ld1             {v0.2D}, [x3]
.endm

.macro  idct_end
        br              x10
.endm

.macro  smull1 a, b, c
        smull           \a, \b, \c
.endm

.macro  smlal1 a, b, c
        smlal           \a, \b, \c
.endm

.macro  smlsl1 a, b, c
        smlsl           \a, \b, \c
.endm

.macro  idct_col4_top y1, y2, y3, y4, i, l
        smull\i         v7.4S,  \y3\l, z2
        smull\i         v16.4S, \y3\l, z6
        smull\i         v17.4S, \y2\l, z1
        add             v19.4S, v23.4S, v7.4S
        smull\i         v18.4S, \y2\l, z3
        add             v20.4S, v23.4S, v16.4S
        smull\i         v5.4S,  \y2\l, z5
        sub             v21.4S, v23.4S, v16.4S
        smull\i         v6.4S,  \y2\l, z7
        sub             v22.4S, v23.4S, v7.4S

        smlal\i         v17.4S, \y4\l, z3
        smlsl\i         v18.4S, \y4\l, z7
        smlsl\i         v5.4S,  \y4\l, z1
        smlsl\i         v6.4S,  \y4\l, z5
.endm

.macro  idct_row4_neon y1, y2, y3, y4, pass
        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
        movi            v23.4S, #1<<2, lsl #8
        orr             v5.16B, \y1\().16B, \y2\().16B
        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
        orr             v6.16B, \y3\().16B, \y4\().16B
        orr             v5.16B, v5.16B, v6.16B
        mov             x3, v5.D[1]
        smlal           v23.4S, \y1\().4H, z4

        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H

        cmp             x3, #0
        b.eq            \pass\()f

        smull2          v7.4S,  \y1\().8H, z4
        smlal2          v17.4S, \y2\().8H, z5
        smlsl2          v18.4S, \y2\().8H, z1
        smull2          v16.4S, \y3\().8H, z2
        smlal2          v5.4S,  \y2\().8H, z7
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S
        smlal2          v6.4S,  \y2\().8H, z3
        smull2          v7.4S,  \y3\().8H, z6
        smlal2          v17.4S, \y4\().8H, z7
        smlsl2          v18.4S, \y4\().8H, z5
        smlal2          v5.4S,  \y4\().8H, z3
        smlsl2          v6.4S,  \y4\().8H, z1
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S
        sub             v22.4S, v22.4S, v7.4S

\pass:  add             \y3\().4S, v19.4S, v17.4S
        add             \y4\().4S, v20.4S, v18.4S
        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
        add             v7.4S,  v21.4S, v5.4S
        add             v16.4S, v22.4S, v6.4S
        shrn            \y3\().4H, v7.4S,  #ROW_SHIFT
        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
        sub             v22.4S, v22.4S, v6.4S
        sub             v19.4S, v19.4S, v17.4S
        sub             v21.4S, v21.4S, v5.4S
        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
        sub             v20.4S, v20.4S, v18.4S
        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT

        trn1            v16.8H, \y1\().8H, \y2\().8H
        trn2            v17.8H, \y1\().8H, \y2\().8H
        trn1            v18.8H, \y3\().8H, \y4\().8H
        trn2            v19.8H, \y3\().8H, \y4\().8H
        trn1            \y1\().4S, v16.4S, v18.4S
        trn1            \y2\().4S, v17.4S, v19.4S
        trn2            \y3\().4S, v16.4S, v18.4S
        trn2            \y4\().4S, v17.4S, v19.4S
.endm

.macro  declare_idct_col4_neon i, l
function idct_col4_neon\i
        dup             v23.4H, z4c
.if \i == 1
        add             v23.4H, v23.4H, v24.4H
.else
        mov             v5.D[0], v24.D[1]
        add             v23.4H, v23.4H, v5.4H
.endif
        smull           v23.4S, v23.4H, z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

        mov             x4, v28.D[\i - 1]
        mov             x5, v29.D[\i - 1]
        cmp             x4, #0
        b.eq            1f

        smull\i         v7.4S,  v28\l, z4
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S

1:      mov             x4, v30.D[\i - 1]
        cmp             x5, #0
        b.eq            2f

        smlal\i         v17.4S, v29\l, z5
        smlsl\i         v18.4S, v29\l, z1
        smlal\i         v5.4S,  v29\l, z7
        smlal\i         v6.4S,  v29\l, z3

2:      mov             x5, v31.D[\i - 1]
        cmp             x4, #0
        b.eq            3f

        smull\i         v7.4S,  v30\l, z6
        smull\i         v16.4S, v30\l, z2
        add             v19.4S, v19.4S, v7.4S
        sub             v22.4S, v22.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S

3:      cmp             x5, #0
        b.eq            4f

        smlal\i         v17.4S, v31\l, z7
        smlsl\i         v18.4S, v31\l, z5
        smlal\i         v5.4S,  v31\l, z3
        smlsl\i         v6.4S,  v31\l, z1

4:      addhn           v7.4H,  v19.4S, v17.4S
        addhn2          v7.8H,  v20.4S, v18.4S
        subhn           v18.4H, v20.4S, v18.4S
        subhn2          v18.8H, v19.4S, v17.4S

        addhn           v16.4H, v21.4S, v5.4S
        addhn2          v16.8H, v22.4S, v6.4S
        subhn           v17.4H, v22.4S, v6.4S
        subhn2          v17.8H, v21.4S, v5.4S

        ret
endfunc
.endm

declare_idct_col4_neon 1, .4H
declare_idct_col4_neon 2, .8H

function ff_simple_idct_put_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

        sqshrun         v1.8B,  v7.8H,  #COL_SHIFT-16
        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sqshrun         v2.8B,  v7.8H,  #COL_SHIFT-16
        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
|
||||
sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
|
||||
|
||||
zip1 v16.4S, v1.4S, v2.4S
|
||||
zip2 v17.4S, v1.4S, v2.4S
|
||||
|
||||
st1 {v16.D}[0], [x0], x1
|
||||
st1 {v16.D}[1], [x0], x1
|
||||
|
||||
zip1 v18.4S, v3.4S, v4.4S
|
||||
zip2 v19.4S, v3.4S, v4.4S
|
||||
|
||||
st1 {v17.D}[0], [x0], x1
|
||||
st1 {v17.D}[1], [x0], x1
|
||||
st1 {v18.D}[0], [x0], x1
|
||||
st1 {v18.D}[1], [x0], x1
|
||||
st1 {v19.D}[0], [x0], x1
|
||||
st1 {v19.D}[1], [x0], x1
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
|
||||
function ff_simple_idct_add_neon, export=1
|
||||
idct_start x2
|
||||
|
||||
idct_row4_neon v24, v25, v26, v27, 1
|
||||
idct_row4_neon v28, v29, v30, v31, 2
|
||||
bl idct_col4_neon1
|
||||
|
||||
sshr v1.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v2.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v3.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v4.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
bl idct_col4_neon2
|
||||
|
||||
sshr v7.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v16.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v17.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v18.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
mov x9, x0
|
||||
ld1 {v19.D}[0], [x0], x1
|
||||
zip1 v23.2D, v1.2D, v7.2D
|
||||
zip2 v24.2D, v1.2D, v7.2D
|
||||
ld1 {v19.D}[1], [x0], x1
|
||||
zip1 v25.2D, v2.2D, v16.2D
|
||||
zip2 v26.2D, v2.2D, v16.2D
|
||||
ld1 {v20.D}[0], [x0], x1
|
||||
zip1 v27.2D, v3.2D, v17.2D
|
||||
zip2 v28.2D, v3.2D, v17.2D
|
||||
ld1 {v20.D}[1], [x0], x1
|
||||
zip1 v29.2D, v4.2D, v18.2D
|
||||
zip2 v30.2D, v4.2D, v18.2D
|
||||
ld1 {v21.D}[0], [x0], x1
|
||||
uaddw v23.8H, v23.8H, v19.8B
|
||||
uaddw2 v24.8H, v24.8H, v19.16B
|
||||
ld1 {v21.D}[1], [x0], x1
|
||||
sqxtun v23.8B, v23.8H
|
||||
sqxtun2 v23.16B, v24.8H
|
||||
ld1 {v22.D}[0], [x0], x1
|
||||
uaddw v24.8H, v25.8H, v20.8B
|
||||
uaddw2 v25.8H, v26.8H, v20.16B
|
||||
ld1 {v22.D}[1], [x0], x1
|
||||
sqxtun v24.8B, v24.8H
|
||||
sqxtun2 v24.16B, v25.8H
|
||||
st1 {v23.D}[0], [x9], x1
|
||||
uaddw v25.8H, v27.8H, v21.8B
|
||||
uaddw2 v26.8H, v28.8H, v21.16B
|
||||
st1 {v23.D}[1], [x9], x1
|
||||
sqxtun v25.8B, v25.8H
|
||||
sqxtun2 v25.16B, v26.8H
|
||||
st1 {v24.D}[0], [x9], x1
|
||||
uaddw v26.8H, v29.8H, v22.8B
|
||||
uaddw2 v27.8H, v30.8H, v22.16B
|
||||
st1 {v24.D}[1], [x9], x1
|
||||
sqxtun v26.8B, v26.8H
|
||||
sqxtun2 v26.16B, v27.8H
|
||||
st1 {v25.D}[0], [x9], x1
|
||||
st1 {v25.D}[1], [x9], x1
|
||||
st1 {v26.D}[0], [x9], x1
|
||||
st1 {v26.D}[1], [x9], x1
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
|
||||
function ff_simple_idct_neon, export=1
|
||||
idct_start x0
|
||||
|
||||
mov x2, x0
|
||||
idct_row4_neon v24, v25, v26, v27, 1
|
||||
idct_row4_neon v28, v29, v30, v31, 2
|
||||
sub x2, x2, #128
|
||||
bl idct_col4_neon1
|
||||
|
||||
sshr v1.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v2.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v3.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v4.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
bl idct_col4_neon2
|
||||
|
||||
sshr v7.8H, v7.8H, #COL_SHIFT-16
|
||||
sshr v16.8H, v16.8H, #COL_SHIFT-16
|
||||
sshr v17.8H, v17.8H, #COL_SHIFT-16
|
||||
sshr v18.8H, v18.8H, #COL_SHIFT-16
|
||||
|
||||
zip1 v23.2D, v1.2D, v7.2D
|
||||
zip2 v24.2D, v1.2D, v7.2D
|
||||
st1 {v23.2D,v24.2D}, [x2], #32
|
||||
zip1 v25.2D, v2.2D, v16.2D
|
||||
zip2 v26.2D, v2.2D, v16.2D
|
||||
st1 {v25.2D,v26.2D}, [x2], #32
|
||||
zip1 v27.2D, v3.2D, v17.2D
|
||||
zip2 v28.2D, v3.2D, v17.2D
|
||||
st1 {v27.2D,v28.2D}, [x2], #32
|
||||
zip1 v29.2D, v4.2D, v18.2D
|
||||
zip2 v30.2D, v4.2D, v18.2D
|
||||
st1 {v29.2D,v30.2D}, [x2], #32
|
||||
|
||||
idct_end
|
||||
endfunc
|
||||
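For reference when reading the three entry points above: the coefficient block is addressed through x2 (or x0 for the in-place variant) and pixels are stored through x0 with row stride x1. The matching C prototypes live in the idctdsp init file, which is not part of this excerpt, so take the exact signatures below as an assumption based on that register usage:

    /* assumed prototypes; not shown in this diff */
    void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
    void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
    void ff_simple_idct_neon(int16_t *data);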
47
externals/ffmpeg/libavcodec/aarch64/synth_filter_init.c
vendored
Executable file
@@ -0,0 +1,47 @@
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/fft.h"
#include "libavcodec/synth_filter.h"

#include "asm-offsets.h"

#if HAVE_NEON || HAVE_VFP
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
#endif

void ff_synth_filter_float_neon(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
                                float synth_buf2[32], const float window[512],
                                float out[32], const float in[32],
                                float scale);

av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags))
        s->synth_filter_float = ff_synth_filter_float_neon;
}
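The AV_CHECK_OFFSET line above is a compile-time guard: the NEON code reads the imdct_half function pointer straight out of the FFTContext using the hard-coded IMDCT_HALF offset from asm-offsets.h, so the build must break if the struct layout ever drifts. A self-contained C11 sketch of the same idea (the toy struct and offset value here are hypothetical):

    #include <stddef.h>

    struct toy_ctx {
        void (*imdct_half)(void); /* field read from assembly by offset */
    };

    #define TOY_IMDCT_HALF 0 /* hypothetical hand-maintained asm offset */

    _Static_assert(offsetof(struct toy_ctx, imdct_half) == TOY_IMDCT_HALF,
                   "asm offset out of sync with struct layout");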
119
externals/ffmpeg/libavcodec/aarch64/synth_filter_neon.S
vendored
Executable file
@@ -0,0 +1,119 @@
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm-offsets.h"

#include "libavutil/aarch64/asm.S"

.macro inner_loop
        ld1 {v29.4s}, [x9], x15
        ld1 {v28.4s}, [x8], x15
        ld1 {v30.4s}, [x10], x15
        ld1 {v31.4s}, [x11], x15
        rev64 v28.4s, v28.4s
        ld1 {v24.4s}, [x4], x15
        ld1 {v25.4s}, [x5], x15
        rev64 v31.4s, v31.4s
        ld1 {v26.4s}, [x6], x15
        fmla v5.4s, v25.4s, v29.4s
        ld1 {v27.4s}, [x7], x15
        ext v28.16b, v28.16b, v28.16b, #8
        ext v31.16b, v31.16b, v31.16b, #8
        fmla v6.4s, v26.4s, v30.4s
        fmls v4.4s, v24.4s, v28.4s
        fmla v7.4s, v27.4s, v31.4s
.endm

function ff_synth_filter_float_neon, export=1
        ldr w7, [x2]                // *synth_buf_offset
        ldr x9, [x0, #IMDCT_HALF]   // imdct_half function pointer
        sxtw x7, w7
        stp x3, x4, [sp, #-64]!
        add x1, x1, x7, lsl #2      // synth_buf
        sub w8, w7, #32
        stp x5, x1, [sp, #16]
        and x7, x7, #~63
        and w8, w8, #511
        stp x7, x30, [sp, #32]
        str w8, [x2]
        str s0, [sp, #48]

        mov x2, x6                  // in

        blr x9

        ldp x2, x4, [sp]            // synth_buf2, window
        ldp x13, x9, [sp, #16]      // out, synth_buf
        ldp x0, x30, [sp, #32]      // *synth_buf_offset
        ldr s0, [sp, #48]

        add x3, x2, #16*4           // synth_buf2 + 16
        add x14, x13, #16*4         // out + 16
        add x8, x9, #12*4
        mov x15, #64*4
        mov x1, #4
1:
        add x10, x9, #16*4          // synth_buf
        add x11, x8, #16*4
        add x5, x4, #16*4           // window
        add x6, x4, #32*4
        add x7, x4, #48*4

        ld1 {v4.4s}, [x2]           // a
        ld1 {v5.4s}, [x3]           // b
        movi v6.4s, #0              // c
        movi v7.4s, #0              // d

        mov x12, #512
2:
        sub x12, x12, #64
        cmp x12, x0
        inner_loop
        b.gt 2b

        sub x8, x8, #512*4
        sub x9, x9, #512*4
        cbz x12, 4f
        sub x10, x10, #512*4
        sub x11, x11, #512*4
3:
        subs x12, x12, #64
        inner_loop
        b.gt 3b
4:
        subs x1, x1, #1
        fmul v4.4s, v4.4s, v0.s[0]
        fmul v5.4s, v5.4s, v0.s[0]
        st1 {v6.4s}, [x2], #16
        st1 {v7.4s}, [x3], #16
        st1 {v4.4s}, [x13], #16
        st1 {v5.4s}, [x14], #16
        b.le 10f

        sub x4, x4, #508*4          // window
        add x9, x9, #4*4            // synth_buf
        sub x8, x8, #4*4            // synth_buf
        b 1b

10:
        add sp, sp, #64
        ret
endfunc
47
externals/ffmpeg/libavcodec/aarch64/vc1dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,47 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vc1dsp.h"

#include "config.h"

void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);

av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
        dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
    }
}
28
externals/ffmpeg/libavcodec/aarch64/videodsp.S
vendored
Executable file
@@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_prefetch_aarch64, export=1
        subs w2, w2, #2
        prfm pldl1strm, [x0]
        prfm pldl1strm, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt X(ff_prefetch_aarch64)
        ret
endfunc
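ff_prefetch_aarch64 streams two rows per iteration, advancing the base pointer by two strides and tail-branching back to its own entry point until the row counter is exhausted. A rough C equivalent of that loop (a sketch only; the real function is the assembly above):

    #include <stddef.h>
    #include <stdint.h>

    static void prefetch_sketch(const uint8_t *mem, ptrdiff_t stride, int h)
    {
        do {
            /* locality 0 approximates pldl1strm (streaming prefetch) */
            __builtin_prefetch(mem, 0, 0);
            __builtin_prefetch(mem + stride, 0, 0);
            mem += 2 * stride;
            h   -= 2;
        } while (h > 0);
    }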
32
externals/ffmpeg/libavcodec/aarch64/videodsp_init.c
vendored
Executable file
@@ -0,0 +1,32 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/videodsp.h"

void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);

av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_armv8(cpu_flags))
        ctx->prefetch = ff_prefetch_aarch64;
}
34
externals/ffmpeg/libavcodec/aarch64/vorbisdsp_init.c
vendored
Executable file
@@ -0,0 +1,34 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vorbisdsp.h"

void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
                                     intptr_t blocksize);

av_cold void ff_vorbisdsp_init_aarch64(VorbisDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
    }
}
82
externals/ffmpeg/libavcodec/aarch64/vorbisdsp_neon.S
vendored
Executable file
@@ -0,0 +1,82 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_vorbis_inverse_coupling_neon, export=1
        movi v20.4s, #1<<7, lsl #24
        subs x2, x2, #4
        mov x3, x0
        mov x4, x1
        b.eq 3f

        ld1 {v7.4s}, [x1], #16
        ld1 {v6.4s}, [x0], #16
        cmle v4.4s, v7.4s, #0
        and v5.16b, v6.16b, v20.16b
        eor v7.16b, v7.16b, v5.16b
        and v2.16b, v7.16b, v4.16b
        bic v3.16b, v7.16b, v4.16b
        fadd v7.4s, v6.4s, v2.4s
        fsub v6.4s, v6.4s, v3.4s
1:      ld1 {v1.4s}, [x1], #16
        ld1 {v0.4s}, [x0], #16
        cmle v4.4s, v1.4s, #0
        and v5.16b, v0.16b, v20.16b
        eor v1.16b, v1.16b, v5.16b
        st1 {v7.4s}, [x3], #16
        st1 {v6.4s}, [x4], #16
        and v2.16b, v1.16b, v4.16b
        bic v3.16b, v1.16b, v4.16b
        fadd v1.4s, v0.4s, v2.4s
        fsub v0.4s, v0.4s, v3.4s
        subs x2, x2, #8
        b.le 2f
        ld1 {v7.4s}, [x1], #16
        ld1 {v6.4s}, [x0], #16
        cmle v4.4s, v7.4s, #0
        and v5.16b, v6.16b, v20.16b
        eor v7.16b, v7.16b, v5.16b
        st1 {v1.4s}, [x3], #16
        st1 {v0.4s}, [x4], #16
        and v2.16b, v7.16b, v4.16b
        bic v3.16b, v7.16b, v4.16b
        fadd v7.4s, v6.4s, v2.4s
        fsub v6.4s, v6.4s, v3.4s
        b 1b

2:      st1 {v1.4s}, [x3], #16
        st1 {v0.4s}, [x4], #16
        b.lt ret

3:      ld1 {v1.4s}, [x1]
        ld1 {v0.4s}, [x0]
        cmle v4.4s, v1.4s, #0
        and v5.16b, v0.16b, v20.16b
        eor v1.16b, v1.16b, v5.16b
        and v2.16b, v1.16b, v4.16b
        bic v3.16b, v1.16b, v4.16b
        fadd v1.4s, v0.4s, v2.4s
        fsub v0.4s, v0.4s, v3.4s
        st1 {v1.4s}, [x0], #16
        st1 {v0.4s}, [x1], #16
ret:
        ret
endfunc
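The kernel above performs Vorbis square-polar channel decoupling four floats at a time: v20 holds the sign-bit mask 0x80000000, the eor flips the angle's sign by the magnitude's sign, and the cmle/and/bic pair selects which of the two results each lane feeds. A scalar C sketch of the same semantics (reconstructed as a reading aid, not code from this diff):

    #include <stdint.h>

    static void inverse_coupling_sketch(float *mag, float *ang, intptr_t blocksize)
    {
        for (intptr_t i = 0; i < blocksize; i++) {
            float m = mag[i], a = ang[i];
            if (m > 0) {
                if (a > 0) { ang[i] = m - a; }
                else       { ang[i] = m; mag[i] = m + a; }
            } else {
                if (a > 0) { ang[i] = m + a; }
                else       { ang[i] = m; mag[i] = m - a; }
            }
        }
    }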
75
externals/ffmpeg/libavcodec/aarch64/vp8dsp.h
vendored
Executable file
@@ -0,0 +1,75 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_VP8DSP_H
#define AVCODEC_AARCH64_VP8DSP_H

#include "libavcodec/vp8dsp.h"

#define VP8_LF_Y(hv, inner, opt)                                            \
    void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst,           \
                                                    ptrdiff_t stride,       \
                                                    int flim_E, int flim_I, \
                                                    int hev_thresh)

#define VP8_LF_UV(hv, inner, opt)                                            \
    void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU,          \
                                                     uint8_t *dstV,          \
                                                     ptrdiff_t stride,       \
                                                     int flim_E, int flim_I, \
                                                     int hev_thresh)

#define VP8_LF_SIMPLE(hv, opt)                                          \
    void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst,         \
                                                  ptrdiff_t stride,     \
                                                  int flim)

#define VP8_LF_HV(inner, opt)     \
    VP8_LF_Y(h,  inner, opt);     \
    VP8_LF_Y(v,  inner, opt);     \
    VP8_LF_UV(h, inner, opt);     \
    VP8_LF_UV(v, inner, opt)

#define VP8_LF(opt)               \
    VP8_LF_HV(,       opt);       \
    VP8_LF_HV(_inner, opt);       \
    VP8_LF_SIMPLE(h, opt);        \
    VP8_LF_SIMPLE(v, opt)

#define VP8_MC(n, opt)                                                  \
    void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride,      \
                                uint8_t *src, ptrdiff_t srcstride,      \
                                int h, int x, int y)

#define VP8_EPEL(w, opt)            \
    VP8_MC(pixels ## w, opt);       \
    VP8_MC(epel ## w ## _h4, opt);  \
    VP8_MC(epel ## w ## _h6, opt);  \
    VP8_MC(epel ## w ## _v4, opt);  \
    VP8_MC(epel ## w ## _h4v4, opt);\
    VP8_MC(epel ## w ## _h6v4, opt);\
    VP8_MC(epel ## w ## _v6, opt);  \
    VP8_MC(epel ## w ## _h4v6, opt);\
    VP8_MC(epel ## w ## _h6v6, opt)

#define VP8_BILIN(w, opt)           \
    VP8_MC(bilin ## w ## _h, opt);  \
    VP8_MC(bilin ## w ## _v, opt);  \
    VP8_MC(bilin ## w ## _hv, opt)

#endif /* AVCODEC_AARCH64_VP8DSP_H */
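As a concrete reading aid (easily checked by expanding the macros by hand): VP8_EPEL(8, neon) emits nine VP8_MC declarations, for example

    void ff_put_vp8_pixels8_neon(uint8_t *dst, ptrdiff_t dststride,
                                 uint8_t *src, ptrdiff_t srcstride,
                                 int h, int x, int y);
    void ff_put_vp8_epel8_h4_neon(uint8_t *dst, ptrdiff_t dststride,
                                  uint8_t *src, ptrdiff_t srcstride,
                                  int h, int x, int y);
    /* ..._h6, _v4, _h4v4, _h6v4, _v6, _h4v6 and _h6v6 follow the same pattern */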
124
externals/ffmpeg/libavcodec/aarch64/vp8dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,124 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vp8dsp.h"
#include "vp8dsp.h"

void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);

void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);

VP8_LF(neon);

VP8_EPEL(16, neon);
VP8_EPEL(8,  neon);
VP8_EPEL(4,  neon);

VP8_BILIN(16, neon);
VP8_BILIN(8,  neon);
VP8_BILIN(4,  neon);

av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp)
{
    if (!have_neon(av_get_cpu_flags()))
        return;
    dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
    dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
    dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
    dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;

    dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
    dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
    dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
    dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
    dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
    dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
    dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
    dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
    dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;

    dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
    dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
    dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
    dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
    dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
    dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
    dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
    dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;

    dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;

    dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;

    dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
    dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
}

av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
{
    if (!have_neon(av_get_cpu_flags()))
        return;
    dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;

    dsp->vp8_idct_add       = ff_vp8_idct_add_neon;
    dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_neon;
    dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_neon;
    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;

    dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
    dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
    dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
    dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;

    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;

    dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
    dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
}
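The triple indexing in these tables can be read off the assignments themselves: the first index selects the block width (0 = 16, 1 = 8, 2 = 4), the second the vertical filter (0 = none, 1 = 4-tap, 2 = 6-tap) and the third the horizontal filter with the same coding. A hypothetical call-site sketch of the lookup (vp8_mc_func is the table's function-pointer type from libavcodec/vp8dsp.h; this is not decoder code from this diff):

    vp8_mc_func mc = dsp->put_vp8_epel_pixels_tab[size_idx][ver_taps][hor_taps];
    mc(dst, dststride, src, srcstride, h, mx, my);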
1790
externals/ffmpeg/libavcodec/aarch64/vp8dsp_neon.S
vendored
Executable file
File diff suppressed because it is too large
29
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init.h
vendored
Executable file
@@ -0,0 +1,29 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
#define AVCODEC_AARCH64_VP9DSP_INIT_H

#include "libavcodec/vp9dsp.h"

void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);

#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
23
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
vendored
Executable file
@@ -0,0 +1,23 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPP 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"
23
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
vendored
Executable file
@@ -0,0 +1,23 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPP 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"
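These two stubs compile vp9dsp_init_16bpp_aarch64_template.c once per bit depth: every ##bpp## token in the template picks up the BPP value, so each depth gets its own symbol set and init function. For example, with BPP defined as 10, the template's init_itxfm(TX_16X16, 16x16, BPP) resolves to assignments of the form

    dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_10_neon;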
273
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
vendored
Executable file
@@ -0,0 +1,273 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/aarch64/cpu.h"
#include "vp9dsp_init.h"

#define declare_fpel(type, sz, suffix) \
void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                      const uint8_t *src, ptrdiff_t src_stride, \
                                      int h, int mx, int my)

#define decl_mc_func(op, filter, dir, sz, bpp) \
void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                                       const uint8_t *src, ptrdiff_t src_stride, \
                                                       int h, int mx, int my)

#define define_8tap_2d_fn(op, filter, sz, bpp) \
static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                                const uint8_t *src, \
                                                ptrdiff_t src_stride, \
                                                int h, int mx, int my) \
{ \
    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
    /* We only need h + 7 lines, but the horizontal filter assumes an \
     * even number of rows, so filter h + 8 lines here. */ \
    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
                                             src - 3 * src_stride, src_stride, \
                                             h + 8, mx, 0); \
    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
                                                temp + 3 * 2 * sz, 2 * sz, \
                                                h, 0, my); \
}

#define decl_filter_funcs(op, dir, sz, bpp) \
    decl_mc_func(op, regular, dir, sz, bpp); \
    decl_mc_func(op, sharp,   dir, sz, bpp); \
    decl_mc_func(op, smooth,  dir, sz, bpp)

#define decl_mc_funcs(sz, bpp) \
    decl_filter_funcs(put, h,  sz, bpp); \
    decl_filter_funcs(avg, h,  sz, bpp); \
    decl_filter_funcs(put, v,  sz, bpp); \
    decl_filter_funcs(avg, v,  sz, bpp); \
    decl_filter_funcs(put, hv, sz, bpp); \
    decl_filter_funcs(avg, hv, sz, bpp)

#define ff_vp9_copy32_neon  ff_vp9_copy32_aarch64
#define ff_vp9_copy64_neon  ff_vp9_copy64_aarch64
#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64

declare_fpel(copy, 128, );
declare_fpel(copy, 64,  );
declare_fpel(copy, 32,  );
declare_fpel(copy, 16,  );
declare_fpel(copy, 8,   );
declare_fpel(avg,  64, _16);
declare_fpel(avg,  32, _16);
declare_fpel(avg,  16, _16);
declare_fpel(avg,  8,  _16);
declare_fpel(avg,  4,  _16);

decl_mc_funcs(64, BPP);
decl_mc_funcs(32, BPP);
decl_mc_funcs(16, BPP);
decl_mc_funcs(8,  BPP);
decl_mc_funcs(4,  BPP);

#define define_8tap_2d_funcs(sz, bpp)        \
    define_8tap_2d_fn(put, regular, sz, bpp) \
    define_8tap_2d_fn(put, sharp,   sz, bpp) \
    define_8tap_2d_fn(put, smooth,  sz, bpp) \
    define_8tap_2d_fn(avg, regular, sz, bpp) \
    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
    define_8tap_2d_fn(avg, smooth,  sz, bpp)

define_8tap_2d_funcs(64, BPP)
define_8tap_2d_funcs(32, BPP)
define_8tap_2d_funcs(16, BPP)
define_8tap_2d_funcs(8,  BPP)
define_8tap_2d_funcs(4,  BPP)

static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

#define init_fpel(idx1, idx2, sz, type, suffix) \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix

#define init_copy(idx, sz, suffix) \
    init_fpel(idx, 0, sz, copy, suffix)

#define init_avg(idx, sz, suffix) \
    init_fpel(idx, 1, sz, avg, suffix)

#define init_copy_avg(idx, sz1, sz2) \
    init_copy(idx, sz2, _neon);      \
    init_avg (idx, sz1, _16_neon)

    if (have_armv8(cpu_flags)) {
        init_copy(0, 128, _aarch64);
        init_copy(1, 64,  _aarch64);
        init_copy(2, 32,  _aarch64);
    }

    if (have_neon(cpu_flags)) {
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon

#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp)

#define init_mc_funcs_dirs(idx, sz, bpp)            \
    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp)

        init_avg(0, 64, _16_neon);
        init_avg(1, 32, _16_neon);
        init_avg(2, 16, _16_neon);
        init_copy_avg(3, 8, 16);
        init_copy_avg(4, 4, 8);

        init_mc_funcs_dirs(0, 64, BPP);
        init_mc_funcs_dirs(1, 32, BPP);
        init_mc_funcs_dirs(2, 16, BPP);
        init_mc_funcs_dirs(3, 8,  BPP);
        init_mc_funcs_dirs(4, 4,  BPP);
    }
}

#define define_itxfm2(type_a, type_b, sz, bpp) \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
                                                                 ptrdiff_t stride, \
                                                                 int16_t *_block, int eob)
#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)

#define define_itxfm_funcs(sz, bpp)      \
    define_itxfm(idct,  idct,  sz, bpp); \
    define_itxfm(iadst, idct,  sz, bpp); \
    define_itxfm(idct,  iadst, sz, bpp); \
    define_itxfm(iadst, iadst, sz, bpp)

define_itxfm_funcs(4,  BPP);
define_itxfm_funcs(8,  BPP);
define_itxfm_funcs(16, BPP);
define_itxfm(idct, idct, 32, BPP);
define_itxfm(iwht, iwht, 4,  BPP);


static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_itxfm2(tx, sz, bpp) \
    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;   \
    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon;  \
    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon;  \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)

#define init_idct2(tx, nm, bpp)           \
    dsp->itxfm_add[tx][DCT_DCT]   =       \
    dsp->itxfm_add[tx][ADST_DCT]  =       \
    dsp->itxfm_add[tx][DCT_ADST]  =       \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)

        init_itxfm(TX_4X4,   4x4,   BPP);
        init_itxfm(TX_8X8,   8x8,   BPP);
        init_itxfm(TX_16X16, 16x16, BPP);
        init_idct(TX_32X32, idct_idct_32x32, BPP);
        init_idct(4, iwht_iwht_4x4, BPP);
    }
}

#define define_loop_filter(dir, wd, size, bpp) \
void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

#define define_loop_filters(wd, size, bpp) \
    define_loop_filter(h, wd, size, bpp);  \
    define_loop_filter(v, wd, size, bpp)

define_loop_filters(4,  8, BPP);
define_loop_filters(8,  8, BPP);
define_loop_filters(16, 8, BPP);

define_loop_filters(16, 16, BPP);

define_loop_filters(44, 16, BPP);
define_loop_filters(48, 16, BPP);
define_loop_filters(84, 16, BPP);
define_loop_filters(88, 16, BPP);

static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon

#define init_lpf_func_16(idx, dir, bpp) \
    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon

#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon

#define init_lpf_funcs_8_wd(idx, wd, bpp) \
    init_lpf_func_8(idx, 0, h, wd, bpp);  \
    init_lpf_func_8(idx, 1, v, wd, bpp)

#define init_lpf_funcs_16(bpp)    \
    init_lpf_func_16(0, h, bpp);  \
    init_lpf_func_16(1, v, bpp)

#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp)  \
    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);   \
    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)

#define init_lpf_funcs_8(bpp)         \
    init_lpf_funcs_8_wd(0, 4,  bpp);  \
    init_lpf_funcs_8_wd(1, 8,  bpp);  \
    init_lpf_funcs_8_wd(2, 16, bpp)

#define init_lpf_funcs_mix2(bpp)           \
    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
    init_lpf_funcs_mix2_wd(1, 1, 88, bpp)

        init_lpf_funcs_8(BPP);
        init_lpf_funcs_16(BPP);
        init_lpf_funcs_mix2(BPP);
    }
}

av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
    vp9dsp_mc_init_aarch64(dsp);
    vp9dsp_loopfilter_init_aarch64(dsp);
    vp9dsp_itxfm_init_aarch64(dsp);
}
258
externals/ffmpeg/libavcodec/aarch64/vp9dsp_init_aarch64.c
vendored
Executable file
@@ -0,0 +1,258 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Google Inc.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/internal.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
#include "libavcodec/vp9dsp.h"
|
||||
#include "vp9dsp_init.h"
|
||||
|
||||
#define declare_fpel(type, sz) \
|
||||
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const uint8_t *src, ptrdiff_t src_stride, \
|
||||
int h, int mx, int my)
|
||||
|
||||
#define declare_copy_avg(sz) \
|
||||
declare_fpel(copy, sz); \
|
||||
declare_fpel(avg , sz)
|
||||
|
||||
#define decl_mc_func(op, filter, dir, sz) \
|
||||
void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const uint8_t *src, ptrdiff_t src_stride, \
|
||||
int h, int mx, int my)
|
||||
|
||||
#define define_8tap_2d_fn(op, filter, sz) \
|
||||
static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const uint8_t *src, ptrdiff_t src_stride, \
|
||||
int h, int mx, int my) \
|
||||
{ \
|
||||
LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \
|
||||
/* We only need h + 7 lines, but the horizontal filter assumes an \
|
||||
* even number of rows, so filter h + 8 lines here. */ \
|
||||
ff_vp9_put_##filter##sz##_h_neon(temp, sz, \
|
||||
src - 3 * src_stride, src_stride, \
|
||||
h + 8, mx, 0); \
|
||||
ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \
|
||||
temp + 3 * sz, sz, \
|
||||
h, 0, my); \
|
||||
}
|
||||
|
||||
#define decl_filter_funcs(op, dir, sz) \
|
||||
decl_mc_func(op, regular, dir, sz); \
|
||||
decl_mc_func(op, sharp, dir, sz); \
|
||||
decl_mc_func(op, smooth, dir, sz)
|
||||
|
||||
#define decl_mc_funcs(sz) \
|
||||
decl_filter_funcs(put, h, sz); \
|
||||
decl_filter_funcs(avg, h, sz); \
|
||||
decl_filter_funcs(put, v, sz); \
|
||||
decl_filter_funcs(avg, v, sz); \
|
||||
decl_filter_funcs(put, hv, sz); \
|
||||
decl_filter_funcs(avg, hv, sz)
|
||||
|
||||
#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
|
||||
#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
|
||||
|
||||
declare_copy_avg(64);
|
||||
declare_copy_avg(32);
|
||||
declare_copy_avg(16);
|
||||
declare_copy_avg(8);
|
||||
declare_copy_avg(4);
|
||||
|
||||
decl_mc_funcs(64);
|
||||
decl_mc_funcs(32);
|
||||
decl_mc_funcs(16);
|
||||
decl_mc_funcs(8);
|
||||
decl_mc_funcs(4);
|
||||
|
||||
#define define_8tap_2d_funcs(sz) \
|
||||
define_8tap_2d_fn(put, regular, sz) \
|
||||
define_8tap_2d_fn(put, sharp, sz) \
|
||||
define_8tap_2d_fn(put, smooth, sz) \
|
||||
define_8tap_2d_fn(avg, regular, sz) \
|
||||
define_8tap_2d_fn(avg, sharp, sz) \
|
||||
define_8tap_2d_fn(avg, smooth, sz)
|
||||
|
||||
define_8tap_2d_funcs(64)
|
||||
define_8tap_2d_funcs(32)
|
||||
define_8tap_2d_funcs(16)
|
||||
define_8tap_2d_funcs(8)
|
||||
define_8tap_2d_funcs(4)
|
||||
|
||||
static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
#define init_fpel(idx1, idx2, sz, type, suffix) \
|
||||
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
|
||||
|
||||
#define init_copy(idx, sz, suffix) \
|
||||
init_fpel(idx, 0, sz, copy, suffix)
|
||||
|
||||
#define init_avg(idx, sz, suffix) \
|
||||
init_fpel(idx, 1, sz, avg, suffix)
|
||||
|
||||
#define init_copy_avg(idx, sz) \
|
||||
init_copy(idx, sz, _neon); \
|
||||
init_avg (idx, sz, _neon)
|
||||
|
||||
if (have_armv8(cpu_flags)) {
|
||||
init_copy(0, 64, _aarch64);
|
||||
init_copy(1, 32, _aarch64);
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
|
||||
dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
|
||||
|
||||
#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
|
||||
init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
|
||||
init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx)
|
||||
|
||||
#define init_mc_funcs_dirs(idx, sz) \
|
||||
init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \
|
||||
init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \
|
||||
init_mc_funcs(idx, hv, 1, 1, sz,)
|
||||
|
||||
init_avg(0, 64, _neon);
|
||||
init_avg(1, 32, _neon);
|
||||
init_copy_avg(2, 16);
|
||||
init_copy_avg(3, 8);
|
||||
init_copy_avg(4, 4);
|
||||
|
||||
init_mc_funcs_dirs(0, 64);
|
||||
init_mc_funcs_dirs(1, 32);
|
||||
init_mc_funcs_dirs(2, 16);
|
||||
init_mc_funcs_dirs(3, 8);
|
||||
init_mc_funcs_dirs(4, 4);
|
||||
}
|
||||
}
|
||||
|
||||
#define define_itxfm(type_a, type_b, sz) \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
                                                         ptrdiff_t stride, \
                                                         int16_t *_block, int eob)

#define define_itxfm_funcs(sz) \
    define_itxfm(idct,  idct,  sz); \
    define_itxfm(iadst, idct,  sz); \
    define_itxfm(idct,  iadst, sz); \
    define_itxfm(iadst, iadst, sz)

define_itxfm_funcs(4);
define_itxfm_funcs(8);
define_itxfm_funcs(16);
define_itxfm(idct, idct, 32);
define_itxfm(iwht, iwht, 4);


static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_itxfm(tx, sz) \
        dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;   \
        dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon;  \
        dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon;  \
        dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon

#define init_idct(tx, nm) \
        dsp->itxfm_add[tx][DCT_DCT]   = \
        dsp->itxfm_add[tx][ADST_DCT]  = \
        dsp->itxfm_add[tx][DCT_ADST]  = \
        dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon

        init_itxfm(TX_4X4,   4x4);
        init_itxfm(TX_8X8,   8x8);
        init_itxfm(TX_16X16, 16x16);
        init_idct(TX_32X32,  idct_idct_32x32);
        init_idct(4,         iwht_iwht_4x4);
    }
}

#define define_loop_filter(dir, wd, len) \
void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

#define define_loop_filters(wd, len) \
    define_loop_filter(h, wd, len);  \
    define_loop_filter(v, wd, len)

define_loop_filters(4, 8);
define_loop_filters(8, 8);
define_loop_filters(16, 8);

define_loop_filters(16, 16);

define_loop_filters(44, 16);
define_loop_filters(48, 16);
define_loop_filters(84, 16);
define_loop_filters(88, 16);

static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
        dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
        dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;

        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;

        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
    }
}

av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
{
    if (bpp == 10) {
        ff_vp9dsp_init_10bpp_aarch64(dsp);
        return;
    } else if (bpp == 12) {
        ff_vp9dsp_init_12bpp_aarch64(dsp);
        return;
    } else if (bpp != 8)
        return;

    vp9dsp_mc_init_aarch64(dsp);
    vp9dsp_loopfilter_init_aarch64(dsp);
    vp9dsp_itxfm_init_aarch64(dsp);
}
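
For orientation, a minimal sketch of how the function pointers wired up above get exercised; the caller below is illustrative, not FFmpeg's exact decode path:

    // Hypothetical caller: apply the NEON 8x8 DCT_DCT inverse transform and
    // add the residual into dst. This slot was filled by init_itxfm(TX_8X8, 8x8)
    // above when NEON support was detected.
    static void add_residual_sketch(VP9DSPContext *dsp, uint8_t *dst,
                                    ptrdiff_t stride, int16_t *coeffs, int eob)
    {
        dsp->itxfm_add[TX_8X8][DCT_DCT](dst, stride, coeffs, eob);
    }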
2017
externals/ffmpeg/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
vendored
Executable file
File diff suppressed because it is too large
1580
externals/ffmpeg/libavcodec/aarch64/vp9itxfm_neon.S
vendored
Executable file
File diff suppressed because it is too large
873
externals/ffmpeg/libavcodec/aarch64/vp9lpf_16bpp_neon.S
vendored
Executable file
@@ -0,0 +1,873 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        trn1 \t4\().8h, \r0\().8h, \r1\().8h
        trn2 \t5\().8h, \r0\().8h, \r1\().8h
        trn1 \t6\().8h, \r2\().8h, \r3\().8h
        trn2 \t7\().8h, \r2\().8h, \r3\().8h

        trn1 \r0\().4s, \t4\().4s, \t6\().4s
        trn2 \r2\().4s, \t4\().4s, \t6\().4s
        trn1 \r1\().4s, \t5\().4s, \t7\().4s
        trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm

// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup v0.8h, w2 // E
        dup v2.8h, w3 // I
        dup v3.8h, w4 // H

        uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2)
        uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1)
        uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0)
        uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1)
        uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
        uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
        umax v4.8h, v4.8h, v5.8h
        umax v5.8h, v6.8h, v7.8h
        umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0)
        umax v4.8h, v4.8h, v5.8h
        add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2
        uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1)
        umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr v5.8h, v5.8h, #1
        cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I
        add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs v6.8h, v0.8h, v6.8h
        and v4.16b, v4.16b, v6.16b // fm

        // If no pixels need filtering, just exit as soon as possible
        mov x11, v4.d[0]
        mov x12, v4.d[1]
        adds x11, x11, x12
        b.ne 1f
        br x10
1:

.if \wd >= 8
        dup v0.8h, w5

        uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
        uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
        uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0)
        uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
        uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
        uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
        umax v6.8h, v6.8h, v2.8h
        umax v1.8h, v1.8h, \tmp1\().8h
        umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
.if \wd == 16
        uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0)
        umax v6.8h, v6.8h, v1.8h
        uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0)
        umax v6.8h, v6.8h, \tmp2\().8h
        uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0)
        cmhs v6.8h, v0.8h, v6.8h // flat8in
        uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0)
        and v6.16b, v6.16b, v4.16b // flat8in && fm
        uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0)
        bic v4.16b, v4.16b, v6.16b // fm && !flat8in
        uabd v10.8h, v29.8h, v24.8h // abs(q5 - q0)
        uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0)
        uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0)

        umax v7.8h, v7.8h, v2.8h
        umax v1.8h, v1.8h, v8.8h
        umax v9.8h, v9.8h, v10.8h
        umax v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0)
.if \wd == 16
        umax v7.8h, v7.8h, v1.8h
        umax v9.8h, v9.8h, v11.8h
.elseif \wd == 8
        umax v6.8h, v6.8h, v1.8h
.endif
        uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
.if \wd == 16
        umax v7.8h, v7.8h, v9.8h
.elseif \wd == 8
        umax v6.8h, v6.8h, \tmp2\().8h
.endif
        dup \tmp2\().8h, w6 // left shift for saturation
        sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1
        neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
        umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
        sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0
        movi \tmp5\().8h, #3
.if \wd == 8
        cmhs v6.8h, v0.8h, v6.8h // flat8in
.endif
        cmhs v5.8h, v3.8h, v5.8h // !hev
.if \wd == 8
        and v6.16b, v6.16b, v4.16b // flat8in && fm
.endif
        sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
.if \wd == 16
        cmhs v7.8h, v0.8h, v7.8h // flat8out
.elseif \wd == 8
        bic v4.16b, v4.16b, v6.16b // fm && !flat8in
.endif
        and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in
.if \wd == 16
        and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm
.endif
        sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
        bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
        movi v2.8h, #4
        add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi v3.8h, #3
        sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
        movi \tmp5\().8h, #0
        sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup \tmp6\().8h, w7 // max pixel value
.if \wd == 16
        bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out
.endif

        ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1

        add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
        add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
        smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr \tmp3\().8h, \tmp3\().8h, #3 // f1
        sshr \tmp4\().8h, \tmp4\().8h, #3 // f2

        add v0.8h, v23.8h, \tmp4\().8h // p0 + f2
        sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1
        smin v0.8h, v0.8h, \tmp6\().8h
        smin v2.8h, v2.8h, \tmp6\().8h
        srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
        smax v0.8h, v0.8h, \tmp5\().8h // out p0
        smax v2.8h, v2.8h, \tmp5\().8h // out q0
        bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
        bit v24.16b, v2.16b, v4.16b

        add v0.8h, v22.8h, \tmp3\().8h // p1 + f
        sub v2.8h, v25.8h, \tmp3\().8h // q1 - f
.if \wd >= 8
        mov x11, v6.d[0]
.endif
        smin v0.8h, v0.8h, \tmp6\().8h
        smin v2.8h, v2.8h, \tmp6\().8h
.if \wd >= 8
        mov x12, v6.d[1]
.endif
        smax v0.8h, v0.8h, \tmp5\().8h // out p1
        smax v2.8h, v2.8h, \tmp5\().8h // out q1
.if \wd >= 8
        adds x11, x11, x12
.endif
        bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
        bit v25.16b, v2.16b, v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq 6f
.else
        b.ne 1f
        br x13
1:
.endif

        // flat8in
        add \tmp1\().8h, v20.8h, v21.8h
        add \tmp3\().8h, v22.8h, v25.8h
        add \tmp5\().8h, v20.8h, v22.8h
        add \tmp7\().8h, v23.8h, v26.8h
        add v0.8h, \tmp1\().8h, \tmp1\().8h
        add v0.8h, v0.8h, v23.8h
        add v0.8h, v0.8h, v24.8h
        add v0.8h, v0.8h, \tmp5\().8h
        sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr v2.8h, v0.8h, #3 // out p2

        add v0.8h, v0.8h, \tmp3\().8h
        add \tmp1\().8h, v20.8h, v23.8h
        add \tmp3\().8h, v24.8h, v27.8h
        urshr v3.8h, v0.8h, #3 // out p1

        add v0.8h, v0.8h, \tmp7\().8h
        sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add \tmp5\().8h, v21.8h, v24.8h
        add \tmp7\().8h, v25.8h, v27.8h
        urshr v4.8h, v0.8h, #3 // out p0

        add v0.8h, v0.8h, \tmp3\().8h
        sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add \tmp1\().8h, v22.8h, v25.8h
        add \tmp3\().8h, v26.8h, v27.8h
        urshr v5.8h, v0.8h, #3 // out q0

        add v0.8h, v0.8h, \tmp7\().8h
        sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr \tmp5\().8h, v0.8h, #3 // out q1

        add v0.8h, v0.8h, \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8part below, since we only update those pixels
        // which won't be touched below.
        bit v21.16b, v2.16b, v6.16b
        bit v22.16b, v3.16b, v6.16b
        bit v23.16b, v4.16b, v6.16b
        urshr \tmp6\().8h, v0.8h, #3 // out q2
        bit v24.16b, v5.16b, v6.16b
        bit v25.16b, \tmp5\().16b, v6.16b
        bit v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
        orr v2.16b, v6.16b, v7.16b
        mov x11, v2.d[0]
        mov x12, v2.d[1]
        adds x11, x11, x12
        b.ne 1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        br x14
1:

        mov x11, v7.d[0]
        mov x12, v7.d[1]
        adds x11, x11, x12
        b.ne 1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        br x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        shl v0.8h, v16.8h, #3 // 8 * v16
        sub v0.8h, v0.8h, v16.8h // 7 * v16
        add v0.8h, v0.8h, v17.8h
        add v8.8h, v17.8h, v18.8h
        add v10.8h, v19.8h, v20.8h
        add v0.8h, v0.8h, v8.8h
        add v8.8h, v16.8h, v17.8h
        add v12.8h, v21.8h, v22.8h
        add v0.8h, v0.8h, v10.8h
        add v10.8h, v18.8h, v25.8h
        add v14.8h, v23.8h, v24.8h
        sub v10.8h, v10.8h, v8.8h
        add v0.8h, v0.8h, v12.8h
        add v0.8h, v0.8h, v14.8h
        add v12.8h, v16.8h, v18.8h
        add v14.8h, v19.8h, v26.8h
        urshr v2.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        add v8.8h, v16.8h, v19.8h
        add v10.8h, v20.8h, v27.8h
        sub v14.8h, v14.8h, v12.8h
        bif v2.16b, v17.16b, v7.16b
        urshr v3.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v12.8h, v16.8h, v20.8h
        add v14.8h, v21.8h, v28.8h
        sub v10.8h, v10.8h, v8.8h
        bif v3.16b, v18.16b, v7.16b
        urshr v4.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        add v8.8h, v16.8h, v21.8h
        add v10.8h, v22.8h, v29.8h
        sub v14.8h, v14.8h, v12.8h
        bif v4.16b, v19.16b, v7.16b
        urshr v5.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v12.8h, v16.8h, v22.8h
        add v14.8h, v23.8h, v30.8h
        sub v10.8h, v10.8h, v8.8h
        bif v5.16b, v20.16b, v7.16b
        urshr v6.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        add v10.8h, v16.8h, v23.8h
        sub v14.8h, v14.8h, v12.8h
        add v12.8h, v24.8h, v31.8h
        bif v6.16b, v21.16b, v7.16b
        urshr v8.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        sub v10.8h, v12.8h, v10.8h
        add v12.8h, v17.8h, v24.8h
        add v14.8h, v25.8h, v31.8h
        bif v8.16b, v22.16b, v7.16b
        urshr v9.8h, v0.8h, #4

        add v0.8h, v0.8h, v10.8h
        sub v14.8h, v14.8h, v12.8h
        add v12.8h, v26.8h, v31.8h
        bif v9.16b, v23.16b, v7.16b
        urshr v10.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v14.8h, v18.8h, v25.8h
        add v18.8h, v19.8h, v26.8h
        sub v12.8h, v12.8h, v14.8h
        add v14.8h, v27.8h, v31.8h
        bif v10.16b, v24.16b, v7.16b
        urshr v11.8h, v0.8h, #4

        add v0.8h, v0.8h, v12.8h
        add v12.8h, v20.8h, v27.8h
        sub v14.8h, v14.8h, v18.8h
        add v18.8h, v28.8h, v31.8h
        bif v11.16b, v25.16b, v7.16b
        sub v18.8h, v18.8h, v12.8h
        urshr v12.8h, v0.8h, #4

        add v0.8h, v0.8h, v14.8h
        add v14.8h, v21.8h, v28.8h
        add v20.8h, v29.8h, v31.8h
        bif v12.16b, v26.16b, v7.16b
        urshr v13.8h, v0.8h, #4

        add v0.8h, v0.8h, v18.8h
        sub v20.8h, v20.8h, v14.8h
        add v18.8h, v22.8h, v29.8h
        add v22.8h, v30.8h, v31.8h
        bif v13.16b, v27.16b, v7.16b
        urshr v14.8h, v0.8h, #4

        add v0.8h, v0.8h, v20.8h
        sub v22.8h, v22.8h, v18.8h
        bif v14.16b, v28.16b, v7.16b
        urshr v15.8h, v0.8h, #4

        add v0.8h, v0.8h, v22.8h
        bif v15.16b, v29.16b, v7.16b
        urshr v17.8h, v0.8h, #4
        bif v17.16b, v30.16b, v7.16b
.endif
.endm
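
The threshold logic the macro vectorizes can be written out in scalar C; this is a sketch following the comments above (p[0]/q[0] being the pixels nearest the edge), not code taken from FFmpeg:

    #include <stdlib.h>  /* abs() */

    static int imax(int a, int b) { return a > b ? a : b; }

    // fm: the per-pixel "needs filtering" mask that the uabd/umax/cmhs
    // sequence above computes for 8 pixels at once.
    static int vp9_fm_sketch(const int p[4], const int q[4], int E, int I)
    {
        int m = imax(imax(abs(p[3] - p[2]), abs(p[2] - p[1])),
                     imax(abs(p[1] - p[0]), abs(q[1] - q[0])));
        m = imax(m, imax(abs(q[2] - q[1]), abs(q[3] - q[2])));
        return m <= I &&
               abs(p[0] - q[0]) * 2 + (abs(p[1] - q[1]) >> 1) <= E;
    }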

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
function vp9_loop_filter_4
        loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro loop_filter_4
        bl vp9_loop_filter_4
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr x13, 6f
        bl vp9_loop_filter_8
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr x14, 7f
        adr x15, 8f
        bl vp9_loop_filter_16
.endm


// The public functions in this file have got the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov x16, x30
        stp d14, d15, [sp, #-0x10]!
        stp d12, d13, [sp, #-0x10]!
        stp d10, d11, [sp, #-0x10]!
        stp d8, d9, [sp, #-0x10]!
.endif
        lsl w2, w2, #\bpp - 8
        lsl w3, w3, #\bpp - 8
        lsl w4, w4, #\bpp - 8
        mov x5, #1 << (\bpp - 8)
        mov x6, #16 - \bpp
        mov x7, #((1 << \bpp) - 1)
.if \push
        bl \func\()_16_neon
        ldp d8, d9, [sp], 0x10
        ldp d10, d11, [sp], 0x10
        ldp d12, d13, [sp], 0x10
        ldp d14, d15, [sp], 0x10
        br x16
.else
        b \func\()_16_neon
.endif
endfunc
.endm

.macro bpp_frontends func, push=0
        bpp_frontend \func, 10, \push
        bpp_frontend \func, 12, \push
.endm
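
In scalar terms, the lsl #(\bpp - 8) sequence above rescales the E/I/H limits, which are specified at 8-bit scale, to the pixel range of the actual bit depth, and x7 receives the max pixel value; a sketch (assuming bpp is 10 or 12, as instantiated here):

    static void scale_limits_sketch(int bpp, int *E, int *I, int *H, int *max_px)
    {
        *E <<= bpp - 8;
        *I <<= bpp - 8;
        *H <<= bpp - 8;
        *max_px = (1 << bpp) - 1;  // what 'mov x7, #((1 << bpp) - 1)' loads
    }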

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov x16, x30
.if \push
        stp d14, d15, [sp, #-0x10]!
        stp d12, d13, [sp, #-0x10]!
        stp d10, d11, [sp, #-0x10]!
        stp d8, d9, [sp, #-0x10]!
.endif
        lsl w2, w2, #\bpp - 8
        lsl w3, w3, #\bpp - 8
        lsl w4, w4, #\bpp - 8
        mov x5, #1 << (\bpp - 8)
        mov x6, #16 - \bpp
        mov x7, #((1 << \bpp) - 1)
        bl \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add x0, x0, x1, lsl #3
.else
        add x0, x0, #16
.endif
        bl \func\()_\int_suffix\()_16_neon
.if \push
        ldp d8, d9, [sp], 0x10
        ldp d10, d11, [sp], 0x10
        ldp d12, d13, [sp], 0x10
        ldp d14, d15, [sp], 0x10
.endif
        br x16
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov x16, x30
        lsr w8, w2, #8
        lsr w14, w3, #8
        lsr w15, w4, #8
        and w2, w2, #0xff
        and w3, w3, #0xff
        and w4, w4, #0xff
        lsl w2, w2, #\bpp - 8
        lsl w3, w3, #\bpp - 8
        lsl w4, w4, #\bpp - 8
        mov x5, #1 << (\bpp - 8)
        mov x6, #16 - \bpp
        mov x7, #((1 << \bpp) - 1)
        bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add x0, x0, x1, lsl #3
.else
        add x0, x0, #16
.endif
        lsl w2, w8, #\bpp - 8
        lsl w3, w14, #\bpp - 8
        lsl w4, w15, #\bpp - 8
        bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        br x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #2
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v27.8h}, [x0], x1 // q3
        sub x0, x0, x1, lsl #2
        sub x9, x9, x1, lsl #1

        loop_filter_4

        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1

        br x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov x10, x30
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        ld1 {v20.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1

        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add x9, x9, #4
        add x0, x9, x1, lsl #2

        // We only will write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4

        br x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #2
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v27.8h}, [x0], x1 // q3
        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #2
        add x9, x9, x1

        loop_filter_8

        st1 {v21.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        sub x0, x0, x1

        br x10
6:
        sub x9, x0, x1, lsl #1
        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        br x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov x10, x30
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        ld1 {v20.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1

        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add x0, x9, x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1 {v20.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v21.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v27.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        br x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add x9, x9, #4
        add x0, x9, x1, lsl #2
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4
        br x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #3
        ld1 {v16.8h}, [x9], x1 // p7
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v17.8h}, [x9], x1 // p6
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v18.8h}, [x9], x1 // p5
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v19.8h}, [x9], x1 // p4
        ld1 {v27.8h}, [x0], x1 // q3
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v28.8h}, [x0], x1 // q4
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v29.8h}, [x0], x1 // q5
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v30.8h}, [x0], x1 // q6
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v31.8h}, [x0], x1 // q7
        sub x9, x9, x1, lsl #3
        sub x0, x0, x1, lsl #3
        add x9, x9, x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1 {v2.8h}, [x9], x1
        st1 {v10.8h}, [x0], x1
        st1 {v3.8h}, [x9], x1
        st1 {v11.8h}, [x0], x1
        st1 {v4.8h}, [x9], x1
        st1 {v12.8h}, [x0], x1
        st1 {v5.8h}, [x9], x1
        st1 {v13.8h}, [x0], x1
        st1 {v6.8h}, [x9], x1
        st1 {v14.8h}, [x0], x1
        st1 {v8.8h}, [x9], x1
        st1 {v15.8h}, [x0], x1
        st1 {v9.8h}, [x9], x1
        st1 {v17.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, x1

        br x10
8:
        add x9, x9, x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1 {v21.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        sub x0, x0, x1
        br x10
7:
        sub x9, x0, x1, lsl #1
        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        br x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov x10, x30
        sub x9, x0, #16
        ld1 {v16.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v17.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v18.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v19.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1
        ld1 {v20.8h}, [x9], x1
        ld1 {v28.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v29.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v30.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v31.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        sub x9, x9, x1, lsl #3

        // The 16x8 pixels read above is in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
        transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1 {v16.8h}, [x9], x1
        st1 {v10.8h}, [x0], x1
        st1 {v2.8h}, [x9], x1
        st1 {v11.8h}, [x0], x1
        st1 {v3.8h}, [x9], x1
        st1 {v12.8h}, [x0], x1
        st1 {v4.8h}, [x9], x1
        st1 {v13.8h}, [x0], x1
        st1 {v5.8h}, [x9], x1
        st1 {v14.8h}, [x0], x1
        st1 {v6.8h}, [x9], x1
        st1 {v15.8h}, [x0], x1
        st1 {v8.8h}, [x9], x1
        st1 {v17.8h}, [x0], x1
        st1 {v9.8h}, [x9], x1
        st1 {v31.8h}, [x0], x1
        sub x0, x0, x1, lsl #3

        br x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1 {v20.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v21.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v27.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #8
        br x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub x9, x0, #4
        add x0, x9, x1, lsl #2
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4
        br x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
1334
externals/ffmpeg/libavcodec/aarch64/vp9lpf_neon.S
vendored
Executable file
File diff suppressed because it is too large
606
externals/ffmpeg/libavcodec/aarch64/vp9mc_16bpp_neon.S
vendored
Executable file
@@ -0,0 +1,606 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

function ff_vp9_avg64_16_neon, export=1
        mov x5, x0
        sub x1, x1, #64
        sub x3, x3, #64
1:
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v4.8h
        urhadd v1.8h, v1.8h, v5.8h
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd v2.8h, v2.8h, v6.8h
        urhadd v3.8h, v3.8h, v7.8h
        subs w4, w4, #1
        urhadd v16.8h, v16.8h, v20.8h
        urhadd v17.8h, v17.8h, v21.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
        urhadd v18.8h, v18.8h, v22.8h
        urhadd v19.8h, v19.8h, v23.8h
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg32_16_neon, export=1
        mov x5, x0
1:
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v4.8h
        urhadd v1.8h, v1.8h, v5.8h
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd v2.8h, v2.8h, v6.8h
        urhadd v3.8h, v3.8h, v7.8h
        subs w4, w4, #2
        urhadd v16.8h, v16.8h, v20.8h
        urhadd v17.8h, v17.8h, v21.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
        urhadd v18.8h, v18.8h, v22.8h
        urhadd v19.8h, v19.8h, v23.8h
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg16_16_neon, export=1
1:
        ld1 {v2.8h, v3.8h}, [x2], x3
        ld1 {v0.8h, v1.8h}, [x0]
        urhadd v0.8h, v0.8h, v2.8h
        urhadd v1.8h, v1.8h, v3.8h
        subs w4, w4, #1
        st1 {v0.8h, v1.8h}, [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg8_16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.8h}, [x2], x3
        ld1 {v0.8h}, [x0], x1
        ld1 {v3.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v2.8h
        ld1 {v1.8h}, [x0], x1
        urhadd v1.8h, v1.8h, v3.8h
        subs w4, w4, #2
        st1 {v0.8h}, [x5], x1
        st1 {v1.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg4_16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.4h}, [x2], x3
        ld1 {v0.4h}, [x0], x1
        ld1 {v3.4h}, [x2], x3
        urhadd v0.4h, v0.4h, v2.4h
        ld1 {v1.4h}, [x0], x1
        urhadd v1.4h, v1.4h, v3.4h
        subs w4, w4, #2
        st1 {v0.4h}, [x5], x1
        st1 {v1.4h}, [x5], x1
        b.ne 1b
        ret
endfunc
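
Each urhadd above is an unsigned rounding halving add; per pixel, the avg functions compute the following (scalar sketch):

    #include <stdint.h>

    static uint16_t urhadd_px(uint16_t dst, uint16_t ref)
    {
        // Rounding average, matching the NEON urhadd instruction.
        return (uint16_t)(((uint32_t)dst + ref + 1) >> 1);
    }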


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
// for size >= 16)
.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
        smlal \dst1\().4s, v20.4h, v0.h[\offset]
        smlal \dst5\().4s, v22.4h, v0.h[\offset]
.if \size >= 16
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
.endif
.if \size >= 8
        smlal2 \dst2\().4s, v20.8h, v0.h[\offset]
        smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
.endif
.if \size >= 16
        smlal \dst3\().4s, v21.4h, v0.h[\offset]
        smlal \dst7\().4s, v23.4h, v0.h[\offset]
        smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
        smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
.endif
.endm
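
Per output pixel, one extmlal invocation accumulates a single filter tap: ext slides the source window by 'offset' halfwords and smlal/smlal2 do a widening multiply-accumulate into 32-bit sums. A scalar sketch, where coeff stands for v0.h[offset]:

    #include <stdint.h>

    static void extmlal_px_sketch(int32_t *acc, const int16_t *src,
                                  int i, int offset, int16_t coeff)
    {
        // acc[i] accumulates tap 'offset' of the 8-tap filter for pixel i.
        acc[i] += src[i + offset] * (int32_t)coeff;
    }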


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width (in bytes) is passed in x5, the height in w4 and
// the filter coefficients in x9.
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        sub x2, x2, #6
        add x6, x0, x1
        add x7, x2, x3
        add x1, x1, x1
        add x3, x3, x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub x1, x1, x5
.endif
        // size >= 16 loads two qwords and increments r2,
        // for size 4/8 it's enough with one qword and no
        // postincrement
.if \size >= 16
        sub x3, x3, x5
        sub x3, x3, #16
.endif
        // Load the filter vector
        ld1 {v0.8h}, [x9]
1:
.if \size >= 16
        mov x9, x5
.endif
        // Load src
.if \size >= 16
        ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
        ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
.else
        ld1 {v5.8h, v6.8h}, [x2]
        ld1 {v16.8h, v17.8h}, [x7]
.endif
2:

        smull v1.4s, v5.4h, v0.h[0]
        smull v24.4s, v16.4h, v0.h[0]
.if \size >= 8
        smull2 v2.4s, v5.8h, v0.h[0]
        smull2 v25.4s, v16.8h, v0.h[0]
.endif
.if \size >= 16
        smull v3.4s, v6.4h, v0.h[0]
        smull v26.4s, v17.4h, v0.h[0]
        smull2 v4.4s, v6.8h, v0.h[0]
        smull2 v27.4s, v17.8h, v0.h[0]
.endif
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size

        // Round, shift and saturate
        // The sqrshrun takes care of clamping negative values to zero, but
        // we manually need to do umin with the max pixel value.
        sqrshrun v1.4h, v1.4s, #7
        sqrshrun v24.4h, v24.4s, #7
.if \size >= 8
        sqrshrun2 v1.8h, v2.4s, #7
        sqrshrun2 v24.8h, v25.4s, #7
        umin v1.8h, v1.8h, v31.8h
        umin v24.8h, v24.8h, v31.8h
.if \size >= 16
        sqrshrun v2.4h, v3.4s, #7
        sqrshrun v25.4h, v26.4s, #7
        sqrshrun2 v2.8h, v4.4s, #7
        sqrshrun2 v25.8h, v27.4s, #7
        umin v2.8h, v2.8h, v31.8h
        umin v25.8h, v25.8h, v31.8h
.endif
.else
        umin v1.4h, v1.4h, v31.4h
        umin v24.4h, v24.4h, v31.4h
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1 {v3.8h, v4.8h}, [x0]
        ld1 {v29.8h, v30.8h}, [x6]
        urhadd v1.8h, v1.8h, v3.8h
        urhadd v2.8h, v2.8h, v4.8h
        urhadd v24.8h, v24.8h, v29.8h
        urhadd v25.8h, v25.8h, v30.8h
.elseif \size >= 8
        ld1 {v3.8h}, [x0]
        ld1 {v4.8h}, [x6]
        urhadd v1.8h, v1.8h, v3.8h
        urhadd v24.8h, v24.8h, v4.8h
.else
        ld1 {v3.4h}, [x0]
        ld1 {v4.4h}, [x6]
        urhadd v1.4h, v1.4h, v3.4h
        urhadd v24.4h, v24.4h, v4.4h
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs x9, x9, #32
        st1 {v1.8h, v2.8h}, [x0], #32
        st1 {v24.8h, v25.8h}, [x6], #32
        b.eq 3f
        mov v5.16b, v7.16b
        mov v16.16b, v18.16b
        ld1 {v6.8h, v7.8h}, [x2], #32
        ld1 {v17.8h, v18.8h}, [x7], #32
        b 2b
.elseif \size == 8
        st1 {v1.8h}, [x0]
        st1 {v24.8h}, [x6]
.else // \size == 4
        st1 {v1.4h}, [x0]
        st1 {v24.4h}, [x6]
.endif
3:
        // Loop vertically
        add x0, x0, x1
        add x6, x6, x1
        add x2, x2, x3
        add x7, x7, x3
        subs w4, w4, #2
        b.ne 1b
        ret
endfunc
.endm
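
The sqrshrun #7 + umin pair in the macro above narrows each 32-bit sum back to a pixel; per pixel this amounts to the following (scalar sketch, max_pixel being what v31 holds):

    #include <stdint.h>

    static uint16_t narrow_px_sketch(int32_t acc, uint16_t max_pixel)
    {
        int32_t v = (acc + 64) >> 7;   // round, drop the filter scale of 128
        if (v < 0)
            v = 0;                     // sqrshrun saturates negatives to zero
        return v > max_pixel ? max_pixel : (uint16_t)v;  // the umin clamp
    }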

.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel x6, X(ff_vp9_subpel_filters), 256*\offset
        cmp w5, #8
        add x9, x6, w5, uxtw #4
        mov x5, #2*\size
.if \size >= 16
        b \type\()_8tap_16h
.else
        b \type\()_8tap_\size\()h
.endif
endfunc
.endm

.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp, 2, \size, \bpp
do_8tap_h_func avg, sharp, 2, \size, \bpp
do_8tap_h_func put, smooth, 0, \size, \bpp
do_8tap_h_func avg, smooth, 0, \size, \bpp
.endm

.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8, \bpp
do_8tap_h_filters 4, \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12


// Vertical filters

// Round, shift and saturate and store reg1-reg4
.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        sqrshrun \reg1\().4h, \reg1\().4s, #7
        sqrshrun \reg2\().4h, \reg2\().4s, #7
        sqrshrun \reg3\().4h, \reg3\().4s, #7
        sqrshrun \reg4\().4h, \reg4\().4s, #7
.ifc \type,avg
        ld1 {\tmp1\().4h}, [x7], x1
        ld1 {\tmp2\().4h}, [x7], x1
        ld1 {\tmp3\().4h}, [x7], x1
        ld1 {\tmp4\().4h}, [x7], x1
.endif
        umin \reg1\().4h, \reg1\().4h, \minreg\().4h
        umin \reg2\().4h, \reg2\().4h, \minreg\().4h
        umin \reg3\().4h, \reg3\().4h, \minreg\().4h
        umin \reg4\().4h, \reg4\().4h, \minreg\().4h
.ifc \type,avg
        urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h
        urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
        urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
        urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
.endif
        st1 {\reg1\().4h}, [x0], x1
        st1 {\reg2\().4h}, [x0], x1
        st1 {\reg3\().4h}, [x0], x1
        st1 {\reg4\().4h}, [x0], x1
.endm

// Round, shift and saturate and store reg1-8, where
// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
        sqrshrun \reg1\().4h, \reg1\().4s, #7
        sqrshrun2 \reg1\().8h, \reg2\().4s, #7
        sqrshrun \reg2\().4h, \reg3\().4s, #7
        sqrshrun2 \reg2\().8h, \reg4\().4s, #7
        sqrshrun \reg3\().4h, \reg5\().4s, #7
        sqrshrun2 \reg3\().8h, \reg6\().4s, #7
        sqrshrun \reg4\().4h, \reg7\().4s, #7
        sqrshrun2 \reg4\().8h, \reg8\().4s, #7
.ifc \type,avg
        ld1 {\reg5\().8h}, [x7], x1
        ld1 {\reg6\().8h}, [x7], x1
        ld1 {\reg7\().8h}, [x7], x1
        ld1 {\reg8\().8h}, [x7], x1
.endif
        umin \reg1\().8h, \reg1\().8h, \minreg\().8h
        umin \reg2\().8h, \reg2\().8h, \minreg\().8h
        umin \reg3\().8h, \reg3\().8h, \minreg\().8h
        umin \reg4\().8h, \reg4\().8h, \minreg\().8h
.ifc \type,avg
        urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h
        urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
        urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
        urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
.endif
        st1 {\reg1\().8h}, [x0], x1
        st1 {\reg2\().8h}, [x0], x1
        st1 {\reg3\().8h}, [x0], x1
        st1 {\reg4\().8h}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2).
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        smull \dst1\().4s, \src1\().4h, v0.h[0]
        smull \dst2\().4s, \src2\().4h, v0.h[0]
        smull \tmp1\().4s, \src2\().4h, v0.h[1]
        smull \tmp2\().4s, \src3\().4h, v0.h[1]
        smlal \dst1\().4s, \src3\().4h, v0.h[2]
        smlal \dst2\().4s, \src4\().4h, v0.h[2]
        smlal \tmp1\().4s, \src4\().4h, v0.h[3]
        smlal \tmp2\().4s, \src5\().4h, v0.h[3]
        smlal \dst1\().4s, \src5\().4h, v0.h[4]
        smlal \dst2\().4s, \src6\().4h, v0.h[4]
        smlal \tmp1\().4s, \src6\().4h, v0.h[5]
        smlal \tmp2\().4s, \src7\().4h, v0.h[5]
        smlal \dst1\().4s, \src7\().4h, v0.h[6]
        smlal \dst2\().4s, \src8\().4h, v0.h[6]
        smlal \tmp1\().4s, \src8\().4h, v0.h[7]
        smlal \tmp2\().4s, \src9\().4h, v0.h[7]
        add \dst1\().4s, \dst1\().4s, \tmp1\().4s
        add \dst2\().4s, \dst2\().4s, \tmp2\().4s
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
        smull \dst1\().4s, \src1\().4h, v0.h[0]
        smull2 \dst2\().4s, \src1\().8h, v0.h[0]
        smull \dst3\().4s, \src2\().4h, v0.h[0]
        smull2 \dst4\().4s, \src2\().8h, v0.h[0]
        smlal \dst1\().4s, \src2\().4h, v0.h[1]
        smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
        smlal \dst3\().4s, \src3\().4h, v0.h[1]
        smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
        smlal \dst1\().4s, \src3\().4h, v0.h[2]
        smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
        smlal \dst3\().4s, \src4\().4h, v0.h[2]
        smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
        smlal \dst1\().4s, \src4\().4h, v0.h[3]
        smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
        smlal \dst3\().4s, \src5\().4h, v0.h[3]
        smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
        smlal \dst1\().4s, \src5\().4h, v0.h[4]
        smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
        smlal \dst3\().4s, \src6\().4h, v0.h[4]
        smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
        smlal \dst1\().4s, \src6\().4h, v0.h[5]
        smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
        smlal \dst3\().4s, \src7\().4h, v0.h[5]
        smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
        smlal \dst1\().4s, \src7\().4h, v0.h[6]
        smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
        smlal \dst3\().4s, \src8\().4h, v0.h[6]
        smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
        smlal \dst1\().4s, \src8\().4h, v0.h[7]
        smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
        smlal \dst3\().4s, \src9\().4h, v0.h[7]
        smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6.
.macro do_8tap_8v type
function \type\()_8tap_8v
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
1:
.ifc \type,avg
        mov x7, x0
.endif
        mov x6, x4

        ld1 {v17.8h}, [x2], x3
        ld1 {v18.8h}, [x2], x3
        ld1 {v19.8h}, [x2], x3
        ld1 {v20.8h}, [x2], x3
        ld1 {v21.8h}, [x2], x3
        ld1 {v22.8h}, [x2], x3
        ld1 {v23.8h}, [x2], x3
2:
        ld1 {v24.8h}, [x2], x3
        ld1 {v25.8h}, [x2], x3
        ld1 {v26.8h}, [x2], x3
        ld1 {v27.8h}, [x2], x3

        convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
        convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
        do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type

        subs x6, x6, #4
        b.eq 8f

        ld1 {v16.8h}, [x2], x3
        ld1 {v17.8h}, [x2], x3
        ld1 {v18.8h}, [x2], x3
        ld1 {v19.8h}, [x2], x3
        convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
        convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
        do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type

        subs x6, x6, #4
        b.eq 8f

        ld1 {v20.8h}, [x2], x3
        ld1 {v21.8h}, [x2], x3
        ld1 {v22.8h}, [x2], x3
        ld1 {v23.8h}, [x2], x3
        convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
        convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
        do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type

        subs x6, x6, #4
        b.ne 2b

8:
        subs x5, x5, #8
        b.eq 9f
        // x0 -= h * dst_stride
        msub x0, x1, x4, x0
        // x2 -= h * src_stride
        msub x2, x3, x4, x2
        // x2 -= 8 * src_stride
        sub x2, x2, x3, lsl #3
        // x2 += 1 * src_stride
        add x2, x2, x3
        add x2, x2, #16
        add x0, x0, #16
        b 1b
9:
        ret
endfunc
.endm
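
The pointer bookkeeping in the '8:' block above reads as follows in C terms (a sketch of my reading of the msub/sub/add sequence): after finishing a column of height h, both pointers rewind vertically and step one 8-pixel (16-byte) column to the right, with the source also backing up over its 7-row filter lead-in:

    #include <stddef.h>
    #include <stdint.h>

    static void advance_column_sketch(uint8_t **dst, ptrdiff_t dst_stride,
                                      const uint8_t **src, ptrdiff_t src_stride,
                                      int h)
    {
        *dst += 16 - h * dst_stride;        // x0: msub, then add #16
        *src += 16 - (h + 7) * src_stride;  // x2: msub, -8 rows, +1 row, +16
    }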

do_8tap_8v put
do_8tap_8v avg


// Instantiate a vertical filter function for filtering a 4 pixels wide
// slice. This only is designed to work for 4 or 8 output lines.
.macro do_8tap_4v type
function \type\()_8tap_4v
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
.ifc \type,avg
        mov x7, x0
.endif

        ld1 {v16.4h}, [x2], x3
        ld1 {v17.4h}, [x2], x3
        ld1 {v18.4h}, [x2], x3
        ld1 {v19.4h}, [x2], x3
        ld1 {v20.4h}, [x2], x3
        ld1 {v21.4h}, [x2], x3
        ld1 {v22.4h}, [x2], x3
        ld1 {v23.4h}, [x2], x3
        ld1 {v24.4h}, [x2], x3
        ld1 {v25.4h}, [x2], x3
        ld1 {v26.4h}, [x2], x3

        convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
        convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
        do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type

        subs x4, x4, #4
        b.eq 9f

        ld1 {v27.4h}, [x2], x3
        ld1 {v28.4h}, [x2], x3
        ld1 {v29.4h}, [x2], x3
        ld1 {v30.4h}, [x2], x3

        convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
        convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
        do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type

9:
        ret
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg


.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        uxtw x4, w4
        mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel x5, X(ff_vp9_subpel_filters), 256*\offset
        add x6, x5, w6, uxtw #4
        mov x5, #\size
.if \size >= 8
        b \type\()_8tap_8v
.else
        b \type\()_8tap_4v
.endif
endfunc
.endm

.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp, 2, \size, \bpp
do_8tap_v_func avg, sharp, 2, \size, \bpp
do_8tap_v_func put, smooth, 0, \size, \bpp
do_8tap_v_func avg, smooth, 0, \size, \bpp
.endm

.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8, \bpp
do_8tap_v_filters 4, \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12
81
externals/ffmpeg/libavcodec/aarch64/vp9mc_aarch64.S
vendored
Executable file
@@ -0,0 +1,81 @@
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

function ff_vp9_copy128_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        ldp x9, x10, [x2, #32]
        stp x7, x8, [x0, #16]
        subs w4, w4, #1
        ldp x11, x12, [x2, #48]
        stp x9, x10, [x0, #32]
        stp x11, x12, [x0, #48]
        ldp x5, x6, [x2, #64]
        ldp x7, x8, [x2, #80]
        stp x5, x6, [x0, #64]
        ldp x9, x10, [x2, #96]
        stp x7, x8, [x0, #80]
        ldp x11, x12, [x2, #112]
        stp x9, x10, [x0, #96]
        stp x11, x12, [x0, #112]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc
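
For reference, the copy functions implement a plain row-by-row block copy of the vp9_mc_func signature quoted above (mx/my are unused in the unfiltered copy case); a scalar sketch:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void vp9_copy_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *ref, ptrdiff_t ref_stride,
                                int h, int w /* 128, 64 or 32 bytes */)
    {
        while (h--) {
            memcpy(dst, ref, w);
            dst += dst_stride;
            ref += ref_stride;
        }
    }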

function ff_vp9_copy64_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        ldp x9, x10, [x2, #32]
        stp x7, x8, [x0, #16]
        subs w4, w4, #1
        ldp x11, x12, [x2, #48]
        stp x9, x10, [x0, #32]
        stp x11, x12, [x0, #48]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy32_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        subs w4, w4, #1
        stp x7, x8, [x0, #16]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc
657
externals/ffmpeg/libavcodec/aarch64/vp9mc_neon.S
vendored
Executable file
@@ -0,0 +1,657 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Google Inc.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// All public functions in this file have the following signature:
|
||||
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
// const uint8_t *ref, ptrdiff_t ref_stride,
|
||||
// int h, int mx, int my);
|
||||
|
||||
function ff_vp9_avg64_neon, export=1
|
||||
mov x5, x0
|
||||
1:
|
||||
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
|
||||
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
|
||||
urhadd v0.16b, v0.16b, v4.16b
|
||||
urhadd v1.16b, v1.16b, v5.16b
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
|
||||
urhadd v2.16b, v2.16b, v6.16b
|
||||
urhadd v3.16b, v3.16b, v7.16b
|
||||
subs w4, w4, #2
|
||||
urhadd v16.16b, v16.16b, v20.16b
|
||||
urhadd v17.16b, v17.16b, v21.16b
|
||||
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
|
||||
urhadd v18.16b, v18.16b, v22.16b
|
||||
urhadd v19.16b, v19.16b, v23.16b
|
||||
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_vp9_avg32_neon, export=1
|
||||
1:
|
||||
ld1 {v2.16b, v3.16b}, [x2], x3
|
||||
ld1 {v0.16b, v1.16b}, [x0]
|
||||
urhadd v0.16b, v0.16b, v2.16b
|
||||
urhadd v1.16b, v1.16b, v3.16b
|
||||
subs w4, w4, #1
|
||||
st1 {v0.16b, v1.16b}, [x0], x1
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_vp9_copy16_neon, export=1
        add x5, x0, x1
        lsl x1, x1, #1
        add x6, x2, x3
        lsl x3, x3, #1
1:
        ld1 {v0.16b}, [x2], x3
        ld1 {v1.16b}, [x6], x3
        ld1 {v2.16b}, [x2], x3
        ld1 {v3.16b}, [x6], x3
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v1.16b}, [x5], x1
        st1 {v2.16b}, [x0], x1
        st1 {v3.16b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.16b}, [x2], x3
        ld1 {v0.16b}, [x0], x1
        ld1 {v3.16b}, [x2], x3
        urhadd v0.16b, v0.16b, v2.16b
        ld1 {v1.16b}, [x0], x1
        urhadd v1.16b, v1.16b, v3.16b
        subs w4, w4, #2
        st1 {v0.16b}, [x5], x1
        st1 {v1.16b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy8_neon, export=1
1:
        ld1 {v0.8b}, [x2], x3
        ld1 {v1.8b}, [x2], x3
        subs w4, w4, #2
        st1 {v0.8b}, [x0], x1
        st1 {v1.8b}, [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg8_neon, export=1
        mov x5, x0
1:
        ld1 {v2.8b}, [x2], x3
        ld1 {v0.8b}, [x0], x1
        ld1 {v3.8b}, [x2], x3
        urhadd v0.8b, v0.8b, v2.8b
        ld1 {v1.8b}, [x0], x1
        urhadd v1.8b, v1.8b, v3.8b
        subs w4, w4, #2
        st1 {v0.8b}, [x5], x1
        st1 {v1.8b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy4_neon, export=1
1:
        ld1 {v0.s}[0], [x2], x3
        ld1 {v1.s}[0], [x2], x3
        st1 {v0.s}[0], [x0], x1
        ld1 {v2.s}[0], [x2], x3
        st1 {v1.s}[0], [x0], x1
        ld1 {v3.s}[0], [x2], x3
        subs w4, w4, #4
        st1 {v2.s}[0], [x0], x1
        st1 {v3.s}[0], [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg4_neon, export=1
        mov x5, x0
1:
        ld1 {v2.s}[0], [x2], x3
        ld1 {v0.s}[0], [x0], x1
        ld1 {v2.s}[1], [x2], x3
        ld1 {v0.s}[1], [x0], x1
        ld1 {v3.s}[0], [x2], x3
        ld1 {v1.s}[0], [x0], x1
        ld1 {v3.s}[1], [x2], x3
        ld1 {v1.s}[1], [x0], x1
        subs w4, w4, #4
        urhadd v0.8b, v0.8b, v2.8b
        urhadd v1.8b, v1.8b, v3.8b
        st1 {v0.s}[0], [x5], x1
        st1 {v0.s}[1], [x5], x1
        st1 {v1.s}[0], [x5], x1
        st1 {v1.s}[1], [x5], x1
        b.ne 1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
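// For example, with offset 1 the first ext below yields the 16 bytes
// src1[2..15]:src2[0..1], i.e. the same row of widened pixels shifted
// along by one position - the sliding window needed for tap \offset.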
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mla \dst1\().8h, v20.8h, v0.h[\offset]
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mla \dst3\().8h, v22.8h, v0.h[\offset]
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mla \dst2\().8h, v21.8h, v0.h[\offset]
        mla \dst4\().8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mla \dst1\().8h, v20.8h, v0.h[\offset]
        mla \dst3\().8h, v22.8h, v0.h[\offset]
.else
        mla \dst1\().4h, v20.4h, v0.h[\offset]
        mla \dst3\().4h, v22.4h, v0.h[\offset]
.endif
.endm
// The same as above, but instead of accumulating straight into the
// destination, use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mul v20.8h, v20.8h, v0.h[\offset]
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mul v22.8h, v22.8h, v0.h[\offset]
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mul v21.8h, v21.8h, v0.h[\offset]
        mul v23.8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mul v20.8h, v20.8h, v0.h[\offset]
        mul v22.8h, v22.8h, v0.h[\offset]
.else
        mul v20.4h, v20.4h, v0.h[\offset]
        mul v22.4h, v22.4h, v0.h[\offset]
.endif
.if \size == 4
        sqadd \dst1\().4h, \dst1\().4h, v20.4h
        sqadd \dst3\().4h, \dst3\().4h, v22.4h
.else
        sqadd \dst1\().8h, \dst1\().8h, v20.8h
        sqadd \dst3\().8h, \dst3\().8h, v22.8h
.if \size >= 16
        sqadd \dst2\().8h, \dst2\().8h, v21.8h
        sqadd \dst4\().8h, \dst4\().8h, v23.8h
.endif
.endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one of them.
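// For instance, "do_8tap_h put, 16, 3, 4" expands to put_8tap_16h_34,
// the variant used when tap 4 holds the largest coefficient.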
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub x2, x2, #3
        add x6, x0, x1
        add x7, x2, x3
        add x1, x1, x1
        add x3, x3, x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub x1, x1, x5
.endif
        // size >= 16 loads two qwords and increments x2;
        // for size 4/8 a single qword with no postincrement
        // is enough
.if \size >= 16
        sub x3, x3, x5
        sub x3, x3, #8
.endif
        // Load the filter vector
        ld1 {v0.8h}, [x9]
1:
.if \size >= 16
        mov x9, x5
.endif
        // Load src
.if \size >= 16
        ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24
        ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24
.else
        ld1 {v4.8b, v5.8b}, [x2]
        ld1 {v16.8b, v17.8b}, [x7]
.endif
        uxtl v4.8h, v4.8b
        uxtl v5.8h, v5.8b
        uxtl v16.8h, v16.8b
        uxtl v17.8h, v17.8b
.if \size >= 16
        uxtl v6.8h, v6.8b
        uxtl v18.8h, v18.8b
.endif
2:

        // Accumulate, adding idx2 last with a separate
        // saturating add. The positive filter coefficients
        // for all indices except idx2 must add up to less
        // than 127 for this not to overflow.
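        // (Editor's worked bound: pixels are at most 255, so the plain
        // mla sums stay below 255*127 = 32385 < 32767 and cannot wrap a
        // signed 16-bit lane; only the idx2 tap needs the saturating add.)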
        mul v1.8h, v4.8h, v0.h[0]
        mul v24.8h, v16.8h, v0.h[0]
.if \size >= 16
        mul v2.8h, v5.8h, v0.h[0]
        mul v25.8h, v17.8h, v0.h[0]
.endif
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
        extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size

        // Round, shift and saturate
        sqrshrun v1.8b, v1.8h, #7
        sqrshrun v24.8b, v24.8h, #7
.if \size >= 16
        sqrshrun2 v1.16b, v2.8h, #7
        sqrshrun2 v24.16b, v25.8h, #7
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1 {v2.16b}, [x0]
        ld1 {v3.16b}, [x6]
        urhadd v1.16b, v1.16b, v2.16b
        urhadd v24.16b, v24.16b, v3.16b
.elseif \size == 8
        ld1 {v2.8b}, [x0]
        ld1 {v3.8b}, [x6]
        urhadd v1.8b, v1.8b, v2.8b
        urhadd v24.8b, v24.8b, v3.8b
.else
        ld1 {v2.s}[0], [x0]
        ld1 {v3.s}[0], [x6]
        urhadd v1.8b, v1.8b, v2.8b
        urhadd v24.8b, v24.8b, v3.8b
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs x9, x9, #16
        st1 {v1.16b}, [x0], #16
        st1 {v24.16b}, [x6], #16
        b.eq 3f
        mov v4.16b, v6.16b
        mov v16.16b, v18.16b
        ld1 {v6.16b}, [x2], #16
        ld1 {v18.16b}, [x7], #16
        uxtl v5.8h, v6.8b
        uxtl2 v6.8h, v6.16b
        uxtl v17.8h, v18.8b
        uxtl2 v18.8h, v18.16b
        b 2b
.elseif \size == 8
        st1 {v1.8b}, [x0]
        st1 {v24.8b}, [x6]
.else // \size == 4
        st1 {v1.s}[0], [x0]
        st1 {v24.s}[0], [x6]
.endif
3:
        // Loop vertically
        add x0, x0, x1
        add x6, x6, x1
        add x2, x2, x3
        add x7, x7, x3
        subs w4, w4, #2
        b.ne 1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16
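
// Editor's note on the dispatch below: ff_vp9_subpel_filters holds one
// 256-byte bank per filter type (smooth, regular, sharp - hence the
// 256*\offset), each bank holding 16 subpel positions of eight 16-bit
// taps, so the mx index is scaled by 16 bytes via "uxtw #4".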

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrel x6, X(ff_vp9_subpel_filters), 256*\offset
        cmp w5, #8
        add x9, x6, w5, uxtw #4
        mov x5, #\size
.if \size >= 16
        b.ge \type\()_8tap_16h_34
        b \type\()_8tap_16h_43
.else
        b.ge \type\()_8tap_\size\()h_34
        b \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp, 2, \size
do_8tap_h_func avg, sharp, 2, \size
do_8tap_h_func put, smooth, 0, \size
do_8tap_h_func avg, smooth, 0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4


// Vertical filters

// Round, shift and saturate and store reg1-reg2 over 4 lines
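// (sqrshrun #7 is a rounding shift right by 7 with unsigned saturating
// narrow; the eight taps sum to 128, so (x + 64) >> 7 returns the
// accumulator to the 8-bit pixel range.)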
.macro do_store4 reg1, reg2, tmp1, tmp2, type
        sqrshrun \reg1\().8b, \reg1\().8h, #7
        sqrshrun \reg2\().8b, \reg2\().8h, #7
.ifc \type,avg
        ld1 {\tmp1\().s}[0], [x7], x1
        ld1 {\tmp2\().s}[0], [x7], x1
        ld1 {\tmp1\().s}[1], [x7], x1
        ld1 {\tmp2\().s}[1], [x7], x1
        urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
.endif
        st1 {\reg1\().s}[0], [x0], x1
        st1 {\reg2\().s}[0], [x0], x1
        st1 {\reg1\().s}[1], [x0], x1
        st1 {\reg2\().s}[1], [x0], x1
.endm

// Round, shift and saturate and store reg1-4
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
        sqrshrun \reg1\().8b, \reg1\().8h, #7
        sqrshrun \reg2\().8b, \reg2\().8h, #7
        sqrshrun \reg3\().8b, \reg3\().8h, #7
        sqrshrun \reg4\().8b, \reg4\().8h, #7
.ifc \type,avg
        ld1 {\tmp1\().8b}, [x7], x1
        ld1 {\tmp2\().8b}, [x7], x1
        ld1 {\tmp3\().8b}, [x7], x1
        ld1 {\tmp4\().8b}, [x7], x1
        urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
        urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b
        urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b
.endif
        st1 {\reg1\().8b}, [x0], x1
        st1 {\reg2\().8b}, [x0], x1
        st1 {\reg3\().8b}, [x0], x1
        st1 {\reg4\().8b}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
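// (Editor's reading: dst1-dst2 then hold only the moderate positive
// taps, which cannot overflow by themselves, while pairing the largest
// tap with the two non-positive ones in tmp1-tmp2 only shrinks tmp's
// magnitude, leaving the final sqadd as the one place that saturates.)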
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        mul \dst1\().8h, \src2\().8h, v0.h[1]
        mul \dst2\().8h, \src3\().8h, v0.h[1]
        mul \tmp1\().8h, \src1\().8h, v0.h[0]
        mul \tmp2\().8h, \src2\().8h, v0.h[0]
        mla \dst1\().8h, \src3\().8h, v0.h[2]
        mla \dst2\().8h, \src4\().8h, v0.h[2]
.if \idx1 == 3
        mla \dst1\().8h, \src4\().8h, v0.h[3]
        mla \dst2\().8h, \src5\().8h, v0.h[3]
.else
        mla \dst1\().8h, \src5\().8h, v0.h[4]
        mla \dst2\().8h, \src6\().8h, v0.h[4]
.endif
        mla \dst1\().8h, \src6\().8h, v0.h[5]
        mla \dst2\().8h, \src7\().8h, v0.h[5]
        mla \tmp1\().8h, \src8\().8h, v0.h[7]
        mla \tmp2\().8h, \src9\().8h, v0.h[7]
        mla \dst1\().8h, \src7\().8h, v0.h[6]
        mla \dst2\().8h, \src8\().8h, v0.h[6]
.if \idx2 == 3
        mla \tmp1\().8h, \src4\().8h, v0.h[3]
        mla \tmp2\().8h, \src5\().8h, v0.h[3]
.else
        mla \tmp1\().8h, \src5\().8h, v0.h[4]
        mla \tmp2\().8h, \src6\().8h, v0.h[4]
.endif
        sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h
        sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm

// Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        ld1 {v1.8b}, [x2], x3
        ld1 {v2.8b}, [x2], x3
        ld1 {v3.8b}, [x2], x3
.ifnb \dst4
        ld1 {v4.8b}, [x2], x3
.endif
        uxtl \dst1\().8h, v1.8b
        uxtl \dst2\().8h, v2.8b
        uxtl \dst3\().8h, v3.8b
.ifnb \dst4
        uxtl \dst4\().8h, v4.8b
.endif
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other one of them.
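// The inner loop consumes rows in a rolling pattern: its three 4-row
// steps load into different parts of v16-v27 in turn, so the seven
// rows of context the 8-tap filter needs are never reloaded.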
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
1:
.ifc \type,avg
        mov x7, x0
.endif
        mov x6, x4

        loadl v17, v18, v19

        loadl v20, v21, v22, v23
2:
        loadl v24, v25, v26, v27
        convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
        convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.eq 8f

        loadl v16, v17, v18, v19
        convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
        convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.eq 8f

        loadl v20, v21, v22, v23
        convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
        convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.ne 2b

8:
        subs x5, x5, #8
        b.eq 9f
        // x0 -= h * dst_stride
        msub x0, x1, x4, x0
        // x2 -= h * src_stride
        msub x2, x3, x4, x2
        // x2 -= 8 * src_stride
        sub x2, x2, x3, lsl #3
        // x2 += 1 * src_stride
        add x2, x2, x3
        add x2, x2, #8
        add x0, x0, #8
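        // Net effect: both pointers return to the first row of the block
        // (x2 keeping its 3-row lead-in), moved 8 pixels to the right for
        // the next vertical stripe.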
        b 1b
9:
        ret
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3


// Instantiate a vertical filter function for filtering a 4 pixels wide
// slice. The first half of the registers contains one row, while the second
// half of a register contains the second-next row (also stored in the first
// half of the register two steps ahead). The convolution does two outputs
// at a time; the output of v17-v24 into one, and v18-v25 into another one.
// The first half of the first output is the first output row, the first
// half of the other output is the second output row. The second halves of
// the registers are rows 3 and 4.
// This is only designed to work for 4 or 8 output lines.
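// Concretely, after the trn1/uxtl setup below, v17 = {row0 | row2} and
// v18 = {row1 | row3} as 4-pixel halves, so the v17-v24 convolve
// produces output rows 0 and 2 and the v18-v25 convolve rows 1 and 3.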
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
.ifc \type,avg
        mov x7, x0
.endif

        ld1 {v1.s}[0], [x2], x3
        ld1 {v2.s}[0], [x2], x3
        ld1 {v3.s}[0], [x2], x3
        ld1 {v4.s}[0], [x2], x3
        ld1 {v5.s}[0], [x2], x3
        ld1 {v6.s}[0], [x2], x3
        trn1 v1.2s, v1.2s, v3.2s
        ld1 {v7.s}[0], [x2], x3
        trn1 v2.2s, v2.2s, v4.2s
        ld1 {v26.s}[0], [x2], x3
        uxtl v17.8h, v1.8b
        trn1 v3.2s, v3.2s, v5.2s
        ld1 {v27.s}[0], [x2], x3
        uxtl v18.8h, v2.8b
        trn1 v4.2s, v4.2s, v6.2s
        ld1 {v28.s}[0], [x2], x3
        uxtl v19.8h, v3.8b
        trn1 v5.2s, v5.2s, v7.2s
        ld1 {v29.s}[0], [x2], x3
        uxtl v20.8h, v4.8b
        trn1 v6.2s, v6.2s, v26.2s
        uxtl v21.8h, v5.8b
        trn1 v7.2s, v7.2s, v27.2s
        uxtl v22.8h, v6.8b
        trn1 v26.2s, v26.2s, v28.2s
        uxtl v23.8h, v7.8b
        trn1 v27.2s, v27.2s, v29.2s
        uxtl v24.8h, v26.8b
        uxtl v25.8h, v27.8b

        convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
        do_store4 v1, v2, v5, v6, \type

        subs x4, x4, #4
        b.eq 9f

        ld1 {v1.s}[0], [x2], x3
        ld1 {v2.s}[0], [x2], x3
        trn1 v28.2s, v28.2s, v1.2s
        trn1 v29.2s, v29.2s, v2.2s
        ld1 {v1.s}[1], [x2], x3
        uxtl v26.8h, v28.8b
        ld1 {v2.s}[1], [x2], x3
        uxtl v27.8h, v29.8b
        uxtl v28.8h, v1.8b
        uxtl v29.8h, v2.8b

        convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
        do_store4 v1, v2, v5, v6, \type

9:
        ret
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3


.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        uxtw x4, w4
        movrel x5, X(ff_vp9_subpel_filters), 256*\offset
        cmp w6, #8
        add x6, x5, w6, uxtw #4
        mov x5, #\size
.if \size >= 8
        b.ge \type\()_8tap_8v_34
        b \type\()_8tap_8v_43
.else
        b.ge \type\()_8tap_4v_34
        b \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp, 2, \size
do_8tap_v_func avg, sharp, 2, \size
do_8tap_v_func put, smooth, 0, \size
do_8tap_v_func avg, smooth, 0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4