early-access version 1432

2021-02-09 04:25:58 +01:00
parent de64eab4b4
commit 3d5a9d908a
7336 changed files with 1773492 additions and 111 deletions
--- a/externals/ffmpeg/libavcodec/x86/Makefile
+++ b/externals/ffmpeg/libavcodec/x86/Makefile
@@ -0,0 +1,199 @@
+OBJS                                   += x86/constants.o               \
+
+# subsystems
+OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
+OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
+OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
+OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
+OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
+OBJS-$(CONFIG_DIRAC_DECODER)           += x86/diracdsp_init.o           \
+                                          x86/dirac_dwt_init.o
+OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
+OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
+OBJS-$(CONFIG_FLACDSP)                 += x86/flacdsp_init.o
+OBJS-$(CONFIG_FMTCONVERT)              += x86/fmtconvert_init.o
+OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
+OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
+OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
+OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
+OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
+OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
+OBJS-$(CONFIG_LLVIDENCDSP)             += x86/lossless_videoencdsp_init.o
+OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
+OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_init.o
+OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
+OBJS-$(CONFIG_LPC)                     += x86/lpc.o
+OBJS-$(CONFIG_MDCT15)                  += x86/mdct15_init.o
+OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
+OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
+OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
+                                          x86/mpegvideodsp.o
+OBJS-$(CONFIG_MPEGVIDEOENC)            += x86/mpegvideoenc.o           \
+                                          x86/mpegvideoencdsp_init.o
+OBJS-$(CONFIG_PIXBLOCKDSP)             += x86/pixblockdsp_init.o
+OBJS-$(CONFIG_QPELDSP)                 += x86/qpeldsp_init.o
+OBJS-$(CONFIG_RV34DSP)                 += x86/rv34dsp_init.o
+OBJS-$(CONFIG_VC1DSP)                  += x86/vc1dsp_init.o
+OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
+OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
+OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
+OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
+
+# decoders/encoders
+OBJS-$(CONFIG_AAC_DECODER)             += x86/aacpsdsp_init.o          \
+                                          x86/sbrdsp_init.o
+OBJS-$(CONFIG_AAC_ENCODER)             += x86/aacencdsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_DECODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
+OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
+OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
+OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
+OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
+OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
+OBJS-$(CONFIG_OPUS_DECODER)            += x86/opusdsp_init.o
+OBJS-$(CONFIG_OPUS_ENCODER)            += x86/celt_pvq_init.o
+OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
+OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
+OBJS-$(CONFIG_LSCR_DECODER)            += x86/pngdsp_init.o
+OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
+OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
+OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
+OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
+OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
+OBJS-$(CONFIG_SBC_ENCODER)             += x86/sbcdsp_init.o
+OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
+OBJS-$(CONFIG_TAK_DECODER)             += x86/takdsp_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
+OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
+OBJS-$(CONFIG_TTA_ENCODER)             += x86/ttaencdsp_init.o
+OBJS-$(CONFIG_UTVIDEO_DECODER)         += x86/utvideodsp_init.o
+OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
+OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
+OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
+OBJS-$(CONFIG_VP3_DECODER)             += x86/hpeldsp_vp3_init.o
+OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o            \
+                                          x86/vp9dsp_init_10bpp.o      \
+                                          x86/vp9dsp_init_12bpp.o      \
+                                          x86/vp9dsp_init_16bpp.o
+OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
+
+
+# GCC inline assembly optimizations
+# subsystems
+MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
+MMX-OBJS-$(CONFIG_VC1DSP)              += x86/vc1dsp_mmx.o
+
+# decoders/encoders
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
+
+# subsystems
+X86ASM-OBJS-$(CONFIG_AC3DSP)           += x86/ac3dsp.o                  \
+                                          x86/ac3dsp_downmix.o
+X86ASM-OBJS-$(CONFIG_AUDIODSP)         += x86/audiodsp.o
+X86ASM-OBJS-$(CONFIG_BLOCKDSP)         += x86/blockdsp.o
+X86ASM-OBJS-$(CONFIG_BSWAPDSP)         += x86/bswapdsp.o
+X86ASM-OBJS-$(CONFIG_DCT)              += x86/dct32.o
+X86ASM-OBJS-$(CONFIG_FFT)              += x86/fft.o
+X86ASM-OBJS-$(CONFIG_FMTCONVERT)       += x86/fmtconvert.o
+X86ASM-OBJS-$(CONFIG_H263DSP)          += x86/h263_loopfilter.o
+X86ASM-OBJS-$(CONFIG_H264CHROMA)       += x86/h264_chromamc.o           \
+                                          x86/h264_chromamc_10bit.o
+X86ASM-OBJS-$(CONFIG_H264DSP)          += x86/h264_deblock.o            \
+                                          x86/h264_deblock_10bit.o      \
+                                          x86/h264_idct.o               \
+                                          x86/h264_idct_10bit.o         \
+                                          x86/h264_weight.o             \
+                                          x86/h264_weight_10bit.o
+X86ASM-OBJS-$(CONFIG_H264PRED)         += x86/h264_intrapred.o          \
+                                          x86/h264_intrapred_10bit.o
+X86ASM-OBJS-$(CONFIG_H264QPEL)         += x86/h264_qpel_8bit.o          \
+                                          x86/h264_qpel_10bit.o         \
+                                          x86/fpel.o                    \
+                                          x86/qpel.o
+X86ASM-OBJS-$(CONFIG_HPELDSP)          += x86/fpel.o                    \
+                                          x86/hpeldsp.o
+X86ASM-OBJS-$(CONFIG_HUFFYUVDSP)       += x86/huffyuvdsp.o
+X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP)    += x86/huffyuvencdsp.o
+X86ASM-OBJS-$(CONFIG_IDCTDSP)          += x86/idctdsp.o
+X86ASM-OBJS-$(CONFIG_LLAUDDSP)         += x86/lossless_audiodsp.o
+X86ASM-OBJS-$(CONFIG_LLVIDDSP)         += x86/lossless_videodsp.o
+X86ASM-OBJS-$(CONFIG_LLVIDENCDSP)      += x86/lossless_videoencdsp.o
+X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o
+X86ASM-OBJS-$(CONFIG_ME_CMP)           += x86/me_cmp.o
+X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP)     += x86/imdct36.o
+X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC)     += x86/mpegvideoencdsp.o
+X86ASM-OBJS-$(CONFIG_OPUS_DECODER)     += x86/opusdsp.o
+X86ASM-OBJS-$(CONFIG_OPUS_ENCODER)     += x86/celt_pvq_search.o
+X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP)      += x86/pixblockdsp.o
+X86ASM-OBJS-$(CONFIG_QPELDSP)          += x86/qpeldsp.o                 \
+                                          x86/fpel.o                    \
+                                          x86/qpel.o
+X86ASM-OBJS-$(CONFIG_RV34DSP)          += x86/rv34dsp.o
+X86ASM-OBJS-$(CONFIG_VC1DSP)           += x86/vc1dsp_loopfilter.o       \
+                                          x86/vc1dsp_mc.o
+X86ASM-OBJS-$(CONFIG_IDCTDSP)          += x86/simple_idct10.o           \
+                                          x86/simple_idct.o
+X86ASM-OBJS-$(CONFIG_VIDEODSP)         += x86/videodsp.o
+X86ASM-OBJS-$(CONFIG_VP3DSP)           += x86/vp3dsp.o
+X86ASM-OBJS-$(CONFIG_VP8DSP)           += x86/vp8dsp.o                  \
+                                          x86/vp8dsp_loopfilter.o
+
+# decoders/encoders
+X86ASM-OBJS-$(CONFIG_AAC_DECODER)      += x86/aacpsdsp.o                \
+                                          x86/sbrdsp.o
+X86ASM-OBJS-$(CONFIG_AAC_ENCODER)      += x86/aacencdsp.o
+X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
+X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
+X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
+X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
+X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
+X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
+                                          x86/dirac_dwt.o
+X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)    += x86/dnxhdenc.o
+X86ASM-OBJS-$(CONFIG_EXR_DECODER)      += x86/exrdsp.o
+X86ASM-OBJS-$(CONFIG_FLAC_DECODER)     += x86/flacdsp.o
+ifdef CONFIG_GPL
+X86ASM-OBJS-$(CONFIG_FLAC_ENCODER)     += x86/flac_dsp_gpl.o
+endif
+X86ASM-OBJS-$(CONFIG_HEVC_DECODER)     += x86/hevc_add_res.o            \
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o               \
+                                          x86/hevc_mc.o                 \
+                                          x86/hevc_sao.o                \
+                                          x86/hevc_sao_10bit.o
+X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
+X86ASM-OBJS-$(CONFIG_LSCR_DECODER)     += x86/pngdsp.o
+X86ASM-OBJS-$(CONFIG_MLP_DECODER)      += x86/mlpdsp.o
+X86ASM-OBJS-$(CONFIG_MPEG4_DECODER)    += x86/xvididct.o
+X86ASM-OBJS-$(CONFIG_PNG_DECODER)      += x86/pngdsp.o
+X86ASM-OBJS-$(CONFIG_PRORES_DECODER)   += x86/proresdsp.o
+X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
+X86ASM-OBJS-$(CONFIG_RV40_DECODER)     += x86/rv40dsp.o
+X86ASM-OBJS-$(CONFIG_SBC_ENCODER)      += x86/sbcdsp.o
+X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER)     += x86/svq1enc.o
+X86ASM-OBJS-$(CONFIG_TAK_DECODER)      += x86/takdsp.o
+X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER)   += x86/mlpdsp.o
+X86ASM-OBJS-$(CONFIG_TTA_DECODER)      += x86/ttadsp.o
+X86ASM-OBJS-$(CONFIG_TTA_ENCODER)      += x86/ttaencdsp.o
+X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER)  += x86/utvideodsp.o
+X86ASM-OBJS-$(CONFIG_V210_ENCODER)     += x86/v210enc.o
+X86ASM-OBJS-$(CONFIG_V210_DECODER)     += x86/v210.o
+X86ASM-OBJS-$(CONFIG_VORBIS_DECODER)   += x86/vorbisdsp.o
+X86ASM-OBJS-$(CONFIG_VP3_DECODER)      += x86/hpeldsp_vp3.o
+X86ASM-OBJS-$(CONFIG_VP6_DECODER)      += x86/vp6dsp.o
+X86ASM-OBJS-$(CONFIG_VP9_DECODER)      += x86/vp9intrapred.o            \
+                                          x86/vp9intrapred_16bpp.o      \
+                                          x86/vp9itxfm.o                \
+                                          x86/vp9itxfm_16bpp.o          \
+                                          x86/vp9lpf.o                  \
+                                          x86/vp9lpf_16bpp.o            \
+                                          x86/vp9mc.o                   \
+                                          x86/vp9mc_16bpp.o
+X86ASM-OBJS-$(CONFIG_WEBP_DECODER)     += x86/vp8dsp.o
--- a/externals/ffmpeg/libavcodec/x86/aacencdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/aacencdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD optimized AAC encoder DSP functions
+;*
+;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+float_abs_mask: times 4 dd 0x7fffffff
+
+SECTION .text
+
+;*******************************************************************
+;void ff_abs_pow34(float *out, const float *in, const int size);
+;*******************************************************************
+INIT_XMM sse
+cglobal abs_pow34, 3, 3, 3, out, in, size
+    mova   m2, [float_abs_mask]
+    shl    sizeq, 2
+    add    inq, sizeq
+    add    outq, sizeq
+    neg    sizeq
+.loop:
+    andps  m0, m2, [inq+sizeq]
+    sqrtps m1, m0
+    mulps  m0, m1
+    sqrtps m0, m0
+    mova   [outq+sizeq], m0
+    add    sizeq, mmsize
+    jl    .loop
+    RET
+
+;*******************************************************************
+;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
+;                           int size, int is_signed, int maxval, const float Q34,
+;                           const float rounding)
+;*******************************************************************
+INIT_XMM sse2
+cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
+%if UNIX64 == 0
+    movss     m0, Q34m
+    movss     m1, roundingm
+    cvtsi2ss  m3, dword maxvalm
+%else
+    cvtsi2ss  m3, maxvald
+%endif
+    shufps    m0, m0, 0
+    shufps    m1, m1, 0
+    shufps    m3, m3, 0
+    shl       is_signedd, 31
+    movd      m4, is_signedd
+    shufps    m4, m4, 0
+    shl       sized,   2
+    add       inq, sizeq
+    add       outq, sizeq
+    add       scaledq, sizeq
+    neg       sizeq
+.loop:
+    mulps     m2, m0, [scaledq+sizeq]
+    addps     m2, m1
+    minps     m2, m3
+    andps     m5, m4, [inq+sizeq]
+    orps      m2, m5
+    cvttps2dq m2, m2
+    mova      [outq+sizeq], m2
+    add       sizeq, mmsize
+    jl       .loop
+    RET
--- a/externals/ffmpeg/libavcodec/x86/aacencdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/aacencdsp_init.c
@@ -0,0 +1,43 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_sse(float *out, const float *in, const int size);
+
+void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
+                                int size, int is_signed, int maxval, const float Q34,
+                                const float rounding);
+
+av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE(cpu_flags))
+        s->abs_pow34   = ff_abs_pow34_sse;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->quant_bands = ff_aac_quantize_bands_sse2;
+}
--- a/externals/ffmpeg/libavcodec/x86/aacpsdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/aacpsdsp.asm
@@ -0,0 +1,487 @@
+;******************************************************************************
+;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
+
+SECTION .text
+
+;*************************************************************************
+;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
+;*************************************************************************
+%macro PS_ADD_SQUARES 1
+cglobal ps_add_squares, 3, 3, %1, dst, src, n
+    shl    nd, 3
+    add  srcq, nq
+    neg    nq
+
+align 16
+.loop:
+    movaps m0, [srcq+nq]
+    movaps m1, [srcq+nq+mmsize]
+    mulps  m0, m0
+    mulps  m1, m1
+    HADDPS m0, m1, m2
+    addps  m0, [dstq]
+    movaps [dstq], m0
+    add  dstq, mmsize
+    add    nq, mmsize*2
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_ADD_SQUARES 2
+INIT_XMM sse3
+PS_ADD_SQUARES 3
+
+;*******************************************************************
+;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
+;                                   float *src1, int n);
+;*******************************************************************
+INIT_XMM sse
+cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
+    shl      nd, 3
+    add   src1q, nq
+    add    dstq, nq
+    neg      nq
+
+align 16
+.loop:
+    movu     m0, [src1q+nq]
+    movu     m1, [src1q+nq+mmsize]
+    mova     m2, [src2q]
+    mova     m3, m2
+    unpcklps m2, m2
+    unpckhps m3, m3
+    mulps    m0, m2
+    mulps    m1, m3
+    mova [dstq+nq], m0
+    mova [dstq+nq+mmsize], m1
+    add   src2q, mmsize
+    add      nq, mmsize*2
+    jl .loop
+    REP_RET
+
+;***********************************************************************
+;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+;                                   float h[2][4], float h_step[2][4],
+;                                   int len);
+;***********************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
+    movaps   m0, [hq]
+    movaps   m1, [h_stepq]
+    unpcklps m4, m0, m0
+    unpckhps m0, m0
+    unpcklps m5, m1, m1
+    unpckhps m1, m1
+    shl      nd, 3
+    add      lq, nq
+    add      rq, nq
+    neg      nq
+
+align 16
+.loop:
+    addps    m4, m5
+    addps    m0, m1
+    movddup  m2, [lq+nq]
+    movddup  m3, [rq+nq]
+    mulps    m2, m4
+    mulps    m3, m0
+    addps    m2, m3
+    movsd  [lq+nq], m2
+    movhps [rq+nq], m2
+    add      nq, 8
+    jl .loop
+    REP_RET
+
+;***************************************************************************
+;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+;                                       float h[2][4], float h_step[2][4],
+;                                       int len);
+;***************************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
+    movaps   m0, [hq]
+    movaps   m1, [hq+mmsize]
+%if ARCH_X86_64
+    movaps   m8, [h_stepq]
+    movaps   m9, [h_stepq+mmsize]
+    %define  H_STEP0 m8
+    %define  H_STEP1 m9
+%else
+    %define  H_STEP0 [h_stepq]
+    %define  H_STEP1 [h_stepq+mmsize]
+%endif
+    shl      nd, 3
+    add      lq, nq
+    add      rq, nq
+    neg      nq
+
+align 16
+.loop:
+    addps    m0, H_STEP0
+    addps    m1, H_STEP1
+    movddup  m2, [lq+nq]
+    movddup  m3, [rq+nq]
+    shufps   m4, m2, m2, q2301
+    shufps   m5, m3, m3, q2301
+    unpcklps m6, m0, m0
+    unpckhps m7, m0, m0
+    mulps    m2, m6
+    mulps    m3, m7
+    unpcklps m6, m1, m1
+    unpckhps m7, m1, m1
+    mulps    m4, m6
+    mulps    m5, m7
+    addps    m2, m3
+    addsubps m2, m4
+    addsubps m2, m5
+    movsd  [lq+nq], m2
+    movhps [rq+nq], m2
+    add      nq, 8
+    jl .loop
+    REP_RET
+
+;**********************************************************
+;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
+;                                   float (*in)[32][2],
+;                                   int i, int len)
+;**********************************************************
+INIT_XMM sse
+cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
+    movsxdifnidn        iq, id
+    mov               lend, 32 << 3
+    lea                inq, [inq+iq*4]
+    mov               tmpd, id
+    shl               tmpd, 8
+    add               outq, tmpq
+    mov               tmpd, 64
+    sub               tmpd, id
+    mov                 id, tmpd
+
+    test                id, 1
+    jne .loop4
+    test                id, 2
+    jne .loop8
+
+align 16
+.loop16:
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q
+    mov               tmpd, lend
+
+.inner_loop16:
+    movaps              m0, [in0q]
+    movaps              m1, [in1q]
+    movaps              m2, [in0q+lenq]
+    movaps              m3, [in1q+lenq]
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4
+    movaps          [outq], m0
+    movaps     [outq+lenq], m1
+    movaps   [outq+lenq*2], m2
+    movaps [outq+3*32*2*4], m3
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop16
+    add                inq, 16
+    add               outq, 3*32*2*4
+    sub                 id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q
+    mov               tmpd, lend
+
+.inner_loop8:
+    movlps              m0, [in0q]
+    movlps              m1, [in1q]
+    movhps              m0, [in0q+lenq]
+    movhps              m1, [in1q+lenq]
+    SBUTTERFLYPS 0, 1, 2
+    SBUTTERFLYPD 0, 1, 2
+    movaps          [outq], m0
+    movaps     [outq+lenq], m1
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop8
+    add                inq, 8
+    add               outq, lenq
+    sub                 id, 2
+    jg .loop16
+    RET
+
+align 16
+.loop4:
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q
+    mov               tmpd, lend
+
+.inner_loop4:
+    movss               m0, [in0q]
+    movss               m1, [in1q]
+    movss               m2, [in0q+lenq]
+    movss               m3, [in1q+lenq]
+    movlhps             m0, m1
+    movlhps             m2, m3
+    shufps              m0, m2, q2020
+    movaps          [outq], m0
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop4
+    add                inq, 4
+    sub                 id, 1
+    test                id, 2
+    jne .loop8
+    cmp                 id, 4
+    jge .loop16
+    RET
+
+;***********************************************************
+;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
+;                                    float (*in)[32][2],
+;                                    int i, int len)
+;***********************************************************
+%macro HYBRID_SYNTHESIS_DEINT 0
+cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
+%if cpuflag(sse4)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+    movsxdifnidn        iq, id
+    mov               lend, 32 << 3
+    lea               outq, [outq+iq*4]
+    mov               tmpd, id
+    shl               tmpd, 8
+    add                inq, tmpq
+    mov               tmpd, 64
+    sub               tmpd, id
+    mov                 id, tmpd
+
+    test                id, 1
+    jne .loop4
+    test                id, 2
+    jne .loop8
+
+align 16
+.loop16:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop16:
+    movaps              m0, [inq]
+    movaps              m1, [inq+lenq]
+    movaps              m2, [inq+lenq*2]
+    movaps              m3, [inq+3*32*2*4]
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4
+    movaps         [out0q], m0
+    movaps         [out1q], m1
+    movaps    [out0q+lenq], m2
+    movaps    [out1q+lenq], m3
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop16
+    add               outq, 16
+    add                inq, 3*32*2*4
+    sub                 id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop8:
+    movaps              m0, [inq]
+    movaps              m1, [inq+lenq]
+    SBUTTERFLYPS 0, 1, 2
+    SBUTTERFLYPD 0, 1, 2
+    MOVH           [out0q], m0
+    MOVH           [out1q], m1
+    movhps    [out0q+lenq], m0
+    movhps    [out1q+lenq], m1
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop8
+    add               outq, 8
+    add                inq, lenq
+    sub                 id, 2
+    jg .loop16
+    RET
+
+align 16
+.loop4:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop4:
+    movaps              m0, [inq]
+    movss          [out0q], m0
+%if cpuflag(sse4)
+    extractps      [out1q], m0, 1
+    extractps [out0q+lenq], m0, 2
+    extractps [out1q+lenq], m0, 3
+%else
+    movhlps             m1, m0
+    movss     [out0q+lenq], m1
+    shufps              m0, m0, 0xb1
+    movss          [out1q], m0
+    movhlps             m1, m0
+    movss     [out1q+lenq], m1
+%endif
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop4
+    add               outq, 4
+    sub                 id, 1
+    test                id, 2
+    jne .loop8
+    cmp                 id, 4
+    jge .loop16
+    RET
+%endmacro
+
+INIT_XMM sse
+HYBRID_SYNTHESIS_DEINT
+INIT_XMM sse4
+HYBRID_SYNTHESIS_DEINT
+
+;*******************************************************************
+;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
+;                                 const float (*filter)[8][2],
+;                                 ptrdiff_t stride, int n);
+;*******************************************************************
+%macro PS_HYBRID_ANALYSIS_LOOP 3
+    movu     %1, [inq+mmsize*%3]
+    movu     m1, [inq+mmsize*(5-%3)+8]
+%if cpuflag(sse3)
+    pshufd   %2, %1, q2301
+    pshufd   m4, m1, q0123
+    pshufd   m1, m1, q1032
+    pshufd   m2, [filterq+nq+mmsize*%3], q2301
+    addsubps %2, m4
+    addsubps %1, m1
+%else
+    mova     m2, [filterq+nq+mmsize*%3]
+    mova     %2, %1
+    mova     m4, m1
+    shufps   %2, %2, q2301
+    shufps   m4, m4, q0123
+    shufps   m1, m1, q1032
+    shufps   m2, m2, q2301
+    xorps    m4, m7
+    xorps    m1, m7
+    subps    %2, m4
+    subps    %1, m1
+%endif
+    mulps    %2, m2
+    mulps    %1, m2
+%if %3
+    addps    m3, %2
+    addps    m0, %1
+%endif
+%endmacro
+
+%macro PS_HYBRID_ANALYSIS 0
+cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
+%if cpuflag(sse3)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+    shl strideq, 3
+    shl nd, 6
+    add filterq, nq
+    neg nq
+    mova m7, [ps_p1m1p1m1]
+
+align 16
+.loop:
+    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
+
+%if cpuflag(sse3)
+    pshufd   m3, m3, q2301
+    xorps    m0, m7
+    hsubps   m3, m0
+    pshufd   m1, m3, q0020
+    pshufd   m3, m3, q0031
+    addps    m1, m3
+    movsd    m2, [inq+6*8]
+%else
+    mova     m1, m3
+    mova     m2, m0
+    shufps   m1, m1, q2301
+    shufps   m2, m2, q2301
+    subps    m1, m3
+    addps    m2, m0
+    unpcklps m3, m1, m2
+    unpckhps m1, m2
+    addps    m1, m3
+    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
+%endif
+    movss    m3, [filterq+nq+8*6]
+    SPLATD   m3
+    mulps    m2, m3
+    addps    m1, m2
+    MOVH [outq], m1
+    add    outq, strideq
+    add      nq, 64
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_HYBRID_ANALYSIS
+INIT_XMM sse3
+PS_HYBRID_ANALYSIS
--- a/externals/ffmpeg/libavcodec/x86/aacpsdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/aacpsdsp_init.c
@@ -0,0 +1,72 @@
+/*
+ * SIMD optimized MPEG-4 Parametric Stereo decoding functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_sse  (float *dst, const float (*src)[2], int n);
+void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
+                                float *src1, int n);
+void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                ptrdiff_t stride, int n);
+void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                ptrdiff_t stride, int n);
+void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+                                   float h[2][4], float h_step[2][4],
+                                   int len);
+void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+                                          float h[2][4], float h_step[2][4],
+                                          int len);
+void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
+                                      int i, int len);
+void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
+                                       int i, int len);
+void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
+                                      int i, int len);
+
+av_cold void ff_psdsp_init_x86(PSDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->add_squares            = ff_ps_add_squares_sse;
+        s->mul_pair_single        = ff_ps_mul_pair_single_sse;
+        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
+        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
+    }
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        s->add_squares            = ff_ps_add_squares_sse3;
+        s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_sse3;
+        s->stereo_interpolate[1]  = ff_ps_stereo_interpolate_ipdopd_sse3;
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse3;
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/ac3dsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/ac3dsp.asm
@@ -0,0 +1,552 @@
+;*****************************************************************************
+;* x86-optimized AC-3 DSP functions
+;* Copyright (c) 2011 Justin Ruggles
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; 16777216.0f - used in ff_float_to_fixed24()
+pf_1_24: times 4 dd 0x4B800000
+
+; used in ff_ac3_compute_mantissa_size()
+cextern ac3_bap_bits
+pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
+pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
+
+; used in ff_ac3_extract_exponents()
+cextern pd_1
+pd_151: times 4 dd 151
+
+; used in ff_apply_window_int16()
+pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
+pd_16384: times 4 dd 16384
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
+;-----------------------------------------------------------------------------
+
+%macro AC3_EXPONENT_MIN 0
+cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
+    shl  reuse_blksq, 8
+    jz .end
+    LOOP_ALIGN
+.nextexp:
+    mov      offsetq, reuse_blksq
+    mova          m0, [expq+offsetq]
+    sub      offsetq, 256
+    LOOP_ALIGN
+.nextblk:
+    PMINUB        m0, [expq+offsetq], m1
+    sub      offsetq, 256
+    jae .nextblk
+    mova      [expq], m0
+    add         expq, mmsize
+    sub        expnq, mmsize
+    jg .nextexp
+.end:
+    REP_RET
+%endmacro
+
+%define LOOP_ALIGN
+INIT_MMX mmx
+AC3_EXPONENT_MIN
+%if HAVE_MMXEXT_EXTERNAL
+%define LOOP_ALIGN ALIGN 16
+INIT_MMX mmxext
+AC3_EXPONENT_MIN
+%endif
+%if HAVE_SSE2_EXTERNAL
+INIT_XMM sse2
+AC3_EXPONENT_MIN
+%endif
+%undef LOOP_ALIGN
+
+;-----------------------------------------------------------------------------
+; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
+;
+; This function uses 2 different methods to calculate a valid result.
+; 1) logical 'or' of abs of each element
+;        This is used for ssse3 because of the pabsw instruction.
+;        It is also used for mmx because of the lack of min/max instructions.
+; 2) calculate min/max for the array, then or(abs(min),abs(max))
+;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
+;-----------------------------------------------------------------------------
+
+; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
+%macro OR_WORDS_HORIZ 2 ; src, tmp
+%if cpuflag(sse2)
+    movhlps     %2, %1
+    por         %1, %2
+    pshuflw     %2, %1, q0032
+    por         %1, %2
+    pshuflw     %2, %1, q0001
+    por         %1, %2
+%elif cpuflag(mmxext)
+    pshufw      %2, %1, q0032
+    por         %1, %2
+    pshufw      %2, %1, q0001
+    por         %1, %2
+%else ; mmx
+    movq        %2, %1
+    psrlq       %2, 32
+    por         %1, %2
+    movq        %2, %1
+    psrlq       %2, 16
+    por         %1, %2
+%endif
+%endmacro
+
+%macro AC3_MAX_MSB_ABS_INT16 1
+cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
+    pxor        m2, m2
+    pxor        m3, m3
+.loop:
+%ifidn %1, min_max
+    mova        m0, [srcq]
+    mova        m1, [srcq+mmsize]
+    pminsw      m2, m0
+    pminsw      m2, m1
+    pmaxsw      m3, m0
+    pmaxsw      m3, m1
+%else ; or_abs
+%if notcpuflag(ssse3)
+    mova        m0, [srcq]
+    mova        m1, [srcq+mmsize]
+    ABS2        m0, m1, m3, m4
+%else ; ssse3
+    ; using memory args is faster for ssse3
+    pabsw       m0, [srcq]
+    pabsw       m1, [srcq+mmsize]
+%endif
+    por         m2, m0
+    por         m2, m1
+%endif
+    add       srcq, mmsize*2
+    sub       lend, mmsize
+    ja .loop
+%ifidn %1, min_max
+    ABS2        m2, m3, m0, m1
+    por         m2, m3
+%endif
+    OR_WORDS_HORIZ m2, m0
+    movd       eax, m2
+    and        eax, 0xFFFF
+    RET
+%endmacro
+
+INIT_MMX mmx
+AC3_MAX_MSB_ABS_INT16 or_abs
+INIT_MMX mmxext
+AC3_MAX_MSB_ABS_INT16 min_max
+INIT_XMM sse2
+AC3_MAX_MSB_ABS_INT16 min_max
+INIT_XMM ssse3
+AC3_MAX_MSB_ABS_INT16 or_abs
+
+;-----------------------------------------------------------------------------
+; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
+;-----------------------------------------------------------------------------
+
+%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction, instruction set
+cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
+    movd      m0, shiftd
+.loop:
+    mova      m1, [srcq         ]
+    mova      m2, [srcq+mmsize  ]
+    mova      m3, [srcq+mmsize*2]
+    mova      m4, [srcq+mmsize*3]
+    %3        m1, m0
+    %3        m2, m0
+    %3        m3, m0
+    %3        m4, m0
+    mova  [srcq         ], m1
+    mova  [srcq+mmsize  ], m2
+    mova  [srcq+mmsize*2], m3
+    mova  [srcq+mmsize*3], m4
+    add     srcq, mmsize*4
+    sub     lend, mmsize*32/%2
+    ja .loop
+.end:
+    REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+AC3_SHIFT l, 16, psllw
+INIT_XMM sse2
+AC3_SHIFT l, 16, psllw
+
+;-----------------------------------------------------------------------------
+; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+AC3_SHIFT r, 32, psrad
+INIT_XMM sse2
+AC3_SHIFT r, 32, psrad
+
+;-----------------------------------------------------------------------------
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; The 3DNow! version is not bit-identical because pf2id uses truncation rather
+; than round-to-nearest.
+INIT_MMX 3dnow
+cglobal float_to_fixed24, 3, 3, 0, dst, src, len
+    movq   m0, [pf_1_24]
+.loop:
+    movq   m1, [srcq   ]
+    movq   m2, [srcq+8 ]
+    movq   m3, [srcq+16]
+    movq   m4, [srcq+24]
+    pfmul  m1, m0
+    pfmul  m2, m0
+    pfmul  m3, m0
+    pfmul  m4, m0
+    pf2id  m1, m1
+    pf2id  m2, m2
+    pf2id  m3, m3
+    pf2id  m4, m4
+    movq  [dstq   ], m1
+    movq  [dstq+8 ], m2
+    movq  [dstq+16], m3
+    movq  [dstq+24], m4
+    add  srcq, 32
+    add  dstq, 32
+    sub  lend, 8
+    ja .loop
+    femms
+    RET
+
+INIT_XMM sse
+cglobal float_to_fixed24, 3, 3, 3, dst, src, len
+    movaps     m0, [pf_1_24]
+.loop:
+    movaps     m1, [srcq   ]
+    movaps     m2, [srcq+16]
+    mulps      m1, m0
+    mulps      m2, m0
+    cvtps2pi  mm0, m1
+    movhlps    m1, m1
+    cvtps2pi  mm1, m1
+    cvtps2pi  mm2, m2
+    movhlps    m2, m2
+    cvtps2pi  mm3, m2
+    movq  [dstq   ], mm0
+    movq  [dstq+ 8], mm1
+    movq  [dstq+16], mm2
+    movq  [dstq+24], mm3
+    add      srcq, 32
+    add      dstq, 32
+    sub      lend, 8
+    ja .loop
+    emms
+    RET
+
+INIT_XMM sse2
+cglobal float_to_fixed24, 3, 3, 9, dst, src, len
+    movaps     m0, [pf_1_24]
+.loop:
+    movaps     m1, [srcq    ]
+    movaps     m2, [srcq+16 ]
+    movaps     m3, [srcq+32 ]
+    movaps     m4, [srcq+48 ]
+%ifdef m8
+    movaps     m5, [srcq+64 ]
+    movaps     m6, [srcq+80 ]
+    movaps     m7, [srcq+96 ]
+    movaps     m8, [srcq+112]
+%endif
+    mulps      m1, m0
+    mulps      m2, m0
+    mulps      m3, m0
+    mulps      m4, m0
+%ifdef m8
+    mulps      m5, m0
+    mulps      m6, m0
+    mulps      m7, m0
+    mulps      m8, m0
+%endif
+    cvtps2dq   m1, m1
+    cvtps2dq   m2, m2
+    cvtps2dq   m3, m3
+    cvtps2dq   m4, m4
+%ifdef m8
+    cvtps2dq   m5, m5
+    cvtps2dq   m6, m6
+    cvtps2dq   m7, m7
+    cvtps2dq   m8, m8
+%endif
+    movdqa  [dstq    ], m1
+    movdqa  [dstq+16 ], m2
+    movdqa  [dstq+32 ], m3
+    movdqa  [dstq+48 ], m4
+%ifdef m8
+    movdqa  [dstq+64 ], m5
+    movdqa  [dstq+80 ], m6
+    movdqa  [dstq+96 ], m7
+    movdqa  [dstq+112], m8
+    add      srcq, 128
+    add      dstq, 128
+    sub      lenq, 32
+%else
+    add      srcq, 64
+    add      dstq, 64
+    sub      lenq, 16
+%endif
+    ja .loop
+    REP_RET
+
+;------------------------------------------------------------------------------
+; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
+;------------------------------------------------------------------------------
+
+%macro PHADDD4 2 ; xmm src, xmm tmp
+    movhlps  %2, %1
+    paddd    %1, %2
+    pshufd   %2, %1, 0x1
+    paddd    %1, %2
+%endmacro
+
+INIT_XMM sse2
+cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
+    movdqa      m0, [mant_cntq      ]
+    movdqa      m1, [mant_cntq+ 1*16]
+    paddw       m0, [mant_cntq+ 2*16]
+    paddw       m1, [mant_cntq+ 3*16]
+    paddw       m0, [mant_cntq+ 4*16]
+    paddw       m1, [mant_cntq+ 5*16]
+    paddw       m0, [mant_cntq+ 6*16]
+    paddw       m1, [mant_cntq+ 7*16]
+    paddw       m0, [mant_cntq+ 8*16]
+    paddw       m1, [mant_cntq+ 9*16]
+    paddw       m0, [mant_cntq+10*16]
+    paddw       m1, [mant_cntq+11*16]
+    pmaddwd     m0, [ac3_bap_bits   ]
+    pmaddwd     m1, [ac3_bap_bits+16]
+    paddd       m0, m1
+    PHADDD4     m0, m1
+    movd      sumd, m0
+    movdqa      m3, [pw_bap_mul1]
+    movhpd      m0, [mant_cntq     +2]
+    movlpd      m0, [mant_cntq+1*32+2]
+    movhpd      m1, [mant_cntq+2*32+2]
+    movlpd      m1, [mant_cntq+3*32+2]
+    movhpd      m2, [mant_cntq+4*32+2]
+    movlpd      m2, [mant_cntq+5*32+2]
+    pmulhuw     m0, m3
+    pmulhuw     m1, m3
+    pmulhuw     m2, m3
+    paddusw     m0, m1
+    paddusw     m0, m2
+    pmaddwd     m0, [pw_bap_mul2]
+    PHADDD4     m0, m1
+    movd       eax, m0
+    add        eax, sumd
+    RET
+
+;------------------------------------------------------------------------------
+; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
+;------------------------------------------------------------------------------
+
+%macro PABSD 1-2 ; src/dst, unused
+%if cpuflag(ssse3)
+    pabsd    %1, %1
+%else ; src/dst, tmp
+    pxor     %2, %2
+    pcmpgtd  %2, %1
+    pxor     %1, %2
+    psubd    %1, %2
+%endif
+%endmacro
+
+%macro AC3_EXTRACT_EXPONENTS 0
+cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
+    add     expq, lenq
+    lea    coefq, [coefq+4*lenq]
+    neg     lenq
+    mova      m2, [pd_1]
+    mova      m3, [pd_151]
+.loop:
+    ; move 4 32-bit coefs to xmm0
+    mova      m0, [coefq+4*lenq]
+    ; absolute value
+    PABSD     m0, m1
+    ; convert to float and extract exponents
+    pslld     m0, 1
+    por       m0, m2
+    cvtdq2ps  m1, m0
+    psrld     m1, 23
+    mova      m0, m3
+    psubd     m0, m1
+    ; move the lowest byte in each of 4 dwords to the low dword
+    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
+    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
+    ;       clips this to 0, which is the correct exponent.
+    packssdw  m0, m0
+    packuswb  m0, m0
+    movd  [expq+lenq], m0
+
+    add     lenq, 4
+    jl .loop
+    REP_RET
+%endmacro
+
+%if HAVE_SSE2_EXTERNAL
+INIT_XMM sse2
+AC3_EXTRACT_EXPONENTS
+%endif
+%if HAVE_SSSE3_EXTERNAL
+INIT_XMM ssse3
+AC3_EXTRACT_EXPONENTS
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_apply_window_int16(int16_t *output, const int16_t *input,
+;                            const int16_t *window, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro REVERSE_WORDS 1-2
+%if cpuflag(ssse3) && notcpuflag(atom)
+    pshufb  %1, %2
+%elif cpuflag(sse2)
+    pshuflw  %1, %1, 0x1B
+    pshufhw  %1, %1, 0x1B
+    pshufd   %1, %1, 0x4E
+%elif cpuflag(mmxext)
+    pshufw   %1, %1, 0x1B
+%endif
+%endmacro
+
+%macro MUL16FIXED 3
+%if cpuflag(ssse3) ; dst, src, unused
+; dst = ((dst * src) + (1<<14)) >> 15
+    pmulhrsw   %1, %2
+%elif cpuflag(mmxext) ; dst, src, temp
+; dst = (dst * src) >> 15
+; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
+; in from the pmullw result.
+    mova    %3, %1
+    pmulhw  %1, %2
+    pmullw  %3, %2
+    psrlw   %3, 15
+    psllw   %1, 1
+    por     %1, %3
+%endif
+%endmacro
+
+%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
+%if %1
+cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
+%else
+cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
+%endif
+    lea     offset2q, [offsetq-mmsize]
+%if cpuflag(ssse3) && notcpuflag(atom)
+    mova          m5, [pb_revwords]
+    ALIGN 16
+%elif %1
+    mova          m5, [pd_16384]
+%endif
+.loop:
+%if cpuflag(ssse3)
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The ssse3 version is bit-identical.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    pmulhrsw      m1, m0
+    REVERSE_WORDS m0, m5
+    pmulhrsw      m0, [ inputq+offsetq ]
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m0
+%elif %1
+    ; This version expands 16-bit to 32-bit, multiplies by the window,
+    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
+    ; save to the output. The window is reversed for the second half.
+    mova          m3, [windowq+offset2q]
+    mova          m4, [ inputq+offset2q]
+    pxor          m0, m0
+    punpcklwd     m0, m3
+    punpcklwd     m1, m4
+    pmaddwd       m0, m1
+    paddd         m0, m5
+    psrad         m0, 15
+    pxor          m2, m2
+    punpckhwd     m2, m3
+    punpckhwd     m1, m4
+    pmaddwd       m2, m1
+    paddd         m2, m5
+    psrad         m2, 15
+    packssdw      m0, m2
+    mova  [outputq+offset2q], m0
+    REVERSE_WORDS m3
+    mova          m4, [ inputq+offsetq]
+    pxor          m0, m0
+    punpcklwd     m0, m3
+    punpcklwd     m1, m4
+    pmaddwd       m0, m1
+    paddd         m0, m5
+    psrad         m0, 15
+    pxor          m2, m2
+    punpckhwd     m2, m3
+    punpckhwd     m1, m4
+    pmaddwd       m2, m1
+    paddd         m2, m5
+    psrad         m2, 15
+    packssdw      m0, m2
+    mova  [outputq+offsetq], m0
+%else
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
+    ; therefore are not bit-identical to the C version.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    mova          m2, [ inputq+offsetq ]
+    MUL16FIXED    m1, m0, m3
+    REVERSE_WORDS m0
+    MUL16FIXED    m2, m0, m3
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m2
+%endif
+    add      offsetd, mmsize
+    sub     offset2d, mmsize
+    jae .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 0
+INIT_XMM sse2
+APPLY_WINDOW_INT16 0
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 1
+INIT_XMM sse2
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16 1
--- a/externals/ffmpeg/libavcodec/x86/ac3dsp_downmix.asm
+++ b/externals/ffmpeg/libavcodec/x86/ac3dsp_downmix.asm
@@ -0,0 +1,187 @@
+;*****************************************************************************
+;* x86-optimized AC-3 downmixing
+;* Copyright (c) 2012 Justin Ruggles
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+;******************************************************************************
+;* This is based on the channel mixing asm in libavresample, but it is
+;* simplified for only float coefficients and only 3 to 6 channels.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; functions to downmix from 3 to 6 channels to mono or stereo
+; void ff_ac3_downmix_*(float **samples, float **matrix, int len);
+;-----------------------------------------------------------------------------
+
+%macro AC3_DOWNMIX 2 ; %1 = in channels, %2 = out channels
+; define some names to make the code clearer
+%assign  in_channels %1
+%assign out_channels %2
+%assign stereo out_channels - 1
+
+; determine how many matrix elements must go on the stack vs. mmregs
+%assign matrix_elements in_channels * out_channels
+%if stereo
+    %assign needed_mmregs 4
+%else
+    %assign needed_mmregs 3
+%endif
+%assign matrix_elements_mm num_mmregs - needed_mmregs
+%if matrix_elements < matrix_elements_mm
+    %assign matrix_elements_mm matrix_elements
+%endif
+%assign total_mmregs needed_mmregs+matrix_elements_mm
+%if matrix_elements_mm < matrix_elements
+    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
+%else
+    %assign matrix_elements_stack 0
+%endif
+
+cglobal ac3_downmix_%1_to_%2, 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, len, src2, src3, src4, src5
+
+; load matrix pointers
+%define matrix0q r1q
+%define matrix1q r3q
+%if stereo
+    mov      matrix1q, [matrix0q+gprsize]
+%endif
+    mov      matrix0q, [matrix0q]
+
+; define matrix coeff names
+%assign %%i 0
+%assign %%j needed_mmregs
+%rep in_channels
+    %if %%i >= matrix_elements_mm
+        CAT_XDEFINE mx_stack_0_, %%i, 1
+        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
+    %else
+        CAT_XDEFINE mx_stack_0_, %%i, 0
+        CAT_XDEFINE mx_0_, %%i, m %+ %%j
+        %assign %%j %%j+1
+    %endif
+    %assign %%i %%i+1
+%endrep
+%if stereo
+%assign %%i 0
+%rep in_channels
+    %if in_channels + %%i >= matrix_elements_mm
+        CAT_XDEFINE mx_stack_1_, %%i, 1
+        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
+    %else
+        CAT_XDEFINE mx_stack_1_, %%i, 0
+        CAT_XDEFINE mx_1_, %%i, m %+ %%j
+        %assign %%j %%j+1
+    %endif
+    %assign %%i %%i+1
+%endrep
+%endif
+
+; load/splat matrix coeffs
+%assign %%i 0
+%rep in_channels
+    %if mx_stack_0_ %+ %%i
+        VBROADCASTSS m0, [matrix0q+4*%%i]
+        mova  mx_0_ %+ %%i, m0
+    %else
+        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
+    %endif
+    %if stereo
+    %if mx_stack_1_ %+ %%i
+        VBROADCASTSS m0, [matrix1q+4*%%i]
+        mova  mx_1_ %+ %%i, m0
+    %else
+        VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
+    %endif
+    %endif
+    %assign %%i %%i+1
+%endrep
+
+    lea          lenq, [4*r2d]
+    ; load channel pointers to registers
+%assign %%i 1
+%rep (in_channels - 1)
+    mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
+    add         src %+ %%i %+ q, lenq
+    %assign %%i %%i+1
+%endrep
+    mov         src0q, [src0q]
+    add         src0q, lenq
+    neg          lenq
+.loop:
+    %if stereo || mx_stack_0_0
+    mova           m0, [src0q+lenq]
+    %endif
+    %if stereo
+    mulps          m1, m0, mx_1_0
+    %endif
+    %if stereo || mx_stack_0_0
+    mulps          m0, m0, mx_0_0
+    %else
+    mulps          m0, mx_0_0, [src0q+lenq]
+    %endif
+%assign %%i 1
+%rep (in_channels - 1)
+    %define src_ptr src %+ %%i %+ q
+    ; avoid extra load for mono if matrix is in a mm register
+    %if stereo || mx_stack_0_ %+ %%i
+    mova           m2, [src_ptr+lenq]
+    %endif
+    %if stereo
+    FMULADD_PS     m1, m2, mx_1_ %+ %%i, m1, m3
+    %endif
+    %if stereo || mx_stack_0_ %+ %%i
+    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m2
+    %else
+    FMULADD_PS     m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
+    %endif
+    %assign %%i %%i+1
+%endrep
+    mova [src0q+lenq], m0
+    %if stereo
+    mova [src1q+lenq], m1
+    %endif
+
+    add          lenq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+%macro AC3_DOWNMIX_FUNCS 0
+%assign %%i 3
+%rep 4
+    INIT_XMM sse
+    AC3_DOWNMIX %%i, 1
+    AC3_DOWNMIX %%i, 2
+    INIT_YMM avx
+    AC3_DOWNMIX %%i, 1
+    AC3_DOWNMIX %%i, 2
+    %if HAVE_FMA3_EXTERNAL
+    INIT_YMM fma3
+    AC3_DOWNMIX %%i, 1
+    AC3_DOWNMIX %%i, 2
+    %endif
+    %assign %%i %%i+1
+%endrep
+%endmacro
+
+AC3_DOWNMIX_FUNCS
--- a/externals/ffmpeg/libavcodec/x86/ac3dsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/ac3dsp_init.c
@@ -0,0 +1,164 @@
+/*
+ * x86-optimized AC-3 DSP functions
+ * Copyright (c) 2011 Justin Ruggles
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/ac3.h"
+#include "libavcodec/ac3dsp.h"
+
+void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+
+int ff_ac3_max_msb_abs_int16_mmx  (const int16_t *src, int len);
+int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
+int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
+int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
+
+void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
+void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
+
+void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
+void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
+
+void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
+void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
+void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+
+int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
+
+void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
+
+void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
+                                        const int16_t *window, unsigned int len);
+void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
+                                  const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
+                                const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
+                                 const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
+        c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
+        c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
+    }
+    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+        if (!bit_exact) {
+            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
+        }
+    }
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
+        if (bit_exact) {
+            c->apply_window_int16 = ff_apply_window_int16_mmxext;
+        } else {
+            c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
+        }
+    }
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->float_to_fixed24 = ff_float_to_fixed24_sse;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
+        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
+        c->extract_exponents = ff_ac3_extract_exponents_sse2;
+        if (bit_exact) {
+            c->apply_window_int16 = ff_apply_window_int16_sse2;
+        }
+    }
+
+    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
+        c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
+        c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
+        if (!bit_exact) {
+            c->apply_window_int16 = ff_apply_window_int16_round_sse2;
+        }
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
+        if (cpu_flags & AV_CPU_FLAG_ATOM) {
+            c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
+        } else {
+            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
+            c->apply_window_int16 = ff_apply_window_int16_ssse3;
+        }
+    }
+}
+
+#define DOWNMIX_FUNC_OPT(ch, opt)                                       \
+void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples,            \
+                                            float **matrix, int len);   \
+void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples,            \
+                                            float **matrix, int len);
+
+#define DOWNMIX_FUNCS(opt)   \
+    DOWNMIX_FUNC_OPT(3, opt) \
+    DOWNMIX_FUNC_OPT(4, opt) \
+    DOWNMIX_FUNC_OPT(5, opt) \
+    DOWNMIX_FUNC_OPT(6, opt)
+
+DOWNMIX_FUNCS(sse)
+DOWNMIX_FUNCS(avx)
+DOWNMIX_FUNCS(fma3)
+
+void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#define SET_DOWNMIX(ch, suf, SUF)                                       \
+    if (ch == c->in_channels) {                                         \
+        if (EXTERNAL_ ## SUF (cpu_flags)) {                             \
+            if (c->out_channels == 1)                                   \
+                c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf;    \
+            else                                                        \
+                c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf;    \
+        }                                                               \
+    }
+
+#define SET_DOWNMIX_ALL(suf, SUF)                   \
+    SET_DOWNMIX(3, suf, SUF)                        \
+    SET_DOWNMIX(4, suf, SUF)                        \
+    SET_DOWNMIX(5, suf, SUF)                        \
+    SET_DOWNMIX(6, suf, SUF)
+
+    SET_DOWNMIX_ALL(sse,  SSE)
+    if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+        SET_DOWNMIX_ALL(avx,  AVX)
+        SET_DOWNMIX_ALL(fma3, FMA3)
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/alacdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/alacdsp.asm
@@ -0,0 +1,133 @@
+;******************************************************************************
+;* ALAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
+%else
+cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
+%define  buf1q  r2q
+%endif
+    movd    m6, shiftm
+    movd    m7, weightm
+    SPLATD  m7
+    shl   lend, 2
+    mov  buf1q, [buf0q + gprsize]
+    mov  buf0q, [buf0q]
+    add  buf1q, lenq
+    add  buf0q, lenq
+    neg  lenq
+
+align 16
+.loop:
+    mova    m0, [buf0q + lenq]
+    mova    m1, [buf0q + lenq + mmsize]
+    mova    m2, [buf1q + lenq]
+    mova    m3, [buf1q + lenq + mmsize]
+    pmulld  m4, m2, m7
+    pmulld  m5, m3, m7
+    psrad   m4, m6
+    psrad   m5, m6
+    psubd   m0, m4
+    psubd   m1, m5
+    paddd   m2, m0
+    paddd   m3, m1
+    mova [buf1q + lenq], m0
+    mova [buf1q + lenq + mmsize], m1
+    mova [buf0q + lenq], m2
+    mova [buf0q + lenq + mmsize], m3
+
+    add   lenq, mmsize*2
+    jl .loop
+    RET
+
+INIT_XMM sse2
+cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
+    movifnidn lend, lenm
+    movd      m4, r2m ; exbits
+    shl     lend, 2
+    mov    buf1q, [buf0q + gprsize]
+    mov    buf0q, [buf0q]
+    mov  exbuf1q, [exbuf0q + gprsize]
+    mov  exbuf0q, [exbuf0q]
+    add    buf1q, lenq
+    add    buf0q, lenq
+    add  exbuf1q, lenq
+    add  exbuf0q, lenq
+    neg lenq
+
+align 16
+.loop:
+    mova      m0, [buf0q + lenq]
+    mova      m1, [buf0q + lenq + mmsize]
+    pslld     m0, m4
+    pslld     m1, m4
+    mova      m2, [buf1q + lenq]
+    mova      m3, [buf1q + lenq + mmsize]
+    pslld     m2, m4
+    pslld     m3, m4
+    por       m0, [exbuf0q + lenq]
+    por       m1, [exbuf0q + lenq + mmsize]
+    por       m2, [exbuf1q + lenq]
+    por       m3, [exbuf1q + lenq + mmsize]
+    mova [buf0q + lenq         ], m0
+    mova [buf0q + lenq + mmsize], m1
+    mova [buf1q + lenq         ], m2
+    mova [buf1q + lenq + mmsize], m3
+
+    add     lenq, mmsize*2
+    jl .loop
+    REP_RET
+
+%if ARCH_X86_64
+cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
+%else
+cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
+%define exbitsm r2m
+%endif
+    movifnidn lend, r4m
+    movd     m2, exbitsm
+    shl    lend, 2
+    mov    bufq, [bufq]
+    mov  exbufq, [exbufq]
+    add    bufq, lenq
+    add  exbufq, lenq
+    neg lenq
+
+align 16
+.loop:
+    mova      m0, [bufq + lenq]
+    mova      m1, [bufq + lenq + mmsize]
+    pslld     m0, m2
+    pslld     m1, m2
+    por       m0, [exbufq + lenq]
+    por       m1, [exbufq + lenq + mmsize]
+    mova [bufq + lenq], m0
+    mova [bufq + lenq + mmsize], m1
+
+    add     lenq, mmsize*2
+    jl .loop
+    REP_RET
--- a/externals/ffmpeg/libavcodec/x86/alacdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/alacdsp_init.c
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/alacdsp.h"
+#include "config.h"
+
+void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
+                                     int decorr_shift, int decorr_left_weight);
+void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                           int extra_bits, int channels, int nb_samples);
+void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                         int extra_bits, int channels, int nb_samples);
+
+av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
+        c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->decorrelate_stereo   = ff_alac_decorrelate_stereo_sse4;
+    }
+#endif /* HAVE_X86ASM */
+}
--- a/externals/ffmpeg/libavcodec/x86/audiodsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/audiodsp.asm
@@ -0,0 +1,174 @@
+;******************************************************************************
+;* optimized audio functions
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
+cglobal scalarproduct_int16, 3,3,3, v1, v2, order
+    add orderd, orderd
+    add v1q, orderq
+    add v2q, orderq
+    neg orderq
+    pxor    m2, m2
+.loop:
+    movu    m0, [v1q + orderq]
+    movu    m1, [v1q + orderq + mmsize]
+    pmaddwd m0, [v2q + orderq]
+    pmaddwd m1, [v2q + orderq + mmsize]
+    paddd   m2, m0
+    paddd   m2, m1
+    add     orderq, mmsize*2
+    jl .loop
+    HADDD   m2, m0
+    movd   eax, m2
+%if mmsize == 8
+    emms
+%endif
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+
+;-----------------------------------------------------------------------------
+; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
+;                           int32_t max, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; %1 = number of xmm registers used
+; %2 = number of inline load/process/store loops per asm loop
+; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
+; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
+; %5 = suffix
+%macro VECTOR_CLIP_INT32 4-5
+cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
+%if %4
+    cvtsi2ss  m4, minm
+    cvtsi2ss  m5, maxm
+%else
+    movd      m4, minm
+    movd      m5, maxm
+%endif
+    SPLATD    m4
+    SPLATD    m5
+.loop:
+%assign %%i 0
+%rep %2
+    mova      m0,  [srcq + mmsize * (0 + %%i)]
+    mova      m1,  [srcq + mmsize * (1 + %%i)]
+    mova      m2,  [srcq + mmsize * (2 + %%i)]
+    mova      m3,  [srcq + mmsize * (3 + %%i)]
+%if %3
+    mova      m7,  [srcq + mmsize * (4 + %%i)]
+    mova      m8,  [srcq + mmsize * (5 + %%i)]
+    mova      m9,  [srcq + mmsize * (6 + %%i)]
+    mova      m10, [srcq + mmsize * (7 + %%i)]
+%endif
+    CLIPD  m0,  m4, m5, m6
+    CLIPD  m1,  m4, m5, m6
+    CLIPD  m2,  m4, m5, m6
+    CLIPD  m3,  m4, m5, m6
+%if %3
+    CLIPD  m7,  m4, m5, m6
+    CLIPD  m8,  m4, m5, m6
+    CLIPD  m9,  m4, m5, m6
+    CLIPD  m10, m4, m5, m6
+%endif
+    mova  [dstq + mmsize * (0 + %%i)], m0
+    mova  [dstq + mmsize * (1 + %%i)], m1
+    mova  [dstq + mmsize * (2 + %%i)], m2
+    mova  [dstq + mmsize * (3 + %%i)], m3
+%if %3
+    mova  [dstq + mmsize * (4 + %%i)], m7
+    mova  [dstq + mmsize * (5 + %%i)], m8
+    mova  [dstq + mmsize * (6 + %%i)], m9
+    mova  [dstq + mmsize * (7 + %%i)], m10
+%endif
+%assign %%i (%%i + 4 * (1 + %3))
+%endrep
+    add     srcq, mmsize*4*(%2+%3)
+    add     dstq, mmsize*4*(%2+%3)
+    sub     lend, mmsize*(%2+%3)
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmx
+VECTOR_CLIP_INT32 0, 1, 0, 0
+INIT_XMM sse2
+VECTOR_CLIP_INT32 6, 1, 0, 0, _int
+VECTOR_CLIP_INT32 6, 2, 0, 1
+INIT_XMM sse4
+%ifdef m8
+VECTOR_CLIP_INT32 11, 1, 1, 0
+%else
+VECTOR_CLIP_INT32 6, 1, 0, 0
+%endif
+
+; void ff_vector_clipf_sse(float *dst, const float *src,
+;                          int len, float min, float max)
+INIT_XMM sse
+cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
+%if ARCH_X86_32
+    VBROADCASTSS m0, minm
+    VBROADCASTSS m1, maxm
+%elif WIN64
+    SWAP 0, 3
+    VBROADCASTSS m0, m0
+    VBROADCASTSS m1, maxm
+%else ; 64bit sysv
+    VBROADCASTSS m0, m0
+    VBROADCASTSS m1, m1
+%endif
+
+    movsxdifnidn lenq, lend
+
+.loop:
+    mova m2, [srcq + 4 * lenq - 4 * mmsize]
+    mova m3, [srcq + 4 * lenq - 3 * mmsize]
+    mova m4, [srcq + 4 * lenq - 2 * mmsize]
+    mova m5, [srcq + 4 * lenq - 1 * mmsize]
+
+    maxps m2, m0
+    maxps m3, m0
+    maxps m4, m0
+    maxps m5, m0
+
+    minps m2, m1
+    minps m3, m1
+    minps m4, m1
+    minps m5, m1
+
+    mova [dstq + 4 * lenq - 4 * mmsize], m2
+    mova [dstq + 4 * lenq - 3 * mmsize], m3
+    mova [dstq + 4 * lenq - 2 * mmsize], m4
+    mova [dstq + 4 * lenq - 1 * mmsize], m5
+
+    sub lenq, mmsize
+    jg .loop
+
+    RET
--- a/externals/ffmpeg/libavcodec/x86/audiodsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/audiodsp_init.c
@@ -0,0 +1,66 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/audiodsp.h"
+
+int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
+                                      int order);
+int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
+                                    int order);
+
+void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
+                              int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
+                               int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
+                                   int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
+                               int32_t min, int32_t max, unsigned int len);
+void ff_vector_clipf_sse(float *dst, const float *src,
+                         int len, float min, float max);
+
+av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags))
+        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
+
+    if (EXTERNAL_SSE(cpu_flags))
+        c->vector_clipf = ff_vector_clipf_sse;
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+        if (cpu_flags & AV_CPU_FLAG_ATOM)
+            c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
+        else
+            c->vector_clip_int32 = ff_vector_clip_int32_sse2;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->vector_clip_int32 = ff_vector_clip_int32_sse4;
+}
--- a/externals/ffmpeg/libavcodec/x86/blockdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,88 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* AVX version by Jokyo Images
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+    ZERO  m0, m0, m0
+%assign %%i 0
+%rep %2
+    mova  [blocksq+mmsize*(0+%%i)], m0
+    mova  [blocksq+mmsize*(1+%%i)], m0
+    mova  [blocksq+mmsize*(2+%%i)], m0
+    mova  [blocksq+mmsize*(3+%%i)], m0
+%assign %%i %%i+4
+%endrep
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 4
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 2
+INIT_YMM avx
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+    add   blocksq, 768
+    mov      lenq, -768
+    ZERO       m0, m0, m0
+.loop:
+    mova  [blocksq+lenq+mmsize*0], m0
+    mova  [blocksq+lenq+mmsize*1], m0
+    mova  [blocksq+lenq+mmsize*2], m0
+    mova  [blocksq+lenq+mmsize*3], m0
+    mova  [blocksq+lenq+mmsize*4], m0
+    mova  [blocksq+lenq+mmsize*5], m0
+    mova  [blocksq+lenq+mmsize*6], m0
+    mova  [blocksq+lenq+mmsize*7], m0
+    add   lenq, mmsize*8
+    js .loop
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
+INIT_YMM avx
+CLEAR_BLOCKS 1
--- a/externals/ffmpeg/libavcodec/x86/blockdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_block_avx(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+void ff_clear_blocks_avx(int16_t *blocks);
+
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
+                                  AVCodecContext *avctx)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->clear_block  = ff_clear_block_mmx;
+        c->clear_blocks = ff_clear_blocks_mmx;
+    }
+
+    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+        return;
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->clear_block  = ff_clear_block_sse;
+        c->clear_blocks = ff_clear_blocks_sse;
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        c->clear_block  = ff_clear_block_avx;
+        c->clear_blocks = ff_clear_blocks_avx;
+    }
+#endif /* HAVE_X86ASM */
+}
--- a/externals/ffmpeg/libavcodec/x86/bswapdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/bswapdsp.asm
@@ -0,0 +1,159 @@
+;******************************************************************************
+;* optimized bswap buffer functions
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+cextern pb_80
+
+SECTION .text
+
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS  1
+    mov      r3d, r2d
+    sar      r2d, 3
+    jz       .left4_%1
+%if cpuflag(avx2)
+    sar      r2d, 1
+    jz       .left8_%1
+%endif
+.loop8_%1:
+    mov%1    m0, [r1 +  0]
+    mov%1    m1, [r1 + mmsize]
+%if cpuflag(ssse3)||cpuflag(avx2)
+    pshufb   m0, m2
+    pshufb   m1, m2
+    mov%1    [r0 +  0], m0
+    mov%1    [r0 + mmsize], m1
+%else
+    pshuflw  m0, m0, 10110001b
+    pshuflw  m1, m1, 10110001b
+    pshufhw  m0, m0, 10110001b
+    pshufhw  m1, m1, 10110001b
+    mova     m2, m0
+    mova     m3, m1
+    psllw    m0, 8
+    psllw    m1, 8
+    psrlw    m2, 8
+    psrlw    m3, 8
+    por      m2, m0
+    por      m3, m1
+    mov%1    [r0 +  0], m2
+    mov%1    [r0 + 16], m3
+%endif
+    add      r0, mmsize*2
+    add      r1, mmsize*2
+    dec      r2d
+    jnz      .loop8_%1
+%if cpuflag(avx2)
+.left8_%1:
+    mov      r2d, r3d
+    test     r3d, 8
+    jz       .left4_%1
+    mov%1    m0, [r1]
+    pshufb   m0, m2
+    mov%1    [r0 +  0], m0
+    add r1, mmsize
+    add r0, mmsize
+%endif
+.left4_%1:
+    mov      r2d, r3d
+    test     r3d, 4
+    jz       .left
+    mov%1    xm0, [r1]
+%if cpuflag(ssse3)
+    pshufb   xm0, xm2
+    mov%1    [r0], xm0
+%else
+    pshuflw  m0, m0, 10110001b
+    pshufhw  m0, m0, 10110001b
+    mova     m2, m0
+    psllw    m0, 8
+    psrlw    m2, 8
+    por      m2, m0
+    mov%1    [r0], m2
+%endif
+    add      r1, 16
+    add      r0, 16
+%endmacro
+
+; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+%macro BSWAP32_BUF 0
+%if cpuflag(ssse3)||cpuflag(avx2)
+cglobal bswap32_buf, 3,4,3
+    mov      r3, r1
+    VBROADCASTI128  m2, [pb_bswap32]
+%else
+cglobal bswap32_buf, 3,4,5
+    mov      r3, r1
+%endif
+    or       r3, r0
+    test     r3, mmsize - 1
+    jz       .start_align
+    BSWAP_LOOPS  u
+    jmp      .left
+.start_align:
+    BSWAP_LOOPS  a
+.left:
+%if cpuflag(ssse3)
+    test     r2d, 2
+    jz       .left1
+    movq     xm0, [r1]
+    pshufb   xm0, xm2
+    movq     [r0], xm0
+    add      r1, 8
+    add      r0, 8
+.left1:
+    test     r2d, 1
+    jz       .end
+    mov      r2d, [r1]
+    bswap    r2d
+    mov      [r0], r2d
+%else
+    and      r2d, 3
+    jz       .end
+.loop2:
+    mov      r3d, [r1]
+    bswap    r3d
+    mov      [r0], r3d
+    add      r1, 4
+    add      r0, 4
+    dec      r2d
+    jnz      .loop2
+%endif
+.end:
+    RET
+%endmacro
+
+INIT_XMM sse2
+BSWAP32_BUF
+
+INIT_XMM ssse3
+BSWAP32_BUF
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+BSWAP32_BUF
+%endif
--- a/externals/ffmpeg/libavcodec/x86/bswapdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/bswapdsp_init.c
@@ -0,0 +1,40 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/bswapdsp.h"
+
+void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
+void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
+
+av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->bswap_buf = ff_bswap32_buf_sse2;
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->bswap_buf = ff_bswap32_buf_ssse3;
+    if (EXTERNAL_AVX2_FAST(cpu_flags))
+        c->bswap_buf = ff_bswap32_buf_avx2;
+}
--- a/externals/ffmpeg/libavcodec/x86/cabac.h
+++ b/externals/ffmpeg/libavcodec/x86/cabac.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_CABAC_H
+#define AVCODEC_X86_CABAC_H
+
+#include "libavcodec/cabac.h"
+#include "libavutil/attributes.h"
+#include "libavutil/macros.h"
+#include "libavutil/x86/asm.h"
+#include "config.h"
+
+#if   (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+   || (                  !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+   || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+#       define BROKEN_COMPILER 1
+#else
+#       define BROKEN_COMPILER 0
+#endif
+
+#if HAVE_INLINE_ASM
+
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+        "cmp    "end"       , %%"FF_REG_c"                              \n\t"\
+        "jge    1f                                                      \n\t"
+#endif
+
+#ifdef BROKEN_RELOCATIONS
+#define TABLES_ARG , "r"(tables)
+
+#if HAVE_FAST_CMOV
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+        "cmp    "low"       , "tmp"                        \n\t"\
+        "cmova  %%ecx       , "range"                      \n\t"\
+        "sbb    %%rcx       , %%rcx                        \n\t"\
+        "and    %%ecx       , "tmp"                        \n\t"\
+        "xor    %%rcx       , "retq"                       \n\t"\
+        "sub    "tmp"       , "low"                        \n\t"
+#else /* HAVE_FAST_CMOV */
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+/* P4 Prescott has crappy cmov,sbb,64-bit shift so avoid them */ \
+        "sub    "low"       , "tmp"                        \n\t"\
+        "sar    $31         , "tmp"                        \n\t"\
+        "sub    %%ecx       , "range"                      \n\t"\
+        "and    "tmp"       , "range"                      \n\t"\
+        "add    %%ecx       , "range"                      \n\t"\
+        "shl    $17         , %%ecx                        \n\t"\
+        "and    "tmp"       , %%ecx                        \n\t"\
+        "sub    %%ecx       , "low"                        \n\t"\
+        "xor    "tmp"       , "ret"                        \n\t"\
+        "movslq "ret"       , "retq"                       \n\t"
+#endif /* HAVE_FAST_CMOV */
+
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
+        "movzbl "statep"    , "ret"                                     \n\t"\
+        "mov    "range"     , "tmp"                                     \n\t"\
+        "and    $0xC0       , "range"                                   \n\t"\
+        "lea    ("ret", "range", 2), %%ecx                              \n\t"\
+        "movzbl "lps_off"("tables", %%rcx), "range"                     \n\t"\
+        "sub    "range"     , "tmp"                                     \n\t"\
+        "mov    "tmp"       , %%ecx                                     \n\t"\
+        "shl    $17         , "tmp"                                     \n\t"\
+        BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp)              \
+        "movzbl "norm_off"("tables", "rangeq"), %%ecx                   \n\t"\
+        "shl    %%cl        , "range"                                   \n\t"\
+        "movzbl "mlps_off"+128("tables", "retq"), "tmp"                 \n\t"\
+        "shl    %%cl        , "low"                                     \n\t"\
+        "mov    "tmpbyte"   , "statep"                                  \n\t"\
+        "test   "lowword"   , "lowword"                                 \n\t"\
+        "jnz    2f                                                      \n\t"\
+        "mov    "byte"      , %%"FF_REG_c"                              \n\t"\
+        END_CHECK(end)\
+        "add"FF_OPSIZE" $2  , "byte"                                    \n\t"\
+        "1:                                                             \n\t"\
+        "movzwl (%%"FF_REG_c") , "tmp"                                  \n\t"\
+        "lea    -1("low")   , %%ecx                                     \n\t"\
+        "xor    "low"       , %%ecx                                     \n\t"\
+        "shr    $15         , %%ecx                                     \n\t"\
+        "bswap  "tmp"                                                   \n\t"\
+        "shr    $15         , "tmp"                                     \n\t"\
+        "movzbl "norm_off"("tables", %%rcx), %%ecx                      \n\t"\
+        "sub    $0xFFFF     , "tmp"                                     \n\t"\
+        "neg    %%ecx                                                   \n\t"\
+        "add    $7          , %%ecx                                     \n\t"\
+        "shl    %%cl        , "tmp"                                     \n\t"\
+        "add    "tmp"       , "low"                                     \n\t"\
+        "2:                                                             \n\t"
+
+#else /* BROKEN_RELOCATIONS */
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG
+
+#if HAVE_FAST_CMOV
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
+        "mov    "tmp"       , %%ecx     \n\t"\
+        "shl    $17         , "tmp"     \n\t"\
+        "cmp    "low"       , "tmp"     \n\t"\
+        "cmova  %%ecx       , "range"   \n\t"\
+        "sbb    %%ecx       , %%ecx     \n\t"\
+        "and    %%ecx       , "tmp"     \n\t"\
+        "xor    %%ecx       , "ret"     \n\t"\
+        "sub    "tmp"       , "low"     \n\t"
+#else /* HAVE_FAST_CMOV */
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
+        "mov    "tmp"       , %%ecx     \n\t"\
+        "shl    $17         , "tmp"     \n\t"\
+        "sub    "low"       , "tmp"     \n\t"\
+        "sar    $31         , "tmp"     \n\t" /*lps_mask*/\
+        "sub    %%ecx       , "range"   \n\t" /*RangeLPS - range*/\
+        "and    "tmp"       , "range"   \n\t" /*(RangeLPS - range)&lps_mask*/\
+        "add    %%ecx       , "range"   \n\t" /*new range*/\
+        "shl    $17         , %%ecx     \n\t"\
+        "and    "tmp"       , %%ecx     \n\t"\
+        "sub    %%ecx       , "low"     \n\t"\
+        "xor    "tmp"       , "ret"     \n\t"
+#endif /* HAVE_FAST_CMOV */
+
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
+        "movzbl "statep"    , "ret"                                     \n\t"\
+        "mov    "range"     , "tmp"                                     \n\t"\
+        "and    $0xC0       , "range"                                   \n\t"\
+        "movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
+        "sub    "range"     , "tmp"                                     \n\t"\
+        BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)                    \
+        "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx    \n\t"\
+        "shl    %%cl        , "range"                                   \n\t"\
+        "movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp"  \n\t"\
+        "shl    %%cl        , "low"                                     \n\t"\
+        "mov    "tmpbyte"   , "statep"                                  \n\t"\
+        "test   "lowword"   , "lowword"                                 \n\t"\
+        " jnz   2f                                                      \n\t"\
+        "mov    "byte"      , %%"FF_REG_c"                              \n\t"\
+        END_CHECK(end)\
+        "add"FF_OPSIZE" $2  , "byte"                                    \n\t"\
+        "1:                                                             \n\t"\
+        "movzwl (%%"FF_REG_c") , "tmp"                                  \n\t"\
+        "lea    -1("low")   , %%ecx                                     \n\t"\
+        "xor    "low"       , %%ecx                                     \n\t"\
+        "shr    $15         , %%ecx                                     \n\t"\
+        "bswap  "tmp"                                                   \n\t"\
+        "shr    $15         , "tmp"                                     \n\t"\
+        "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
+        "sub    $0xFFFF     , "tmp"                                     \n\t"\
+        "neg    %%ecx                                                   \n\t"\
+        "add    $7          , %%ecx                                     \n\t"\
+        "shl    %%cl        , "tmp"                                     \n\t"\
+        "add    "tmp"       , "low"                                     \n\t"\
+        "2:                                                             \n\t"
+
+#endif /* BROKEN_RELOCATIONS */
+
+#if HAVE_7REGS && !BROKEN_COMPILER
+#define get_cabac_inline get_cabac_inline_x86
+static av_always_inline int get_cabac_inline_x86(CABACContext *c,
+                                                 uint8_t *const state)
+{
+    int bit, tmp;
+#ifdef BROKEN_RELOCATIONS
+    void *tables;
+
+    __asm__ volatile(
+        "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
+        : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
+    );
+#endif
+
+    __asm__ volatile(
+        BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
+                             "%2", "%q2", "%3", "%b3",
+                             "%c6(%5)", "%c7(%5)",
+                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+                             "%8")
+        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
+        : "r"(state), "r"(c),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, bytestream_end))
+          TABLES_ARG
+          ,"1"(c->low), "2"(c->range)
+        : "%"FF_REG_c, "memory"
+    );
+    return bit & 1;
+}
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */
+
+#if !BROKEN_COMPILER
+#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
+static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
+{
+    x86_reg tmp;
+    __asm__ volatile(
+        "movl        %c6(%2), %k1       \n\t"
+        "movl        %c3(%2), %%eax     \n\t"
+        "shl             $17, %k1       \n\t"
+        "add           %%eax, %%eax     \n\t"
+        "sub             %k1, %%eax     \n\t"
+        "cdq                            \n\t"
+        "and           %%edx, %k1       \n\t"
+        "add             %k1, %%eax     \n\t"
+        "xor           %%edx, %%ecx     \n\t"
+        "sub           %%edx, %%ecx     \n\t"
+        "test           %%ax, %%ax      \n\t"
+        "jnz              1f            \n\t"
+        "mov         %c4(%2), %1        \n\t"
+        "subl        $0xFFFF, %%eax     \n\t"
+        "movzwl         (%1), %%edx     \n\t"
+        "bswap         %%edx            \n\t"
+        "shrl            $15, %%edx     \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "add              $2, %1        \n\t"
+        "addl          %%edx, %%eax     \n\t"
+        "mov              %1, %c4(%2)   \n\t"
+#else
+        "addl          %%edx, %%eax     \n\t"
+        "cmp         %c5(%2), %1        \n\t"
+        "jge              1f            \n\t"
+        "add"FF_OPSIZE"   $2, %c4(%2)   \n\t"
+#endif
+        "1:                             \n\t"
+        "movl          %%eax, %c3(%2)   \n\t"
+
+        : "+c"(val), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, bytestream_end)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%edx", "memory"
+    );
+    return val;
+}
+
+#define get_cabac_bypass get_cabac_bypass_x86
+static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
+{
+    x86_reg tmp;
+    int res;
+    __asm__ volatile(
+        "movl        %c6(%2), %k1       \n\t"
+        "movl        %c3(%2), %%eax     \n\t"
+        "shl             $17, %k1       \n\t"
+        "add           %%eax, %%eax     \n\t"
+        "sub             %k1, %%eax     \n\t"
+        "cdq                            \n\t"
+        "and           %%edx, %k1       \n\t"
+        "add             %k1, %%eax     \n\t"
+        "inc           %%edx            \n\t"
+        "test           %%ax, %%ax      \n\t"
+        "jnz              1f            \n\t"
+        "mov         %c4(%2), %1        \n\t"
+        "subl        $0xFFFF, %%eax     \n\t"
+        "movzwl         (%1), %%ecx     \n\t"
+        "bswap         %%ecx            \n\t"
+        "shrl            $15, %%ecx     \n\t"
+        "addl          %%ecx, %%eax     \n\t"
+        "cmp         %c5(%2), %1        \n\t"
+        "jge              1f            \n\t"
+        "add"FF_OPSIZE"   $2, %c4(%2)   \n\t"
+        "1:                             \n\t"
+        "movl          %%eax, %c3(%2)   \n\t"
+
+        : "=&d"(res), "=&r"(tmp)
+        : "r"(c),
+          "i"(offsetof(CABACContext, low)),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, bytestream_end)),
+          "i"(offsetof(CABACContext, range))
+        : "%eax", "%ecx", "memory"
+    );
+    return res;
+}
+#endif /* !BROKEN_COMPILER */
+
+#endif /* HAVE_INLINE_ASM */
+#endif /* AVCODEC_X86_CABAC_H */
--- a/externals/ffmpeg/libavcodec/x86/cavsdsp.c
+++ b/externals/ffmpeg/libavcodec/x86/cavsdsp.c
@@ -0,0 +1,463 @@
+/*
+ * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
+ * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
+ *
+ * MMX-optimized DSP functions, based on H.264 optimizations by
+ * Michael Niedermayer and Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/cavsdsp.h"
+#include "libavcodec/idctdsp.h"
+#include "constants.h"
+#include "fpel.h"
+#include "idctdsp.h"
+#include "config.h"
+
+
+#if HAVE_MMX_EXTERNAL
+
+void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);
+
+static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
+{
+    LOCAL_ALIGNED(16, int16_t, b2, [64]);
+    ff_cavs_idct8_mmx(b2, block);
+    ff_add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
+
+static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
+{
+    LOCAL_ALIGNED(16, int16_t, b2, [64]);
+    ff_cavs_idct8_sse2(b2, block);
+    ff_add_pixels_clamped_sse2(b2, dst, stride);
+}
+
+#endif /* HAVE_MMX_EXTERNAL */
+
+#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
+
+/*****************************************************************************
+ *
+ * motion compensation
+ *
+ ****************************************************************************/
+
+/* vertical filter [-1 -2 96 42 -7  0]  */
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
+        "movd (%0), "#F"            \n\t"\
+        "movq "#C", %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
+        "movq "#D", %%mm7           \n\t"\
+        "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
+        "psllw $3, "#E"             \n\t"\
+        "psubw "#E", %%mm6          \n\t"\
+        "psraw $3, "#E"             \n\t"\
+        "paddw %%mm7, %%mm6         \n\t"\
+        "paddw "#E", %%mm6          \n\t"\
+        "paddw "#B", "#B"           \n\t"\
+        "pxor %%mm7, %%mm7          \n\t"\
+        "add %2, %0                 \n\t"\
+        "punpcklbw %%mm7, "#F"      \n\t"\
+        "psubw "#B", %%mm6          \n\t"\
+        "psraw $1, "#B"             \n\t"\
+        "psubw "#A", %%mm6          \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
+        "psraw $7, %%mm6            \n\t"\
+        "packuswb %%mm6, %%mm6      \n\t"\
+        OP(%%mm6, (%1), A, d)            \
+        "add %3, %1                 \n\t"
+
+/* vertical filter [ 0 -1  5  5 -1  0]  */
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
+        "movd (%0), "#F"            \n\t"\
+        "movq "#C", %%mm6           \n\t"\
+        "paddw "#D", %%mm6          \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
+        "add %2, %0                 \n\t"\
+        "punpcklbw %%mm7, "#F"      \n\t"\
+        "psubw "#B", %%mm6          \n\t"\
+        "psubw "#E", %%mm6          \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
+        "psraw $3, %%mm6            \n\t"\
+        "packuswb %%mm6, %%mm6      \n\t"\
+        OP(%%mm6, (%1), A, d)            \
+        "add %3, %1                 \n\t"
+
+/* vertical filter [ 0 -7 42 96 -2 -1]  */
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
+        "movd (%0), "#F"            \n\t"\
+        "movq "#C", %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
+        "movq "#D", %%mm7           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
+        "psllw $3, "#B"             \n\t"\
+        "psubw "#B", %%mm6          \n\t"\
+        "psraw $3, "#B"             \n\t"\
+        "paddw %%mm7, %%mm6         \n\t"\
+        "paddw "#B", %%mm6          \n\t"\
+        "paddw "#E", "#E"           \n\t"\
+        "pxor %%mm7, %%mm7          \n\t"\
+        "add %2, %0                 \n\t"\
+        "punpcklbw %%mm7, "#F"      \n\t"\
+        "psubw "#E", %%mm6          \n\t"\
+        "psraw $1, "#E"             \n\t"\
+        "psubw "#F", %%mm6          \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
+        "psraw $7, %%mm6            \n\t"\
+        "packuswb %%mm6, %%mm6      \n\t"\
+        OP(%%mm6, (%1), A, d)            \
+        "add %3, %1                 \n\t"
+
+
+#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
+    int w= 2;\
+    src -= 2*srcStride;\
+    \
+    while(w--){\
+      __asm__ volatile(\
+        "pxor %%mm7, %%mm7          \n\t"\
+        "movd (%0), %%mm0           \n\t"\
+        "add %2, %0                 \n\t"\
+        "movd (%0), %%mm1           \n\t"\
+        "add %2, %0                 \n\t"\
+        "movd (%0), %%mm2           \n\t"\
+        "add %2, %0                 \n\t"\
+        "movd (%0), %%mm3           \n\t"\
+        "add %2, %0                 \n\t"\
+        "movd (%0), %%mm4           \n\t"\
+        "add %2, %0                 \n\t"\
+        "punpcklbw %%mm7, %%mm0     \n\t"\
+        "punpcklbw %%mm7, %%mm1     \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpcklbw %%mm7, %%mm3     \n\t"\
+        "punpcklbw %%mm7, %%mm4     \n\t"\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+        \
+        : "+a"(src), "+c"(dst)\
+        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
+        : "memory"\
+     );\
+     if(h==16){\
+        __asm__ volatile(\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+            \
+           : "+a"(src), "+c"(dst)\
+           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+             NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
+           : "memory"\
+        );\
+     }\
+     src += 4-(h+5)*srcStride;\
+     dst += 4-h*dstStride;\
+   }
+
+#define QPEL_CAVS(OPNAME, OP, MMX)\
+static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{\
+    int h=8;\
+    __asm__ volatile(\
+        "pxor %%mm7, %%mm7          \n\t"\
+        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
+        "1:                         \n\t"\
+        "movq    (%0), %%mm0        \n\t"\
+        "movq   1(%0), %%mm2        \n\t"\
+        "movq %%mm0, %%mm1          \n\t"\
+        "movq %%mm2, %%mm3          \n\t"\
+        "punpcklbw %%mm7, %%mm0     \n\t"\
+        "punpckhbw %%mm7, %%mm1     \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpckhbw %%mm7, %%mm3     \n\t"\
+        "paddw %%mm2, %%mm0         \n\t"\
+        "paddw %%mm3, %%mm1         \n\t"\
+        "pmullw %%mm6, %%mm0        \n\t"\
+        "pmullw %%mm6, %%mm1        \n\t"\
+        "movq   -1(%0), %%mm2       \n\t"\
+        "movq    2(%0), %%mm4       \n\t"\
+        "movq %%mm2, %%mm3          \n\t"\
+        "movq %%mm4, %%mm5          \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpckhbw %%mm7, %%mm3     \n\t"\
+        "punpcklbw %%mm7, %%mm4     \n\t"\
+        "punpckhbw %%mm7, %%mm5     \n\t"\
+        "paddw %%mm4, %%mm2         \n\t"\
+        "paddw %%mm3, %%mm5         \n\t"\
+        "psubw %%mm2, %%mm0         \n\t"\
+        "psubw %%mm5, %%mm1         \n\t"\
+        "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
+        "paddw %%mm5, %%mm0         \n\t"\
+        "paddw %%mm5, %%mm1         \n\t"\
+        "psraw $3, %%mm0            \n\t"\
+        "psraw $3, %%mm1            \n\t"\
+        "packuswb %%mm1, %%mm0      \n\t"\
+        OP(%%mm0, (%1),%%mm5, q)         \
+        "add %3, %0                 \n\t"\
+        "add %4, %1                 \n\t"\
+        "decl %2                    \n\t"\
+        " jnz 1b                    \n\t"\
+        : "+a"(src), "+c"(dst), "+m"(h)\
+        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
+        : "memory"\
+    );\
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
+{                                                                       \
+  QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
+{                                                                       \
+  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42)        \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
+{                                                                       \
+  QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
+}\
+\
+static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{                                                                       \
+    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{                                                                       \
+    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{                                                                       \
+    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{                                                                       \
+    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{                                                                       \
+    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{                                                                       \
+    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
+{                                                                       \
+    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define CAVS_MC(OPNAME, SIZE, MMX) \
+static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
+}\
+
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "    \n\t"
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgusb " #temp ", " #a "        \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+#define AVG_MMXEXT_OP(a, b, temp, size) \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgb " #temp ", " #a "          \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+
+#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
+
+#if HAVE_MMX_EXTERNAL
+static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride)
+{
+    ff_put_pixels8_mmx(dst, src, stride, 8);
+}
+
+static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride)
+{
+    ff_avg_pixels8_mmx(dst, src, stride, 8);
+}
+
+static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
+static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride)
+{
+    ff_put_pixels16_mmx(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride)
+{
+    ff_avg_pixels16_mmx(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride)
+{
+    ff_avg_pixels16_mmxext(dst, src, stride, 16);
+}
+
+static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#endif
+
+static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
+                                     AVCodecContext *avctx)
+{
+#if HAVE_MMX_EXTERNAL
+    c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
+    c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
+    c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
+    c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
+
+    c->cavs_idct8_add = cavs_idct8_add_mmx;
+    c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
+#endif /* HAVE_MMX_EXTERNAL */
+}
+
+#define DSPFUNC(PFX, IDX, NUM, EXT)                                                       \
+    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
+    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
+    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
+    c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \
+
+#if HAVE_MMXEXT_INLINE
+QPEL_CAVS(put_,        PUT_OP, mmxext)
+QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
+
+CAVS_MC(put_,  8, mmxext)
+CAVS_MC(put_, 16, mmxext)
+CAVS_MC(avg_,  8, mmxext)
+CAVS_MC(avg_, 16, mmxext)
+#endif /* HAVE_MMXEXT_INLINE */
+
+#if HAVE_AMD3DNOW_INLINE
+QPEL_CAVS(put_,       PUT_OP, 3dnow)
+QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
+
+CAVS_MC(put_, 8, 3dnow)
+CAVS_MC(put_, 16,3dnow)
+CAVS_MC(avg_, 8, 3dnow)
+CAVS_MC(avg_, 16,3dnow)
+
+static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
+                                       AVCodecContext *avctx)
+{
+    DSPFUNC(put, 0, 16, 3dnow);
+    DSPFUNC(put, 1,  8, 3dnow);
+    DSPFUNC(avg, 0, 16, 3dnow);
+    DSPFUNC(avg, 1,  8, 3dnow);
+}
+#endif /* HAVE_AMD3DNOW_INLINE */
+
+av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
+{
+    av_unused int cpu_flags = av_get_cpu_flags();
+
+    if (X86_MMX(cpu_flags))
+        cavsdsp_init_mmx(c, avctx);
+
+#if HAVE_AMD3DNOW_INLINE
+    if (INLINE_AMD3DNOW(cpu_flags))
+        cavsdsp_init_3dnow(c, avctx);
+#endif /* HAVE_AMD3DNOW_INLINE */
+#if HAVE_MMXEXT_INLINE
+    if (INLINE_MMXEXT(cpu_flags)) {
+        DSPFUNC(put, 0, 16, mmxext);
+        DSPFUNC(put, 1,  8, mmxext);
+        DSPFUNC(avg, 0, 16, mmxext);
+        DSPFUNC(avg, 1,  8, mmxext);
+    }
+#endif
+#if HAVE_MMX_EXTERNAL
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
+        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
+    }
+#endif
+#if HAVE_SSE2_EXTERNAL
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+
+        c->cavs_idct8_add = cavs_idct8_add_sse2;
+        c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
+    }
+#endif
+}
--- a/externals/ffmpeg/libavcodec/x86/cavsidct.asm
+++ b/externals/ffmpeg/libavcodec/x86/cavsidct.asm
@@ -0,0 +1,211 @@
+; Chinese AVS video (AVS1-P2, JiZhun profile) decoder
+; Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
+;
+; MMX-optimized DSP functions, based on H.264 optimizations by
+; Michael Niedermayer and Loren Merritt
+; Conversion from gcc syntax to x264asm syntax with modifications
+; by Ronald S. Bultje <rsbultje@gmail.com>
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_4
+cextern pw_64
+
+SECTION .text
+
+%macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
+%if %3 == 1
+    mova            m4, [%1+7*16]       ; m4 = src7
+    mova            m5, [%1+1*16]       ; m5 = src1
+    mova            m2, [%1+5*16]       ; m2 = src5
+    mova            m7, [%1+3*16]       ; m7 = src3
+%else
+    SWAP             1, 7
+    SWAP             4, 6
+%endif
+    mova            m0, m4
+    mova            m3, m5
+    mova            m6, m2
+    mova            m1, m7
+
+    paddw           m4, m4              ; m4 = 2*src7
+    paddw           m3, m3              ; m3 = 2*src1
+    paddw           m6, m6              ; m6 = 2*src5
+    paddw           m1, m1              ; m1 = 2*src3
+    paddw           m0, m4              ; m0 = 3*src7
+    paddw           m5, m3              ; m5 = 3*src1
+    paddw           m2, m6              ; m2 = 3*src5
+    paddw           m7, m1              ; m7 = 3*src3
+    psubw           m5, m4              ; m5 = 3*src1 - 2*src7 = a0
+    paddw           m7, m6              ; m7 = 3*src3 - 2*src5 = a1
+    psubw           m1, m2              ; m1 = 2*src3 - 3*src5 = a2
+    paddw           m3, m0              ; m3 = 2*src1 - 3*src7 = a3
+
+    mova            m4, m5
+    mova            m6, m7
+    mova            m0, m3
+    mova            m2, m1
+    SUMSUB_BA     w, 7, 5               ; m7 = a0 + a1, m5 = a0 - a1
+    paddw           m7, m3              ; m7 = a0 + a1 + a3
+    paddw           m5, m1              ; m5 = a0 - a1 + a2
+    paddw           m7, m7
+    paddw           m5, m5
+    paddw           m7, m6              ; m7 = b4
+    paddw           m5, m4              ; m5 = b5
+
+    SUMSUB_BA     w, 1, 3               ; m1 = a3 + a2, m3 = a3 - a2
+    psubw           m4, m1              ; m4 = a0 - a2 - a3
+    mova            m1, m4              ; m1 = a0 - a2 - a3
+    psubw           m3, m6              ; m3 = a3 - a2 - a1
+    paddw           m1, m1
+    paddw           m3, m3
+    psubw           m1, m2              ; m1 = b7
+    paddw           m3, m0              ; m3 = b6
+
+    mova            m2, [%1+2*16]       ; m2 = src2
+    mova            m6, [%1+6*16]       ; m6 = src6
+    mova            m4, m2
+    mova            m0, m6
+    psllw           m4, 2               ; m4 = 4*src2
+    psllw           m6, 2               ; m6 = 4*src6
+    paddw           m2, m4              ; m2 = 5*src2
+    paddw           m0, m6              ; m0 = 5*src6
+    paddw           m2, m2
+    paddw           m0, m0
+    psubw           m4, m0              ; m4 = 4*src2 - 10*src6 = a7
+    paddw           m6, m2              ; m6 = 4*src6 + 10*src2 = a6
+
+    mova            m2, [%1+0*16]       ; m2 = src0
+    mova            m0, [%1+4*16]       ; m0 = src4
+    SUMSUB_BA     w, 0, 2               ; m0 = src0 + src4, m2 = src0 - src4
+    psllw           m0, 3
+    psllw           m2, 3
+    paddw           m0, %2              ; add rounding bias
+    paddw           m2, %2              ; add rounding bias
+
+    SUMSUB_BA     w, 6, 0               ; m6 = a4 + a6, m0 = a4 - a6
+    SUMSUB_BA     w, 4, 2               ; m4 = a5 + a7, m2 = a5 - a7
+    SUMSUB_BA     w, 7, 6               ; m7 = dst0, m6 = dst7
+    SUMSUB_BA     w, 5, 4               ; m5 = dst1, m4 = dst6
+    SUMSUB_BA     w, 3, 2               ; m3 = dst2, m2 = dst5
+    SUMSUB_BA     w, 1, 0               ; m1 = dst3, m0 = dst4
+%endmacro
+
+INIT_MMX mmx
+cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
+    mov           cntd, 2
+    mov           tmpq, rsp
+
+.loop_1:
+    CAVS_IDCT8_1D  inq, [pw_4]
+    psraw           m7, 3
+    psraw           m6, 3
+    psraw           m5, 3
+    psraw           m4, 3
+    psraw           m3, 3
+    psraw           m2, 3
+    psraw           m1, 3
+    psraw           m0, 3
+    mova        [tmpq], m7
+    TRANSPOSE4x4W    0, 2, 4, 6, 7
+    mova    [tmpq+1*8], m0
+    mova    [tmpq+3*8], m2
+    mova    [tmpq+5*8], m4
+    mova    [tmpq+7*8], m6
+    mova            m7, [tmpq]
+    TRANSPOSE4x4W    7, 5, 3, 1, 0
+    mova    [tmpq+0*8], m7
+    mova    [tmpq+2*8], m5
+    mova    [tmpq+4*8], m3
+    mova    [tmpq+6*8], m1
+
+    add            inq, mmsize
+    add           tmpq, 64
+    dec           cntd
+    jg .loop_1
+
+    mov           cntd, 2
+    mov           tmpq, rsp
+.loop_2:
+    CAVS_IDCT8_1D tmpq, [pw_64]
+    psraw           m7, 7
+    psraw           m6, 7
+    psraw           m5, 7
+    psraw           m4, 7
+    psraw           m3, 7
+    psraw           m2, 7
+    psraw           m1, 7
+    psraw           m0, 7
+
+    mova   [outq+0*16], m7
+    mova   [outq+1*16], m5
+    mova   [outq+2*16], m3
+    mova   [outq+3*16], m1
+    mova   [outq+4*16], m0
+    mova   [outq+5*16], m2
+    mova   [outq+6*16], m4
+    mova   [outq+7*16], m6
+
+    add           outq, mmsize
+    add           tmpq, mmsize
+    dec           cntd
+    jg .loop_2
+
+    RET
+
+INIT_XMM sse2
+cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in
+    CAVS_IDCT8_1D  inq, [pw_4]
+    psraw           m7, 3
+    psraw           m6, 3
+    psraw           m5, 3
+    psraw           m4, 3
+    psraw           m3, 3
+    psraw           m2, 3
+    psraw           m1, 3
+    psraw           m0, 3
+%if ARCH_X86_64
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, 8
+    mova    [rsp+4*16], m0
+%else
+    mova    [rsp+0*16], m4
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1
+%endif
+    mova    [rsp+0*16], m7
+    mova    [rsp+2*16], m3
+    mova    [rsp+6*16], m4
+    CAVS_IDCT8_1D  rsp, [pw_64], 0
+    psraw           m7, 7
+    psraw           m6, 7
+    psraw           m5, 7
+    psraw           m4, 7
+    psraw           m3, 7
+    psraw           m2, 7
+    psraw           m1, 7
+    psraw           m0, 7
+
+    mova   [outq+0*16], m7
+    mova   [outq+1*16], m5
+    mova   [outq+2*16], m3
+    mova   [outq+3*16], m1
+    mova   [outq+4*16], m0
+    mova   [outq+5*16], m2
+    mova   [outq+6*16], m4
+    mova   [outq+7*16], m6
+    RET
--- a/externals/ffmpeg/libavcodec/x86/celt_pvq_init.c
+++ b/externals/ffmpeg/libavcodec/x86/celt_pvq_init.c
@@ -0,0 +1,43 @@
+/*
+ * Opus encoder assembly optimizations
+ * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/opus_pvq.h"
+
+extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
+extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
+extern float ff_pvq_search_exact_avx  (float *X, int *y, int K, int N);
+
+av_cold void ff_celt_pvq_init_x86(CeltPVQ *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->pvq_search = ff_pvq_search_approx_sse2;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        s->pvq_search = ff_pvq_search_approx_sse4;
+
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        s->pvq_search = ff_pvq_search_exact_avx;
+}
--- a/externals/ffmpeg/libavcodec/x86/celt_pvq_search.asm
+++ b/externals/ffmpeg/libavcodec/x86/celt_pvq_search.asm
@@ -0,0 +1,385 @@
+;******************************************************************************
+;* SIMD optimized Opus encoder DSP function
+;*
+;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "config.asm"
+%include "libavutil/x86/x86util.asm"
+
+%ifdef __NASM_VER__
+%use "smartalign"
+ALIGNMODE p6
+%endif
+
+SECTION_RODATA 64
+
+const_float_abs_mask:   times 8 dd 0x7fffffff
+const_align_abs_edge:   times 8 dd 0
+
+const_float_0_5:        times 8 dd 0.5
+const_float_1:          times 8 dd 1.0
+const_float_sign_mask:  times 8 dd 0x80000000
+
+const_int32_offsets:
+                        %rep 8
+                                dd $-const_int32_offsets
+                        %endrep
+SECTION .text
+
+;
+;   Setup High Register to be used
+;   for holding memory constants
+;
+; %1 - the register to be used, assmues it is >= mm8
+; %2 - name of the constant.
+;
+; Subsequent opcodes are going to use the constant in the form
+; "addps m0, mm_const_name" and it would be turned into:
+; "addps m0, [const_name]" on 32 bit arch or
+; "addps m0, m8" on 64 bit arch
+%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
+%if num_mmregs > 8
+    %define  mm_%3   %2
+    %{1}        %2, [%3]    ; movaps m8, [const_name]
+%else
+    %define  mm_%3  [%3]
+%endif
+%endmacro
+
+;
+;   Set Position Independent Code
+;       Base address of a constant
+; %1 - the register to be used, if PIC is set
+; %2 - name of the constant.
+;
+; Subsequent opcode are going to use the base address in the form
+; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
+; "movaps m0, [r5 + r4]" if PIC is enabled
+; "movaps m0, [constant_name + r4]" if texrel are used
+%macro SET_PIC_BASE 3; reg, const_label
+%ifdef PIC
+    %{1}     %2, [%3]      ; lea r5, [rip+const]
+    %define  pic_base_%3 %2
+%else
+    %define  pic_base_%3 %3
+%endif
+%endmacro
+
+%macro PULSES_SEARCH 1
+; m6 Syy_norm
+; m7 Sxy_norm
+    addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
+    pxor           m1, m1                   ; max_idx
+    xorps          m3, m3                   ; p_max
+    xor           r4d, r4d
+align 16
+%%distortion_search:
+    movd          xm2, dword r4d    ; movd zero extends
+%ifidn %1,add
+    movaps         m4, [tmpY + r4]  ; y[i]
+    movaps         m5, [tmpX + r4]  ; X[i]
+
+  %if USE_APPROXIMATION == 1
+    xorps          m0, m0
+    cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
+  %endif
+
+    addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
+    addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm
+
+  %if USE_APPROXIMATION == 1
+    andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
+  %endif
+
+%else
+    movaps         m5, [tmpY + r4]      ; m5 = y[i]
+
+    xorps          m0, m0               ; m0 = 0;
+    cmpps          m0, m0, m5, 1        ; m0 = (0<y)
+
+    subps          m4, m6, m5           ; m4 = Syy_new = Syy_norm - y[i]
+    subps          m5, m7, [tmpX + r4]  ; m5 = Sxy_new = Sxy_norm - X[i]
+    andps          m5, m0               ; (0<y)?m5:0
+%endif
+
+%if USE_APPROXIMATION == 1
+    rsqrtps        m4, m4
+    mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
+%else
+    mulps          m5, m5
+    divps          m5, m4           ; m5 = p = Sxy_new*Sxy_new/Syy
+%endif
+    VPBROADCASTD   m2, xm2          ; m2=i (all lanes get same values, we add the offset-per-lane, later)
+
+    cmpps          m0, m3, m5, 1    ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
+    maxps          m3, m5           ; m3=max(p_max,p)
+                                    ; maxps here is faster than blendvps, despite blend having lower latency.
+
+    pand           m2, m0           ; This version seems faster than sse41 pblendvb
+    pmaxsw         m1, m2           ; SSE2 signed word, so it would work for N < 32768/4
+
+    add           r4d, mmsize
+    cmp           r4d, Nd
+    jb   %%distortion_search
+
+    por            m1, mm_const_int32_offsets  ; max_idx offsets per individual lane (skipped in the inner loop)
+    movdqa         m4, m1                      ; needed for the aligned y[max_idx]+=1; processing
+
+%if mmsize >= 32
+; Merge parallel maximums round 8 (4 vs 4)
+
+    vextractf128  xm5, ym3, 1       ; xmm5 = ymm3[1x128] = ymm3[255..128b]
+    cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
+
+    vextracti128  xm2, ym1, 1       ; xmm2 = ymm1[1x128] = ymm1[255..128b]
+    BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
+    PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[1x128]       : p[0x128]
+%endif
+
+; Merge parallel maximums round 4 (2 vs 2)
+                                    ; m3=p[3210]
+    movhlps       xm5, xm3          ; m5=p[xx32]
+    cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
+
+    pshufd        xm2, xm1, q3232
+    BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
+    PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[3,2]       : p[1,0]
+
+; Merge parallel maximums final round (1 vs 1)
+    shufps        xm0, xm3, xm3, q1111  ; m0 = m3[1] = p[1]
+    cmpss         xm0, xm3, 5           ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
+
+    pshufd        xm2, xm1, q1111
+    PBLENDVB      xm1, xm2, xm0
+
+    movd    dword r4d, xm1          ; zero extends to the rest of r4q
+
+    VBROADCASTSS   m3, [tmpX + r4]
+    %{1}ps         m7, m3           ; Sxy += X[max_idx]
+
+    VBROADCASTSS   m5, [tmpY + r4]
+    %{1}ps         m6, m5           ; Syy += Y[max_idx]
+
+    ; We have to update a single element in Y[i]
+    ; However writing 4 bytes and then doing 16 byte load in the inner loop
+    ; could cause a stall due to breaking write forwarding.
+    VPBROADCASTD   m1, xm1
+    pcmpeqd        m1, m1, m4           ; exactly 1 element matches max_idx and this finds it
+
+    and           r4d, ~(mmsize-1)      ; align address down, so the value pointed by max_idx is inside a mmsize load
+    movaps         m5, [tmpY + r4]      ; m5 = Y[y3...ym...y0]
+    andps          m1, mm_const_float_1 ; m1 =  [ 0...1.0...0]
+    %{1}ps         m5, m1               ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
+    movaps [tmpY + r4], m5              ; Y[max_idx] +-= 1.0;
+%endmacro
+
+;
+; We need one more register for
+; PIC relative addressing. Use this
+; to count it in cglobal
+;
+%ifdef PIC
+  %define num_pic_regs 1
+%else
+  %define num_pic_regs 0
+%endif
+
+;
+; Pyramid Vector Quantization Search implementation
+;
+; float * inX   - Unaligned (SIMD) access, it will be overread,
+;                 but extra data is masked away.
+; int32 * outY  - Should be aligned and padded buffer.
+;                 It is used as temp buffer.
+; uint32 K      - Number of pulses to have after quantizations.
+; uint32 N      - Number of vector elements. Must be 0 < N < 256
+;
+%macro PVQ_FAST_SEARCH 1
+cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%define tmpX rsp
+%define tmpY outYq
+
+    movaps     m0, [const_float_abs_mask]
+    shl        Nd, 2    ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
+    mov       r4d, Nd
+
+    neg       r4d
+    and       r4d, mmsize-1
+
+    SET_PIC_BASE lea, r5, const_align_abs_edge  ; rip+const
+    movups     m2, [pic_base_const_align_abs_edge + r4 - mmsize]
+
+    add        Nd, r4d              ; N = align(N, mmsize)
+
+    lea       r4d, [Nd - mmsize]    ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
+    movups     m1, [inXq + r4]
+    andps      m1, m2
+    movaps  [tmpX + r4], m1         ; Sx = abs( X[N-1] )
+
+align 16
+%%loop_abs_sum:
+    sub       r4d, mmsize
+    jc   %%end_loop_abs_sum
+
+    movups     m2, [inXq + r4]
+    andps      m2, m0
+
+    movaps  [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
+    addps      m1, m2       ; Sx += abs(X[i])
+    jmp  %%loop_abs_sum
+
+align 16
+%%end_loop_abs_sum:
+
+    HSUMPS     m1, m2       ; m1  = Sx
+
+    xorps      m0, m0
+    comiss    xm0, xm1      ;
+    jz   %%zero_input       ; if (Sx==0) goto zero_input
+
+    cvtsi2ss  xm0, dword Kd ; m0 = K
+%if USE_APPROXIMATION == 1
+    rcpss     xm1, xm1      ; m1 = approx(1/Sx)
+    mulss     xm0, xm1      ; m0 = K*(1/Sx)
+%else
+    divss     xm0, xm1      ; b = K/Sx
+                            ; b = K/max_x
+%endif
+
+    VBROADCASTSS  m0, xm0
+
+    lea       r4d, [Nd - mmsize]
+    pxor       m5, m5             ; Sy    ( Sum of abs( y[i]) )
+    xorps      m6, m6             ; Syy   ( Sum of y[i]*y[i]  )
+    xorps      m7, m7             ; Sxy   ( Sum of X[i]*y[i]  )
+align 16
+%%loop_guess:
+    movaps     m1, [tmpX + r4]    ; m1   = X[i]
+    mulps      m2, m0, m1         ; m2   = res*X[i]
+    cvtps2dq   m2, m2             ; yt   = (int)lrintf( res*X[i] )
+    paddd      m5, m2             ; Sy  += yt
+    cvtdq2ps   m2, m2             ; yt   = (float)yt
+    mulps      m1, m2             ; m1   = X[i]*yt
+    movaps  [tmpY + r4], m2       ; y[i] = m2
+    addps      m7, m1             ; Sxy += m1;
+    mulps      m2, m2             ; m2   = yt*yt
+    addps      m6, m2             ; Syy += m2
+
+    sub       r4d, mmsize
+    jnc  %%loop_guess
+
+    HSUMPS     m6, m1       ; Syy_norm
+    HADDD      m5, m4       ; pulses
+
+    movd  dword r4d, xm5    ; zero extends to the rest of r4q
+
+    sub        Kd, r4d      ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
+    jz   %%finish           ; K - pulses == 0
+
+    SET_HI_REG_MM_CONSTANT movaps,  m8, const_float_0_5
+    SET_HI_REG_MM_CONSTANT movaps,  m9, const_float_1
+    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
+    ; Use Syy/2 in distortion parameter calculations.
+    ; Saves pre and post-caclulation to correct Y[] values.
+    ; Same precision, since float mantisa is normalized.
+    ; The SQRT approximation does differ.
+    HSUMPS     m7, m0         ; Sxy_norm
+    mulps      m6, mm_const_float_0_5
+
+    jc   %%remove_pulses_loop   ; K - pulses < 0
+
+align 16                        ; K - pulses > 0
+%%add_pulses_loop:
+
+    PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm
+
+    sub        Kd, 1
+    jnz  %%add_pulses_loop
+
+    addps      m6, m6 ; Syy*=2
+
+    jmp  %%finish
+
+align 16
+%%remove_pulses_loop:
+
+    PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm
+
+    add        Kd, 1
+    jnz  %%remove_pulses_loop
+
+    addps      m6, m6 ; Syy*=2
+
+align 16
+%%finish:
+    lea       r4d, [Nd - mmsize]
+    movaps     m2, [const_float_sign_mask]
+
+align 16
+%%restore_sign_loop:
+    movaps     m0, [tmpY + r4]    ; m0 = Y[i]
+    movups     m1, [inXq + r4]    ; m1 = X[i]
+    andps      m1, m2             ; m1 = sign(X[i])
+    orps       m0, m1             ; m0 = Y[i]*sign
+    cvtps2dq   m3, m0             ; m3 = (int)m0
+    movaps  [outYq + r4], m3
+
+    sub       r4d, mmsize
+    jnc  %%restore_sign_loop
+%%return:
+
+%if ARCH_X86_64 == 0    ; sbrdsp
+    movss     r0m, xm6  ; return (float)Syy_norm
+    fld dword r0m
+%else
+    movaps     m0, m6   ; return (float)Syy_norm
+%endif
+
+    RET
+
+align 16
+%%zero_input:
+    lea       r4d, [Nd - mmsize]
+    xorps      m0, m0
+%%zero_loop:
+    movaps  [outYq + r4], m0
+    sub       r4d, mmsize
+    jnc  %%zero_loop
+
+    movaps     m6, [const_float_1]
+    jmp  %%return
+%endmacro
+
+; if 1, use a float op that give half precision but execute for around 3 cycles.
+; On Skylake & Ryzen the division is much faster (around 11c/3),
+; that makes the full precision code about 2% slower.
+; Opus also does use rsqrt approximation in their intrinsics code.
+%define USE_APPROXIMATION   1
+
+INIT_XMM sse2
+PVQ_FAST_SEARCH _approx
+
+INIT_XMM sse4
+PVQ_FAST_SEARCH _approx
+
+%define USE_APPROXIMATION   0
+
+INIT_XMM avx
+PVQ_FAST_SEARCH _exact
--- a/externals/ffmpeg/libavcodec/x86/constants.c
+++ b/externals/ffmpeg/libavcodec/x86/constants.c
@@ -0,0 +1,94 @@
+/*
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h" // for xmm_reg
+#include "constants.h"
+
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+                                                    0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+                                                    0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
+DECLARE_ASM_ALIGNED(32, const ymm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL,
+                                                    0x0004000400040004ULL, 0x0004000400040004ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_20)   = { 0x0014001400140014ULL, 0x0014001400140014ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_255)  = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+                                                    0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
+                                                    0x0100010001000100ULL, 0x0100010001000100ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+                                                    0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+                                                    0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+                                                    0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+                                                    0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+                                                    0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+                                                    0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+                                                    0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+                                                    0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
+
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+                                                    0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+                                                    0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_2)    = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+                                                    0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+                                                    0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg,  ff_pb_15)   = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL,
+                                                    0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
+                                                    0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+                                                    0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+                                                    0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+                                                    0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+                                                    0x0000200000002000ULL, 0x0000200000002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+                                                    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
--- a/externals/ffmpeg/libavcodec/x86/constants.h
+++ b/externals/ffmpeg/libavcodec/x86/constants.h
@@ -0,0 +1,72 @@
+/*
+ * MMX/SSE constants used across x86 dsp optimizations.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_CONSTANTS_H
+#define AVCODEC_X86_CONSTANTS_H
+
+#include <stdint.h>
+
+#include "libavutil/x86/asm.h"
+
+extern const ymm_reg  ff_pw_1;
+extern const ymm_reg  ff_pw_2;
+extern const xmm_reg  ff_pw_3;
+extern const ymm_reg  ff_pw_4;
+extern const xmm_reg  ff_pw_5;
+extern const xmm_reg  ff_pw_8;
+extern const xmm_reg  ff_pw_9;
+extern const uint64_t ff_pw_15;
+extern const xmm_reg  ff_pw_16;
+extern const xmm_reg  ff_pw_18;
+extern const xmm_reg  ff_pw_20;
+extern const xmm_reg  ff_pw_32;
+extern const uint64_t ff_pw_42;
+extern const uint64_t ff_pw_53;
+extern const xmm_reg  ff_pw_64;
+extern const uint64_t ff_pw_96;
+extern const uint64_t ff_pw_128;
+extern const ymm_reg  ff_pw_255;
+extern const ymm_reg  ff_pw_256;
+extern const ymm_reg  ff_pw_512;
+extern const ymm_reg  ff_pw_1023;
+extern const ymm_reg  ff_pw_1024;
+extern const ymm_reg  ff_pw_2048;
+extern const ymm_reg  ff_pw_4095;
+extern const ymm_reg  ff_pw_4096;
+extern const ymm_reg  ff_pw_8192;
+extern const ymm_reg  ff_pw_m1;
+
+extern const ymm_reg  ff_pb_0;
+extern const ymm_reg  ff_pb_1;
+extern const ymm_reg  ff_pb_2;
+extern const ymm_reg  ff_pb_3;
+extern const ymm_reg  ff_pb_80;
+extern const ymm_reg  ff_pb_FE;
+extern const uint64_t ff_pb_FC;
+
+extern const xmm_reg  ff_ps_neg;
+
+extern const ymm_reg  ff_pd_1;
+extern const ymm_reg  ff_pd_16;
+extern const ymm_reg  ff_pd_32;
+extern const ymm_reg  ff_pd_8192;
+extern const ymm_reg  ff_pd_65535;
+
+#endif /* AVCODEC_X86_CONSTANTS_H */
--- a/externals/ffmpeg/libavcodec/x86/dcadsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/dcadsp.asm
@@ -0,0 +1,301 @@
+;******************************************************************************
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3))
+
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 1
+    sub     lfeq, 7*sizeof_float
+    mov    cnt1d, 32*sizeof_float
+    mov    cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    lea   coeffq, [coeffq+cnt1q*8]
+    add samplesq, cnt1q
+    neg    cnt1q
+
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq+16]
+    cvtdq2ps  m5, [lfeq   ]
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
+%elif cpuflag(sse2)
+    movu      m4, [lfeq+16]
+    movu      m5, [lfeq   ]
+    cvtdq2ps  m4, m4
+    cvtdq2ps  m5, m5
+    pshufd    m7, m4, q0123
+    pshufd    m6, m5, q0123
+%else
+    cvtpi2ps  m4, [lfeq+16]
+    cvtpi2ps  m0, [lfeq+24]
+    cvtpi2ps  m5, [lfeq   ]
+    cvtpi2ps  m1, [lfeq+8 ]
+    shufps    m4, m0, q1010
+    shufps    m5, m1, q1010
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
+%endif
+
+.inner_loop:
+%if ARCH_X86_64
+    movaps    m8, [coeffq+cnt1q*8   ]
+    movaps    m9, [coeffq+cnt1q*8+16]
+    movaps   m10, [coeffq+cnt1q*8+32]
+    movaps   m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+    movaps   m12, [coeffq+cnt1q*8+64]
+    movaps   m13, [coeffq+cnt1q*8+80]
+    movaps   m14, [coeffq+cnt1q*8+96]
+    movaps   m15, [coeffq+cnt1q*8+112]
+    mulps     m0, m7, m8
+    mulps     m1, m7, m10
+    mulps     m2, m7, m12
+    mulps     m3, m7, m14
+    fmaddps   m0, m6, m9, m0
+    fmaddps   m1, m6, m11, m1
+    fmaddps   m2, m6, m13, m2
+    fmaddps   m3, m6, m15, m3
+
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
+%else
+    mulps     m0, m7, m8
+    mulps     m1, m6, m9
+    mulps     m2, m7, m10
+    mulps     m3, m6, m11
+    addps     m0, m1
+    addps     m2, m3
+
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m7, [coeffq+cnt1q*8    ]
+    mulps     m1, m7, [coeffq+cnt1q*8+32 ]
+    mulps     m2, m7, [coeffq+cnt1q*8+64 ]
+    mulps     m3, m7, [coeffq+cnt1q*8+96 ]
+    fmaddps   m0, m6, [coeffq+cnt1q*8+16 ], m0
+    fmaddps   m1, m6, [coeffq+cnt1q*8+48 ], m1
+    fmaddps   m2, m6, [coeffq+cnt1q*8+80 ], m2
+    fmaddps   m3, m6, [coeffq+cnt1q*8+112], m3
+
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
+%else
+    mulps     m0, m7, [coeffq+cnt1q*8   ]
+    mulps     m1, m6, [coeffq+cnt1q*8+16]
+    mulps     m2, m7, [coeffq+cnt1q*8+32]
+    mulps     m3, m6, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif
+%endif; ARCH
+
+%if ARCH_X86_64
+%if cpuflag(fma3)
+    mulps     m8, m5
+    mulps    m10, m5
+    mulps    m12, m5
+    mulps    m14, m5
+    fmaddps   m8, m4, m9, m8
+    fmaddps  m10, m4, m11, m10
+    fmaddps  m12, m4, m13, m12
+    fmaddps  m14, m4, m15, m14
+
+    haddps   m10, m8
+    haddps   m14, m12
+    haddps   m14, m10
+    movaps [samplesq+cnt2q], m14
+%else
+    mulps     m8, m5
+    mulps     m9, m4
+    mulps    m10, m5
+    mulps    m11, m4
+    addps     m8, m9
+    addps    m10, m11
+
+    unpckhps m11, m10, m8
+    unpcklps m10, m8
+    addps    m11, m10
+    movhlps   m8, m11
+    addps     m8, m11
+    movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m5, [coeffq+cnt1q*8    ]
+    mulps     m1, m5, [coeffq+cnt1q*8+32 ]
+    mulps     m2, m5, [coeffq+cnt1q*8+64 ]
+    mulps     m3, m5, [coeffq+cnt1q*8+96 ]
+    fmaddps   m0, m4, [coeffq+cnt1q*8+16 ], m0
+    fmaddps   m1, m4, [coeffq+cnt1q*8+48 ], m1
+    fmaddps   m2, m4, [coeffq+cnt1q*8+80 ], m2
+    fmaddps   m3, m4, [coeffq+cnt1q*8+112], m3
+
+    haddps    m1, m0
+    haddps    m3, m2
+    haddps    m3, m1
+    movaps [samplesq+cnt2q], m3
+%else
+    mulps     m0, m5, [coeffq+cnt1q*8   ]
+    mulps     m1, m4, [coeffq+cnt1q*8+16]
+    mulps     m2, m5, [coeffq+cnt1q*8+32]
+    mulps     m3, m4, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+
+    unpckhps  m3, m2, m0
+    unpcklps  m2, m0
+    addps     m3, m2
+    movhlps   m0, m3
+    addps     m0, m3
+    movlps [samplesq+cnt2q], m0
+%endif
+%endif; ARCH
+
+    sub    cnt2d, 8 + FMA3_OFFSET
+    add    cnt1q, 8 + FMA3_OFFSET
+    jl .inner_loop
+
+    add     lfeq, 4
+    add samplesq,  64*sizeof_float
+    mov    cnt1q, -32*sizeof_float
+    mov    cnt2d,  32*sizeof_float-8-FMA3_OFFSET
+    sub nblocksd, 1
+    jg .loop
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+LFE_FIR0_FLOAT
+%endif
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
+%endif
+
+%macro LFE_FIR1_FLOAT 0
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 2
+    sub     lfeq, 3*sizeof_float
+    mov    cnt1d, 64*sizeof_float
+    mov    cnt2d, 64*sizeof_float-16
+    lea   coeffq, [coeffq+cnt1q*4]
+    add samplesq, cnt1q
+    neg    cnt1q
+
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq]
+    shufps    m5, m4, m4, q0123
+%elif cpuflag(sse2)
+    movu      m4, [lfeq]
+    cvtdq2ps  m4, m4
+    pshufd    m5, m4, q0123
+%endif
+
+.inner_loop:
+    movaps    m6, [coeffq+cnt1q*4   ]
+    movaps    m7, [coeffq+cnt1q*4+16]
+    mulps     m0, m5, m6
+    mulps     m1, m5, m7
+%if ARCH_X86_64
+    movaps    m8, [coeffq+cnt1q*4+32]
+    movaps    m9, [coeffq+cnt1q*4+48]
+    mulps     m2, m5, m8
+    mulps     m3, m5, m9
+%else
+    mulps     m2, m5, [coeffq+cnt1q*4+32]
+    mulps     m3, m5, [coeffq+cnt1q*4+48]
+%endif
+
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
+
+    mulps     m6, m4
+    mulps     m7, m4
+%if ARCH_X86_64
+    mulps     m8, m4
+    mulps     m9, m4
+
+    haddps    m6, m7
+    haddps    m8, m9
+    haddps    m6, m8
+%else
+    mulps     m2, m4, [coeffq+cnt1q*4+32]
+    mulps     m3, m4, [coeffq+cnt1q*4+48]
+
+    haddps    m6, m7
+    haddps    m2, m3
+    haddps    m6, m2
+%endif
+    movaps [samplesq+cnt2q], m6
+
+    sub    cnt2d, 16
+    add    cnt1q, 16
+    jl .inner_loop
+
+    add     lfeq, sizeof_float
+    add samplesq, 128*sizeof_float
+    mov    cnt1q, -64*sizeof_float
+    mov    cnt2d,  64*sizeof_float-16
+    sub nblocksd, 1
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
+%endif
--- a/externals/ffmpeg/libavcodec/x86/dcadsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/dcadsp_init.c
@@ -0,0 +1,52 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dcadsp.h"
+
+#define LFE_FIR_FLOAT_FUNC(opt)                                               \
+void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks);
+
+LFE_FIR_FLOAT_FUNC(sse)
+LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
+LFE_FIR_FLOAT_FUNC(avx)
+LFE_FIR_FLOAT_FUNC(fma3)
+
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
+    if (EXTERNAL_SSE3(cpu_flags))
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+    if (EXTERNAL_AVX(cpu_flags)) {
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
+    }
+    if (EXTERNAL_FMA3(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
+}
--- a/externals/ffmpeg/libavcodec/x86/dct32.asm
+++ b/externals/ffmpeg/libavcodec/x86/dct32.asm
@@ -0,0 +1,491 @@
+;******************************************************************************
+;* 32 point SSE-optimized DCT transform
+;* Copyright (c) 2010 Vitor Sessak
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
+
+ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
+            dd   0.553104,  0.582935,  0.622504,  0.674808
+            dd -10.190008, -3.407609, -2.057781, -1.484165
+            dd  -1.169440, -0.972568, -0.839350, -0.744536
+            dd   0.502419,  0.522499,  0.566944,  0.646822
+            dd   0.788155,  1.060678,  1.722447,  5.101149
+            dd   0.509796,  0.601345,  0.899976,  2.562916
+            dd   0.509796,  0.601345,  0.899976,  2.562916
+            dd   1.000000,  1.000000,  1.306563,  0.541196
+            dd   1.000000,  1.000000,  1.306563,  0.541196
+            dd   1.000000,  0.707107,  1.000000, -0.707107
+            dd   1.000000,  0.707107,  1.000000, -0.707107
+            dd   0.707107,  0.707107,  0.707107,  0.707107
+
+%macro BUTTERFLY 4
+    subps  %4, %1, %2
+    addps  %2, %2, %1
+    mulps  %1, %4, %3
+%endmacro
+
+%macro BUTTERFLY0 5
+%if cpuflag(sse2) && notcpuflag(avx)
+    pshufd %4, %1, %5
+    xorps  %1, %2
+    addps  %1, %4
+    mulps  %1, %3
+%else
+    shufps %4, %1, %1, %5
+    xorps  %1, %1, %2
+    addps  %4, %4, %1
+    mulps  %1, %4, %3
+%endif
+%endmacro
+
+%macro BUTTERFLY2 4
+    BUTTERFLY0 %1, %2, %3, %4, 0x1b
+%endmacro
+
+%macro BUTTERFLY3 4
+    BUTTERFLY0 %1, %2, %3, %4, 0xb1
+%endmacro
+
+%macro BUTTERFLY3V 5
+    movaps m%5, m%1
+    addps  m%1, m%2
+    subps  m%5, m%2
+    SWAP %2, %5
+    mulps  m%2, [ps_cos_vec+192]
+    movaps m%5, m%3
+    addps  m%3, m%4
+    subps  m%4, m%5
+    mulps  m%4, [ps_cos_vec+192]
+%endmacro
+
+%macro PASS6_AND_PERMUTE 0
+    mov         tmpd, [outq+4]
+    movss         m7, [outq+72]
+    addss         m7, [outq+76]
+    movss         m3, [outq+56]
+    addss         m3, [outq+60]
+    addss         m4, m3
+    movss         m2, [outq+52]
+    addss         m2, m3
+    movss         m3, [outq+104]
+    addss         m3, [outq+108]
+    addss         m1, m3
+    addss         m5, m4
+    movss [outq+ 16], m1
+    movss         m1, [outq+100]
+    addss         m1, m3
+    movss         m3, [outq+40]
+    movss [outq+ 48], m1
+    addss         m3, [outq+44]
+    movss         m1, [outq+100]
+    addss         m4, m3
+    addss         m3, m2
+    addss         m1, [outq+108]
+    movss [outq+ 40], m3
+    addss         m2, [outq+36]
+    movss         m3, [outq+8]
+    movss [outq+ 56], m2
+    addss         m3, [outq+12]
+    movss [outq+ 32], m3
+    movss         m3, [outq+80]
+    movss [outq+  8], m5
+    movss [outq+ 80], m1
+    movss         m2, [outq+52]
+    movss         m5, [outq+120]
+    addss         m5, [outq+124]
+    movss         m1, [outq+64]
+    addss         m2, [outq+60]
+    addss         m0, m5
+    addss         m5, [outq+116]
+    mov    [outq+64], tmpd
+    addss         m6, m0
+    addss         m1, m6
+    mov         tmpd, [outq+12]
+    mov   [outq+ 96], tmpd
+    movss [outq+  4], m1
+    movss         m1, [outq+24]
+    movss [outq+ 24], m4
+    movss         m4, [outq+88]
+    addss         m4, [outq+92]
+    addss         m3, m4
+    addss         m4, [outq+84]
+    mov         tmpd, [outq+108]
+    addss         m1, [outq+28]
+    addss         m0, m1
+    addss         m1, m5
+    addss         m6, m3
+    addss         m3, m0
+    addss         m0, m7
+    addss         m5, [outq+20]
+    addss         m7, m1
+    movss [outq+ 12], m6
+    mov   [outq+112], tmpd
+    movss         m6, [outq+28]
+    movss [outq+ 28], m0
+    movss         m0, [outq+36]
+    movss [outq+ 36], m7
+    addss         m1, m4
+    movss         m7, [outq+116]
+    addss         m0, m2
+    addss         m7, [outq+124]
+    movss [outq+ 72], m0
+    movss         m0, [outq+44]
+    addss         m2, m0
+    movss [outq+ 44], m1
+    movss [outq+ 88], m2
+    addss         m0, [outq+60]
+    mov         tmpd, [outq+60]
+    mov   [outq+120], tmpd
+    movss [outq+104], m0
+    addss         m4, m5
+    addss         m5, [outq+68]
+    movss  [outq+52], m4
+    movss  [outq+60], m5
+    movss         m4, [outq+68]
+    movss         m5, [outq+20]
+    movss [outq+ 20], m3
+    addss         m5, m7
+    addss         m7, m6
+    addss         m4, m5
+    movss         m2, [outq+84]
+    addss         m2, [outq+92]
+    addss         m5, m2
+    movss [outq+ 68], m4
+    addss         m2, m7
+    movss         m4, [outq+76]
+    movss [outq+ 84], m2
+    movss [outq+ 76], m5
+    addss         m7, m4
+    addss         m6, [outq+124]
+    addss         m4, m6
+    addss         m6, [outq+92]
+    movss [outq+100], m4
+    movss [outq+108], m6
+    movss         m6, [outq+92]
+    movss  [outq+92], m7
+    addss         m6, [outq+124]
+    movss [outq+116], m6
+%endmacro
+
+INIT_YMM avx
+SECTION .text
+%if HAVE_AVX_EXTERNAL
+; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
+cglobal dct32_float, 2,3,8, out, in, tmp
+    ; pass 1
+    vmovaps     m4, [inq+0]
+    vinsertf128 m5, m5, [inq+96], 1
+    vinsertf128 m5, m5, [inq+112], 0
+    vshufps     m5, m5, m5, 0x1b
+    BUTTERFLY   m4, m5, [ps_cos_vec], m6
+
+    vmovaps     m2, [inq+64]
+    vinsertf128 m6, m6, [inq+32], 1
+    vinsertf128 m6, m6, [inq+48], 0
+    vshufps     m6, m6, m6, 0x1b
+    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0
+
+    ; pass 2
+
+    BUTTERFLY  m5, m6, [ps_cos_vec+64], m0
+    BUTTERFLY  m4, m2, [ps_cos_vec+64], m7
+
+
+    ; pass 3
+    vperm2f128  m3, m6, m4, 0x31
+    vperm2f128  m1, m6, m4, 0x20
+    vshufps     m3, m3, m3, 0x1b
+
+    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6
+
+
+    vperm2f128  m4, m5, m2, 0x20
+    vperm2f128  m5, m5, m2, 0x31
+    vshufps     m5, m5, m5, 0x1b
+
+    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6
+
+    ; pass 4
+    vmovaps m6, [ps_p1p1m1m1+0]
+    vmovaps m2, [ps_cos_vec+128]
+
+    BUTTERFLY2  m5, m6, m2, m7
+    BUTTERFLY2  m4, m6, m2, m7
+    BUTTERFLY2  m1, m6, m2, m7
+    BUTTERFLY2  m3, m6, m2, m7
+
+
+    ; pass 5
+    vshufps m6, m6, m6, 0xcc
+    vmovaps m2, [ps_cos_vec+160]
+
+    BUTTERFLY3  m5, m6, m2, m7
+    BUTTERFLY3  m4, m6, m2, m7
+    BUTTERFLY3  m1, m6, m2, m7
+    BUTTERFLY3  m3, m6, m2, m7
+
+    vperm2f128  m6, m3, m3, 0x31
+    vmovaps [outq], m3
+
+    vextractf128  [outq+64], m5, 1
+    vextractf128  [outq+32], m5, 0
+
+    vextractf128  [outq+80], m4, 1
+    vextractf128  [outq+48], m4, 0
+
+    vperm2f128  m0, m1, m1, 0x31
+    vmovaps [outq+96], m1
+
+    vzeroupper
+
+    ;    pass 6, no SIMD...
+INIT_XMM
+    PASS6_AND_PERMUTE
+    RET
+%endif
+
+%if ARCH_X86_64
+%define SPILL SWAP
+%define UNSPILL SWAP
+
+%macro PASS5 0
+    nop ; FIXME code alignment
+    SWAP 5, 8
+    SWAP 4, 12
+    SWAP 6, 14
+    SWAP 7, 13
+    SWAP 0, 15
+    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
+    TRANSPOSE4x4PS 8, 9, 10, 11, 0
+    BUTTERFLY3V    8, 9, 10, 11, 0
+    addps   m10, m11
+    TRANSPOSE4x4PS 12, 13, 14, 15, 0
+    BUTTERFLY3V    12, 13, 14, 15, 0
+    addps   m14, m15
+    addps   m12, m14
+    addps   m14, m13
+    addps   m13, m15
+%endmacro
+
+%macro PASS6 0
+    SWAP 9, 12
+    SWAP 11, 14
+    movss [outq+0x00], m8
+    pshuflw m0, m8, 0xe
+    movss [outq+0x10], m9
+    pshuflw m1, m9, 0xe
+    movss [outq+0x20], m10
+    pshuflw m2, m10, 0xe
+    movss [outq+0x30], m11
+    pshuflw m3, m11, 0xe
+    movss [outq+0x40], m12
+    pshuflw m4, m12, 0xe
+    movss [outq+0x50], m13
+    pshuflw m5, m13, 0xe
+    movss [outq+0x60], m14
+    pshuflw m6, m14, 0xe
+    movaps [outq+0x70], m15
+    pshuflw m7, m15, 0xe
+    addss   m0, m1
+    addss   m1, m2
+    movss [outq+0x08], m0
+    addss   m2, m3
+    movss [outq+0x18], m1
+    addss   m3, m4
+    movss [outq+0x28], m2
+    addss   m4, m5
+    movss [outq+0x38], m3
+    addss   m5, m6
+    movss [outq+0x48], m4
+    addss   m6, m7
+    movss [outq+0x58], m5
+    movss [outq+0x68], m6
+    movss [outq+0x78], m7
+
+    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
+    movhlps m0, m1
+    pshufd  m1, m1, 3
+    SWAP 0, 2, 4, 6, 8, 10, 12, 14
+    SWAP 1, 3, 5, 7, 9, 11, 13, 15
+%rep 7
+    movhlps m0, m1
+    pshufd  m1, m1, 3
+    addss   m15, m1
+    SWAP 0, 2, 4, 6, 8, 10, 12, 14
+    SWAP 1, 3, 5, 7, 9, 11, 13, 15
+%endrep
+%assign i 4
+%rep 15
+    addss m0, m1
+    movss [outq+i], m0
+    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    %assign i i+8
+%endrep
+%endmacro
+
+%else ; ARCH_X86_32
+%macro SPILL 2 ; xmm#, mempos
+    movaps [outq+(%2-8)*16], m%1
+%endmacro
+%macro UNSPILL 2
+    movaps m%1, [outq+(%2-8)*16]
+%endmacro
+
+%define PASS6 PASS6_AND_PERMUTE
+%macro PASS5 0
+    movaps      m2, [ps_cos_vec+160]
+    shufps      m3, m3, 0xcc
+
+    BUTTERFLY3  m5, m3, m2, m1
+    SPILL 5, 8
+
+    UNSPILL 1, 9
+    BUTTERFLY3  m1, m3, m2, m5
+    SPILL 1, 14
+
+    BUTTERFLY3  m4, m3, m2, m5
+    SPILL 4, 12
+
+    BUTTERFLY3  m7, m3, m2, m5
+    SPILL 7, 13
+
+    UNSPILL 5, 10
+    BUTTERFLY3  m5, m3, m2, m7
+    SPILL 5, 10
+
+    UNSPILL 4, 11
+    BUTTERFLY3  m4, m3, m2, m7
+    SPILL 4, 11
+
+    BUTTERFLY3  m6, m3, m2, m7
+    SPILL 6, 9
+
+    BUTTERFLY3  m0, m3, m2, m7
+    SPILL 0, 15
+%endmacro
+%endif
+
+
+; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
+%macro DCT32_FUNC 0
+cglobal dct32_float, 2, 3, 16, out, in, tmp
+    ; pass 1
+
+    movaps      m0, [inq+0]
+    LOAD_INV    m1, [inq+112]
+    BUTTERFLY   m0, m1, [ps_cos_vec], m3
+
+    movaps      m7, [inq+64]
+    LOAD_INV    m4, [inq+48]
+    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
+
+    ; pass 2
+    movaps      m2, [ps_cos_vec+64]
+    BUTTERFLY   m1, m4, m2, m3
+    SPILL 1, 11
+    SPILL 4, 8
+
+    ; pass 1
+    movaps      m1, [inq+16]
+    LOAD_INV    m6, [inq+96]
+    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
+
+    movaps      m4, [inq+80]
+    LOAD_INV    m5, [inq+32]
+    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
+
+    ; pass 2
+    BUTTERFLY   m0, m7, m2, m3
+
+    movaps      m2, [ps_cos_vec+80]
+    BUTTERFLY   m6, m5, m2, m3
+
+    BUTTERFLY   m1, m4, m2, m3
+
+    ; pass 3
+    movaps      m2, [ps_cos_vec+96]
+    shufps      m1, m1, 0x1b
+    BUTTERFLY   m0, m1, m2, m3
+    SPILL 0, 15
+    SPILL 1, 14
+
+    UNSPILL 0, 8
+    shufps      m5, m5, 0x1b
+    BUTTERFLY   m0, m5, m2, m3
+
+    UNSPILL 1, 11
+    shufps      m6, m6, 0x1b
+    BUTTERFLY   m1, m6, m2, m3
+    SPILL 1, 11
+
+    shufps      m4, m4, 0x1b
+    BUTTERFLY   m7, m4, m2, m3
+
+    ; pass 4
+    movaps      m3, [ps_p1p1m1m1+0]
+    movaps      m2, [ps_cos_vec+128]
+
+    BUTTERFLY2  m5, m3, m2, m1
+
+    BUTTERFLY2  m0, m3, m2, m1
+    SPILL 0, 9
+
+    BUTTERFLY2  m6, m3, m2, m1
+    SPILL 6, 10
+
+    UNSPILL 0, 11
+    BUTTERFLY2  m0, m3, m2, m1
+    SPILL 0, 11
+
+    BUTTERFLY2  m4, m3, m2, m1
+
+    BUTTERFLY2  m7, m3, m2, m1
+
+    UNSPILL 6, 14
+    BUTTERFLY2  m6, m3, m2, m1
+
+    UNSPILL 0, 15
+    BUTTERFLY2  m0, m3, m2, m1
+
+    PASS5
+    PASS6
+    RET
+%endmacro
+
+%macro LOAD_INV 2
+%if cpuflag(sse2)
+    pshufd      %1, %2, 0x1b
+%elif cpuflag(sse)
+    movaps      %1, %2
+    shufps      %1, %1, 0x1b
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+DCT32_FUNC
+%endif
+
+INIT_XMM sse2
+DCT32_FUNC
--- a/externals/ffmpeg/libavcodec/x86/dct_init.c
+++ b/externals/ffmpeg/libavcodec/x86/dct_init.c
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dct.h"
+
+void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
+
+av_cold void ff_dct_init_x86(DCTContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+    if (EXTERNAL_SSE(cpu_flags))
+        s->dct32 = ff_dct32_float_sse;
+#endif
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->dct32 = ff_dct32_float_sse2;
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        s->dct32 = ff_dct32_float_avx;
+}
--- a/externals/ffmpeg/libavcodec/x86/dirac_dwt.asm
+++ b/externals/ffmpeg/libavcodec/x86/dirac_dwt.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* x86 optimized discrete wavelet trasnform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+SECTION .text
+
+; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
+%macro COMPOSE_53iL0 4
+    paddw   %2, %3
+    paddw   %2, %4
+    psraw   %2, 2
+    psubw   %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered  m3: pw_8  m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+    paddw   m0, %3
+    paddw   m1, %2
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+%if %0 > 3
+    movu    %1, %4
+%endif
+    psrad   m1, 4
+    psrad   m2, 4
+    packssdw m1, m2
+    paddw   m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+    mova    m2, [pw_2]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b0q+2*widthq]
+    mova    m0, [b1q+2*widthq]
+    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+    mova    m1, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    paddw   m0, [b2q+2*widthq]
+    paddw   m0, m1
+    psraw   m0, 1
+    paddw   m0, [b1q+2*widthq]
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                               IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+    mova    [b2q+2*widthq], m1
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_16]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    mova    m5, [b2q+2*widthq]
+    paddw   m0, [b4q+2*widthq]
+    paddw   m1, [b3q+2*widthq]
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+    psrad   m1, 5
+    psrad   m2, 5
+    packssdw m1, m2
+    psubw   m5, m1
+    mova    [b2q+2*widthq], m5
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+    mova    m3, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b1q+2*widthq]
+    mova    m0, [b0q+2*widthq]
+    mova    m2, m1
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [b0q+2*widthq], m0
+    paddw   m2, m0
+    mova    [b1q+2*widthq], m2
+    jg      .loop
+    REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+    mov     %3, [tmpq]
+%assign %%i 1
+%rep %1
+    mov     [tmpq-2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+    mov     %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+    mov     [tmpq+2*w2q+2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xq, xq
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    mova    m3, [pw_1]
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [tmpq + 2*xq], m0
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .lowpass_loop
+
+    xor     xq, xq
+    and    w2q, ~(mmsize/2 - 1)
+    cmp    w2q, mmsize/2
+    jl      .end
+
+.highpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [tmpq  + 2*xq]
+    paddw   m1, m0
+
+    ; shift and interleave
+%if %2 == 1
+    paddw   m0, m3
+    paddw   m1, m3
+    psraw   m0, 1
+    psraw   m1, 1
+%endif
+    mova    m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m0
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .highpass_loop
+.end:
+    REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xd, xd
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    movu    m4, [bq+wq]
+    mova    m7, [pw_2]
+    pslldq  m4, 14
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    mova    m2, m1
+    palignr m1, m4, 14
+    mova    m4, m2
+    COMPOSE_53iL0 m0, m1, m2, m7
+    mova    [tmpq + 2*xq], m0
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .lowpass_loop
+
+    EDGE_EXTENSION 1, 2, xw
+    ; leave the last up to 7 (sse) or 3 (mmx) values for C
+    xor     xd, xd
+    and    w2d, ~(mmsize/2 - 1)
+    cmp    w2d, mmsize/2
+    jl      .end
+
+    mova    m7, [tmpq-mmsize]
+    mova    m0, [tmpq]
+    mova    m5, [pw_1]
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+.highpass_loop:
+    mova    m6, m0
+    palignr m0, m7, 14
+    mova    m7, [tmpq + 2*xq + 16]
+    mova    m1, m7
+    mova    m2, m7
+    palignr m1, m6, 2
+    palignr m2, m6, 4
+    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+    mova    m0, m7
+    mova    m7, m6
+
+    ; shift and interleave
+    paddw   m6, m5
+    paddw   m1, m5
+    psraw   m6, 1
+    psraw   m1, 1
+    mova    m2, m6
+    punpcklwd m6, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m6
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .highpass_loop
+.end:
+    REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
--- a/externals/ffmpeg/libavcodec/x86/dirac_dwt_init.c
+++ b/externals/ffmpeg/libavcodec/x86/dirac_dwt_init.c
@@ -0,0 +1,229 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+                                           uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+    int16_t *b3 = (int16_t *)_b3; \
+    int16_t *b4 = (int16_t *)_b4; \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+                                          uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+    int16_t *b3 = (int16_t *)_b3; \
+    int16_t *b4 = (int16_t *)_b4; \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+\
+    for(i=width_align; i<width; i++) { \
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+    } \
+\
+    ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    int16_t *b = (int16_t *)_b; \
+    int16_t *tmp = (int16_t *)_tmp; \
+\
+    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = tmp[x];\
+        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+    }\
+}\
+static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    int16_t *b = (int16_t *)_b; \
+    int16_t *tmp = (int16_t *)_tmp; \
+\
+    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = (tmp[x] + 1)>>1;\
+        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+    }\
+}\
+\
+
+#if HAVE_X86ASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    int w2= w>>1;
+    int x= w2 - (w2&7);
+    int16_t *b = (int16_t *)_b;
+    int16_t *tmp = (int16_t *)_tmp;
+
+    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+    for (; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+#endif
+
+void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_X86ASM
+  int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+    if (!(mm_flags & AV_CPU_FLAG_MMX))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar0i_mmx;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar1i_mmx;
+        break;
+    }
+#endif
+
+    if (!(mm_flags & AV_CPU_FLAG_SSE2))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar0i_sse2;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar1i_sse2;
+        break;
+    }
+
+    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+        break;
+    }
+#endif // HAVE_X86ASM
+}
--- a/externals/ffmpeg/libavcodec/x86/diracdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/diracdsp.asm
@@ -0,0 +1,348 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit:                times 8 dw 0x3ff
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+SECTION .text
+
+%macro UNPACK_ADD 6
+    mov%5   %1, %3
+    mov%6   m5, %4
+    mova    m4, %1
+    mova    %2, m5
+    punpcklbw %1, m7
+    punpcklbw m5, m7
+    punpckhbw m4, m7
+    punpckhbw %2, m7
+    paddw   %1, m5
+    paddw   %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+    mov     src0q, srcq
+    lea     stridex3q, [3*strideq]
+    sub     src0q, stridex3q
+    pxor    m7, m7
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq], m0
+    add     dstq, mmsize
+    add     srcq, mmsize
+    add     src0q, mmsize
+    sub     widthd, mmsize
+    jg      .loop
+    RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+    dec     widthd
+    pxor    m7, m7
+    and     widthd, ~(mmsize-1)
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq + widthq], m0
+    sub     widthd, mmsize
+    jge     .loop
+    RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+    mova    m0, [pb_80]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   dst_strideq, dst_strided
+    movsxd   src_strideq, src_strided
+    mov   r7d, r5m
+    mov   r8d, wd
+    %define wspill r8d
+    %define hd r7d
+%else
+    mov    r4m, wd
+    %define wspill r4m
+    %define hd r5mp
+%endif
+
+.loopy:
+    lea     src2q, [srcq+src_strideq]
+    lea     dst2q, [dstq+dst_strideq]
+.loopx:
+    sub      wd, mmsize
+    mova     m1, [srcq +2*wq]
+    mova     m2, [src2q+2*wq]
+    packsswb m1, [srcq +2*wq+mmsize]
+    packsswb m2, [src2q+2*wq+mmsize]
+    paddb    m1, m0
+    paddb    m2, m0
+    mova    [dstq +wq], m1
+    mova    [dst2q+wq], m2
+    jg      .loopx
+
+    lea   srcq, [srcq+src_strideq*2]
+    lea   dstq, [dstq+dst_strideq*2]
+    sub     hd, 2
+    mov     wd, wspill
+    jg      .loopy
+    RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+    mova    m0, [pw_32]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   strideq, strided
+    movsxd   idwt_strideq, idwt_strided
+    mov   r8d, wd
+    %define wspill r8d
+%else
+    mov    r5m, wd
+    %define wspill r5m
+%endif
+
+.loop:
+    sub     wd, mmsize
+    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
+    paddw   m1, m0
+    psraw   m1, 6
+    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+    paddw   m2, m0
+    psraw   m2, 6
+    paddw   m1, [idwtq+2*wq]
+    paddw   m2, [idwtq+2*wq+mmsize]
+    packuswb m1, m2
+    mova    [dstq +wq], m1
+    jg      .loop
+
+    lea   srcq, [srcq + 2*strideq]
+    add   dstq, strideq
+    lea  idwtq, [idwtq+ 2*idwt_strideq]
+    sub     hd, 1
+    mov     wd, wspill
+    jg      .loop
+    RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+    pxor        m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+    mova        m0, [srcq+i]
+    mova        m1, m0
+    punpcklbw   m0, m4
+    punpckhbw   m1, m4
+    mova        m2, [obmcq+i]
+    mova        m3, m2
+   punpcklbw   m2, m4
+    punpckhbw   m3, m4
+    pmullw      m0, m2
+    pmullw      m1, m3
+    movu        m2, [dstq+2*i]
+    movu        m3, [dstq+2*i+mmsize]
+    paddw       m0, m2
+    paddw       m1, m3
+    movu        [dstq+2*i], m0
+    movu        [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+    lea         srcq, [srcq+strideq]
+    lea         dstq, [dstq+2*strideq]
+    add         obmcq, 32
+    sub         yblend, 1
+    jg          .loop
+    RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
+
+INIT_XMM sse4
+
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
+cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
+    movd   m2, qfd
+    movd   m3, qsd
+    SPLATD m2
+    SPLATD m3
+    mov    r4d, tot_hd
+    mov    r3, dstq
+
+    .loop_v:
+    mov    tot_hq, r4
+    mov    dstq,   r3
+
+    .loop_h:
+    movu   m0, [srcq]
+
+    pabsd  m1, m0
+    pmulld m1, m2
+    paddd  m1, m3
+    psrld  m1,  2
+    psignd m1, m0
+
+    movu   [dstq], m1
+
+    add    srcq, mmsize
+    add    dstq, mmsize
+    sub    tot_hq, 4
+    jg     .loop_h
+    lea    srcq, [srcq + 4*tot_hq]
+
+    add    r3, strideq
+    dec    tot_vd
+    jg     .loop_v
+
+    RET
+
+INIT_XMM sse4
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+%if ARCH_X86_64
+cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
+%else
+cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
+    %define  hd  r5mp
+%endif
+    shl      wd, 2
+    add    srcq, wq
+    neg      wq
+    mov     t2q, dstq
+    mov     t1q, wq
+    pxor     m2, m2
+    mova     m3, [clip_10bit]
+    mova     m4, [convert_to_unsigned_10bit]
+
+    .loop_h:
+    mov    dstq, t2q
+    mov      wq, t1q
+
+    .loop_w:
+    movu     m0, [srcq+wq+0*mmsize]
+    movu     m1, [srcq+wq+1*mmsize]
+
+    paddd    m0, m4
+    paddd    m1, m4
+    packusdw m0, m0, m1
+    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
+
+    movu     [dstq], m0
+
+    add      dstq, 1*mmsize
+    add      wq,   2*mmsize
+    jl       .loop_w
+
+    add    srcq, src_strideq
+    add     t2q, dst_strideq
+    sub      hd, 1
+    jg       .loop_h
+
+    RET
--- a/externals/ffmpeg/libavcodec/x86/diracdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/diracdsp_init.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/diracdsp.h"
+#include "fpel.h"
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+
+void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
+#if HAVE_X86ASM
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                             \
+    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int);               \
+    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int);                    \
+                                                                                             \
+    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,       \
+                                          const uint8_t *src, int stride, int width, int height)   \
+    {                                                                                        \
+        while( height-- )                                                                    \
+        {                                                                                    \
+            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width);                                \
+            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                               \
+                                                                                             \
+            dsth += stride;                                                                  \
+            dstv += stride;                                                                  \
+            dstc += stride;                                                                  \
+            src  += stride;                                                                  \
+        }                                                                                    \
+    }
+
+#define PIXFUNC(PFX, IDX, EXT)                                                   \
+    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/  \
+    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3) {\
+        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+    } else {\
+        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
+        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+    }\
+}
+
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_put_dirac_pixels16_c(dst, src, stride, h);
+    else
+        ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_avg_dirac_pixels16_c(dst, src, stride, h);
+    else
+        ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_put_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+        ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
+        ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_avg_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+        ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
+        ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+
+#else // HAVE_X86ASM
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                     \
+    void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,              \
+                                   const uint8_t *src, int stride, int width, int height);
+
+#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
+
+#endif // HAVE_X86ASM
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+void ff_diracdsp_init_x86(DiracDSPContext* c)
+{
+    int mm_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(mm_flags)) {
+        c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+        c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+        c->add_rect_clamped = ff_add_rect_clamped_mmx;
+        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
+#endif
+        PIXFUNC(put, 0, mmx);
+        PIXFUNC(avg, 0, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(mm_flags)) {
+        PIXFUNC(avg, 0, mmxext);
+    }
+
+    if (EXTERNAL_SSE2(mm_flags)) {
+        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+        c->add_rect_clamped = ff_add_rect_clamped_sse2;
+        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
+
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+
+        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+    }
+
+    if (EXTERNAL_SSE4(mm_flags)) {
+        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
+        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/dnxhdenc.asm
+++ b/externals/ffmpeg/libavcodec/x86/dnxhdenc.asm
@@ -0,0 +1,49 @@
+;************************************************************************
+;* VC3/DNxHD SIMD functions
+;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
+;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
+;                              ptrdiff_t line_size)
+INIT_XMM sse2
+cglobal get_pixels_8x4_sym, 3,3,5, block, pixels, linesize
+    pxor      m4,       m4
+    movq      m0,       [pixelsq]
+    add       pixelsq,  linesizeq
+    movq      m1,       [pixelsq]
+    movq      m2,       [pixelsq+linesizeq]
+    movq      m3,       [pixelsq+linesizeq*2]
+    punpcklbw m0,       m4
+    punpcklbw m1,       m4
+    punpcklbw m2,       m4
+    punpcklbw m3,       m4
+    mova  [blockq    ], m0
+    mova  [blockq+16 ], m1
+    mova  [blockq+32 ], m2
+    mova  [blockq+48 ], m3
+    mova  [blockq+64 ], m3
+    mova  [blockq+80 ], m2
+    mova  [blockq+96 ], m1
+    mova  [blockq+112], m0
+    RET
--- a/externals/ffmpeg/libavcodec/x86/dnxhdenc_init.c
+++ b/externals/ffmpeg/libavcodec/x86/dnxhdenc_init.c
@@ -0,0 +1,37 @@
+/*
+ * VC3/DNxHD SIMD functions
+ * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
+ *
+ * VC-3 encoder funded by the British Broadcasting Corporation
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dnxhdenc.h"
+
+void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
+                                ptrdiff_t line_size);
+
+av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
+{
+    if (EXTERNAL_SSE2(av_get_cpu_flags())) {
+        if (ctx->cid_table->bit_depth == 8)
+            ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/exrdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/exrdsp.asm
@@ -0,0 +1,118 @@
+;******************************************************************************
+;* X86 Optimized functions for Open Exr Decoder
+;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
+;*
+;* reorder_pixels, predictor based on patch by John Loy
+;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
+;*
+;* predictor AVX/AVX2 by Henrik Gramner
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pb_15
+cextern pb_80
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+;------------------------------------------------------------------------------
+
+%macro REORDER_PIXELS 0
+cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
+    lea                              src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
+    add                               dstq, sizeq         ; dst offset by size
+    shr                              sizeq, 1             ; half_size
+    add                              src1q, sizeq         ; offset src by half_size
+    neg                              sizeq                ; size = offset for dst, src1, src2
+.loop:
+
+    mova                                m0, [src1q+sizeq]        ; load first part
+    movu                                m1, [src2q+sizeq]        ; load second part
+    SBUTTERFLY bw, 0, 1, 2                                       ; interleaved
+    mova                 [dstq+2*sizeq   ], xm0                  ; copy to dst
+    mova                 [dstq+2*sizeq+16], xm1
+%if cpuflag(avx2)
+    vperm2i128                          m0, m0, m1, q0301
+    mova                 [dstq+2*sizeq+32], m0
+%endif
+    add     sizeq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+REORDER_PIXELS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+REORDER_PIXELS
+%endif
+
+
+;------------------------------------------------------------------------------
+; void ff_predictor(uint8_t *src, ptrdiff_t size);
+;------------------------------------------------------------------------------
+
+%macro PREDICTOR 0
+cglobal predictor, 2,2,5, src, size
+    mova             m0, [pb_80]
+    mova            xm1, [pb_15]
+    mova            xm2, xm0
+    add            srcq, sizeq
+    neg           sizeq
+.loop:
+    pxor             m3, m0, [srcq + sizeq]
+    pslldq           m4, m3, 1
+    paddb            m3, m4
+    pslldq           m4, m3, 2
+    paddb            m3, m4
+    pslldq           m4, m3, 4
+    paddb            m3, m4
+    pslldq           m4, m3, 8
+%if mmsize == 32
+    paddb            m3, m4
+    paddb           xm2, xm3
+    vextracti128    xm4, m3, 1
+    mova [srcq + sizeq], xm2
+    pshufb          xm2, xm1
+    paddb           xm2, xm4
+    mova [srcq + sizeq + 16], xm2
+%else
+    paddb            m2, m3
+    paddb            m2, m4
+    mova [srcq + sizeq], m2
+%endif
+    pshufb          xm2, xm1
+    add           sizeq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+PREDICTOR
+
+INIT_XMM avx
+PREDICTOR
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+PREDICTOR
+%endif
--- a/externals/ffmpeg/libavcodec/x86/exrdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/exrdsp_init.c
@@ -0,0 +1,52 @@
+/*
+ * OpenEXR (.exr) image decoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/exrdsp.h"
+
+void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+
+void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+
+void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);
+
+void ff_predictor_avx(uint8_t *src, ptrdiff_t size);
+
+void ff_predictor_avx2(uint8_t *src, ptrdiff_t size);
+
+av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        dsp->reorder_pixels = ff_reorder_pixels_sse2;
+    }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        dsp->predictor = ff_predictor_ssse3;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+        dsp->predictor = ff_predictor_avx;
+    }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        dsp->reorder_pixels = ff_reorder_pixels_avx2;
+        dsp->predictor      = ff_predictor_avx2;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/fdct.c
+++ b/externals/ffmpeg/libavcodec/x86/fdct.c
@@ -0,0 +1,594 @@
+/*
+ * SIMD-optimized forward DCT
+ * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
+ * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
+ *
+ * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
+ *
+ *  Intel Application Note AP-922 - fast, precise implementation of DCT
+ *        http://developer.intel.com/vtune/cbts/appnotes.htm
+ *
+ * Also of inspiration:
+ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
+ * Skal's fdct at http://skal.planet-d.net/coding/dct.html
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/x86/asm.h"
+#include "fdct.h"
+
+#if HAVE_MMX_INLINE
+
+//////////////////////////////////////////////////////////////////////
+//
+// constants for the forward DCT
+// -----------------------------
+//
+// Be sure to check that your compiler is aligning all constants to QWORD
+// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
+// severely stall MMX execution.
+//
+//////////////////////////////////////////////////////////////////////
+
+#define BITS_FRW_ACC   3 //; 2 or 3 for accuracy
+#define SHIFT_FRW_COL  BITS_FRW_ACC
+#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
+#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
+//#define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))
+
+#define X8(x) x,x,x,x,x,x,x,x
+
+//concatenated table, for forward DCT transformation
+DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
+    X8(13036),  // tg * (2<<16) + 0.5
+    X8(27146),  // tg * (2<<16) + 0.5
+    X8(-21746)  // tg * (2<<16) + 0.5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
+    X8(23170)   //cos * (2<<15) + 0.5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
+
+DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
+
+static const struct
+{
+ DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
+} fdct_r_row_sse2 =
+{{
+ RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
+}};
+//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
+
+DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = {  // forward_dct coeff table
+  16384,   16384,   22725,   19266,
+  16384,   16384,   12873,    4520,
+  21407,    8867,   19266,   -4520,
+  -8867,  -21407,  -22725,  -12873,
+  16384,  -16384,   12873,  -22725,
+ -16384,   16384,    4520,   19266,
+   8867,  -21407,    4520,  -12873,
+  21407,   -8867,   19266,  -22725,
+
+  22725,   22725,   31521,   26722,
+  22725,   22725,   17855,    6270,
+  29692,   12299,   26722,   -6270,
+ -12299,  -29692,  -31521,  -17855,
+  22725,  -22725,   17855,  -31521,
+ -22725,   22725,    6270,   26722,
+  12299,  -29692,    6270,  -17855,
+  29692,  -12299,   26722,  -31521,
+
+  21407,   21407,   29692,   25172,
+  21407,   21407,   16819,    5906,
+  27969,   11585,   25172,   -5906,
+ -11585,  -27969,  -29692,  -16819,
+  21407,  -21407,   16819,  -29692,
+ -21407,   21407,    5906,   25172,
+  11585,  -27969,    5906,  -16819,
+  27969,  -11585,   25172,  -29692,
+
+  19266,   19266,   26722,   22654,
+  19266,   19266,   15137,    5315,
+  25172,   10426,   22654,   -5315,
+ -10426,  -25172,  -26722,  -15137,
+  19266,  -19266,   15137,  -26722,
+ -19266,   19266,    5315,   22654,
+  10426,  -25172,    5315,  -15137,
+  25172,  -10426,   22654,  -26722,
+
+  16384,   16384,   22725,   19266,
+  16384,   16384,   12873,    4520,
+  21407,    8867,   19266,   -4520,
+  -8867,  -21407,  -22725,  -12873,
+  16384,  -16384,   12873,  -22725,
+ -16384,   16384,    4520,   19266,
+   8867,  -21407,    4520,  -12873,
+  21407,   -8867,   19266,  -22725,
+
+  19266,   19266,   26722,   22654,
+  19266,   19266,   15137,    5315,
+  25172,   10426,   22654,   -5315,
+ -10426,  -25172,  -26722,  -15137,
+  19266,  -19266,   15137,  -26722,
+ -19266,   19266,    5315,   22654,
+  10426,  -25172,    5315,  -15137,
+  25172,  -10426,   22654,  -26722,
+
+  21407,   21407,   29692,   25172,
+  21407,   21407,   16819,    5906,
+  27969,   11585,   25172,   -5906,
+ -11585,  -27969,  -29692,  -16819,
+  21407,  -21407,   16819,  -29692,
+ -21407,   21407,    5906,   25172,
+  11585,  -27969,    5906,  -16819,
+  27969,  -11585,   25172,  -29692,
+
+  22725,   22725,   31521,   26722,
+  22725,   22725,   17855,    6270,
+  29692,   12299,   26722,   -6270,
+ -12299,  -29692,  -31521,  -17855,
+  22725,  -22725,   17855,  -31521,
+ -22725,   22725,    6270,   26722,
+  12299,  -29692,    6270,  -17855,
+  29692,  -12299,   26722,  -31521,
+};
+
+static const struct
+{
+ DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
+} tab_frw_01234567_sse2 =
+{{
+//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = {  // forward_dct coeff table
+#define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
+                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
+                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
+                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
+// c1..c7 * cos(pi/4) * 2^15
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+}};
+
+#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
+
+#define FDCT_COL(cpu, mm, mov)\
+static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
+{\
+    __asm__ volatile (\
+        #mov"      16(%0),  %%"#mm"0 \n\t" \
+        #mov"      96(%0),  %%"#mm"1 \n\t" \
+        #mov"    %%"#mm"0,  %%"#mm"2 \n\t" \
+        #mov"      32(%0),  %%"#mm"3 \n\t" \
+        "paddsw  %%"#mm"1,  %%"#mm"0 \n\t" \
+        #mov"      80(%0),  %%"#mm"4 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
+        #mov"        (%0),  %%"#mm"5 \n\t" \
+        "paddsw  %%"#mm"3,  %%"#mm"4 \n\t" \
+        "paddsw   112(%0),  %%"#mm"5 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
+        #mov"    %%"#mm"0,  %%"#mm"6 \n\t" \
+        "psubsw  %%"#mm"1,  %%"#mm"2 \n\t" \
+        #mov"      16(%1),  %%"#mm"1 \n\t" \
+        "psubsw  %%"#mm"4,  %%"#mm"0 \n\t" \
+        #mov"      48(%0),  %%"#mm"7 \n\t" \
+        "pmulhw  %%"#mm"0,  %%"#mm"1 \n\t" \
+        "paddsw    64(%0),  %%"#mm"7 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
+        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
+        #mov"    %%"#mm"5,  %%"#mm"4 \n\t" \
+        "psubsw  %%"#mm"7,  %%"#mm"5 \n\t" \
+        "paddsw  %%"#mm"5,  %%"#mm"1 \n\t" \
+        "paddsw  %%"#mm"7,  %%"#mm"4 \n\t" \
+        "por         (%2),  %%"#mm"1 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
+        "pmulhw    16(%1),  %%"#mm"5 \n\t" \
+        #mov"    %%"#mm"4,  %%"#mm"7 \n\t" \
+        "psubsw    80(%0),  %%"#mm"3 \n\t" \
+        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
+        #mov"    %%"#mm"1,    32(%3) \n\t" \
+        "paddsw  %%"#mm"6,  %%"#mm"7 \n\t" \
+        #mov"      48(%0),  %%"#mm"1 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
+        "psubsw    64(%0),  %%"#mm"1 \n\t" \
+        #mov"    %%"#mm"2,  %%"#mm"6 \n\t" \
+        #mov"    %%"#mm"4,    64(%3) \n\t" \
+        "paddsw  %%"#mm"3,  %%"#mm"2 \n\t" \
+        "pmulhw      (%4),  %%"#mm"2 \n\t" \
+        "psubsw  %%"#mm"3,  %%"#mm"6 \n\t" \
+        "pmulhw      (%4),  %%"#mm"6 \n\t" \
+        "psubsw  %%"#mm"0,  %%"#mm"5 \n\t" \
+        "por         (%2),  %%"#mm"5 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
+        "por         (%2),  %%"#mm"2 \n\t" \
+        #mov"    %%"#mm"1,  %%"#mm"4 \n\t" \
+        #mov"        (%0),  %%"#mm"3 \n\t" \
+        "paddsw  %%"#mm"6,  %%"#mm"1 \n\t" \
+        "psubsw   112(%0),  %%"#mm"3 \n\t" \
+        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
+        #mov"        (%1),  %%"#mm"0 \n\t" \
+        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
+        #mov"      32(%1),  %%"#mm"6 \n\t" \
+        "pmulhw  %%"#mm"1,  %%"#mm"0 \n\t" \
+        #mov"    %%"#mm"7,      (%3) \n\t" \
+        "pmulhw  %%"#mm"4,  %%"#mm"6 \n\t" \
+        #mov"    %%"#mm"5,    96(%3) \n\t" \
+        #mov"    %%"#mm"3,  %%"#mm"7 \n\t" \
+        #mov"      32(%1),  %%"#mm"5 \n\t" \
+        "psubsw  %%"#mm"2,  %%"#mm"7 \n\t" \
+        "paddsw  %%"#mm"2,  %%"#mm"3 \n\t" \
+        "pmulhw  %%"#mm"7,  %%"#mm"5 \n\t" \
+        "paddsw  %%"#mm"3,  %%"#mm"0 \n\t" \
+        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
+        "pmulhw      (%1),  %%"#mm"3 \n\t" \
+        "por         (%2),  %%"#mm"0 \n\t" \
+        "paddsw  %%"#mm"7,  %%"#mm"5 \n\t" \
+        "psubsw  %%"#mm"6,  %%"#mm"7 \n\t" \
+        #mov"    %%"#mm"0,    16(%3) \n\t" \
+        "paddsw  %%"#mm"4,  %%"#mm"5 \n\t" \
+        #mov"    %%"#mm"7,    48(%3) \n\t" \
+        "psubsw  %%"#mm"1,  %%"#mm"3 \n\t" \
+        #mov"    %%"#mm"5,    80(%3) \n\t" \
+        #mov"    %%"#mm"3,   112(%3) \n\t" \
+        : \
+        : "r" (in  + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
+          "r" (out + offset), "r" (ocos_4_16)); \
+}
+
+FDCT_COL(mmx, mm, movq)
+FDCT_COL(sse2, xmm, movdqa)
+
+static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
+{
+    __asm__ volatile(
+#define FDCT_ROW_SSE2_H1(i,t)                    \
+        "movq      " #i "(%0), %%xmm2      \n\t" \
+        "movq      " #i "+8(%0), %%xmm0    \n\t" \
+        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
+        "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
+        "movdqa    " #t "(%1), %%xmm4      \n\t" \
+        "movdqa    " #t "+16(%1), %%xmm5   \n\t"
+
+#define FDCT_ROW_SSE2_H2(i,t)                    \
+        "movq      " #i "(%0), %%xmm2      \n\t" \
+        "movq      " #i "+8(%0), %%xmm0    \n\t" \
+        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
+        "movdqa    " #t "+48(%1), %%xmm7   \n\t"
+
+#define FDCT_ROW_SSE2(i)                      \
+        "movq      %%xmm2, %%xmm1       \n\t" \
+        "pshuflw   $27, %%xmm0, %%xmm0  \n\t" \
+        "paddsw    %%xmm0, %%xmm1       \n\t" \
+        "psubsw    %%xmm0, %%xmm2       \n\t" \
+        "punpckldq %%xmm2, %%xmm1       \n\t" \
+        "pshufd    $78, %%xmm1, %%xmm2  \n\t" \
+        "pmaddwd   %%xmm2, %%xmm3       \n\t" \
+        "pmaddwd   %%xmm1, %%xmm7       \n\t" \
+        "pmaddwd   %%xmm5, %%xmm2       \n\t" \
+        "pmaddwd   %%xmm4, %%xmm1       \n\t" \
+        "paddd     %%xmm7, %%xmm3       \n\t" \
+        "paddd     %%xmm2, %%xmm1       \n\t" \
+        "paddd     %%xmm6, %%xmm3       \n\t" \
+        "paddd     %%xmm6, %%xmm1       \n\t" \
+        "psrad     %3, %%xmm3           \n\t" \
+        "psrad     %3, %%xmm1           \n\t" \
+        "packssdw  %%xmm3, %%xmm1       \n\t" \
+        "movdqa    %%xmm1, " #i "(%4)   \n\t"
+
+        "movdqa    (%2), %%xmm6         \n\t"
+        FDCT_ROW_SSE2_H1(0,0)
+        FDCT_ROW_SSE2(0)
+        FDCT_ROW_SSE2_H2(64,0)
+        FDCT_ROW_SSE2(64)
+
+        FDCT_ROW_SSE2_H1(16,64)
+        FDCT_ROW_SSE2(16)
+        FDCT_ROW_SSE2_H2(112,64)
+        FDCT_ROW_SSE2(112)
+
+        FDCT_ROW_SSE2_H1(32,128)
+        FDCT_ROW_SSE2(32)
+        FDCT_ROW_SSE2_H2(96,128)
+        FDCT_ROW_SSE2(96)
+
+        FDCT_ROW_SSE2_H1(48,192)
+        FDCT_ROW_SSE2(48)
+        FDCT_ROW_SSE2_H2(80,192)
+        FDCT_ROW_SSE2(80)
+        :
+        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
+          "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
+                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+    );
+}
+
+static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
+                                             const int16_t *table)
+{
+    __asm__ volatile (
+        "pshufw    $0x1B, 8(%0), %%mm5 \n\t"
+        "movq       (%0), %%mm0 \n\t"
+        "movq      %%mm0, %%mm1 \n\t"
+        "paddsw    %%mm5, %%mm0 \n\t"
+        "psubsw    %%mm5, %%mm1 \n\t"
+        "movq      %%mm0, %%mm2 \n\t"
+        "punpckldq %%mm1, %%mm0 \n\t"
+        "punpckhdq %%mm1, %%mm2 \n\t"
+        "movq       (%1), %%mm1 \n\t"
+        "movq      8(%1), %%mm3 \n\t"
+        "movq     16(%1), %%mm4 \n\t"
+        "movq     24(%1), %%mm5 \n\t"
+        "movq     32(%1), %%mm6 \n\t"
+        "movq     40(%1), %%mm7 \n\t"
+        "pmaddwd   %%mm0, %%mm1 \n\t"
+        "pmaddwd   %%mm2, %%mm3 \n\t"
+        "pmaddwd   %%mm0, %%mm4 \n\t"
+        "pmaddwd   %%mm2, %%mm5 \n\t"
+        "pmaddwd   %%mm0, %%mm6 \n\t"
+        "pmaddwd   %%mm2, %%mm7 \n\t"
+        "pmaddwd  48(%1), %%mm0 \n\t"
+        "pmaddwd  56(%1), %%mm2 \n\t"
+        "paddd     %%mm1, %%mm3 \n\t"
+        "paddd     %%mm4, %%mm5 \n\t"
+        "paddd     %%mm6, %%mm7 \n\t"
+        "paddd     %%mm0, %%mm2 \n\t"
+        "movq       (%2), %%mm0 \n\t"
+        "paddd     %%mm0, %%mm3 \n\t"
+        "paddd     %%mm0, %%mm5 \n\t"
+        "paddd     %%mm0, %%mm7 \n\t"
+        "paddd     %%mm0, %%mm2 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
+        "packssdw  %%mm5, %%mm3 \n\t"
+        "packssdw  %%mm2, %%mm7 \n\t"
+        "movq      %%mm3,  (%3) \n\t"
+        "movq      %%mm7, 8(%3) \n\t"
+        :
+        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
+}
+
+static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
+{
+    //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
+    __asm__ volatile(
+        "movd     12(%0), %%mm1 \n\t"
+        "punpcklwd 8(%0), %%mm1 \n\t"
+        "movq      %%mm1, %%mm2 \n\t"
+        "psrlq     $0x20, %%mm1 \n\t"
+        "movq      0(%0), %%mm0 \n\t"
+        "punpcklwd %%mm2, %%mm1 \n\t"
+        "movq      %%mm0, %%mm5 \n\t"
+        "paddsw    %%mm1, %%mm0 \n\t"
+        "psubsw    %%mm1, %%mm5 \n\t"
+        "movq      %%mm0, %%mm2 \n\t"
+        "punpckldq %%mm5, %%mm0 \n\t"
+        "punpckhdq %%mm5, %%mm2 \n\t"
+        "movq      0(%1), %%mm1 \n\t"
+        "movq      8(%1), %%mm3 \n\t"
+        "movq     16(%1), %%mm4 \n\t"
+        "movq     24(%1), %%mm5 \n\t"
+        "movq     32(%1), %%mm6 \n\t"
+        "movq     40(%1), %%mm7 \n\t"
+        "pmaddwd   %%mm0, %%mm1 \n\t"
+        "pmaddwd   %%mm2, %%mm3 \n\t"
+        "pmaddwd   %%mm0, %%mm4 \n\t"
+        "pmaddwd   %%mm2, %%mm5 \n\t"
+        "pmaddwd   %%mm0, %%mm6 \n\t"
+        "pmaddwd   %%mm2, %%mm7 \n\t"
+        "pmaddwd  48(%1), %%mm0 \n\t"
+        "pmaddwd  56(%1), %%mm2 \n\t"
+        "paddd     %%mm1, %%mm3 \n\t"
+        "paddd     %%mm4, %%mm5 \n\t"
+        "paddd     %%mm6, %%mm7 \n\t"
+        "paddd     %%mm0, %%mm2 \n\t"
+        "movq       (%2), %%mm0 \n\t"
+        "paddd     %%mm0, %%mm3 \n\t"
+        "paddd     %%mm0, %%mm5 \n\t"
+        "paddd     %%mm0, %%mm7 \n\t"
+        "paddd     %%mm0, %%mm2 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
+        "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
+        "packssdw  %%mm5, %%mm3 \n\t"
+        "packssdw  %%mm2, %%mm7 \n\t"
+        "movq      %%mm3, 0(%3) \n\t"
+        "movq      %%mm7, 8(%3) \n\t"
+        :
+        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
+}
+
+void ff_fdct_mmx(int16_t *block)
+{
+    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+    int16_t * block1= (int16_t*)align_tmp;
+    const int16_t *table= tab_frw_01234567;
+    int i;
+
+    fdct_col_mmx(block, block1, 0);
+    fdct_col_mmx(block, block1, 4);
+
+    for(i=8;i>0;i--) {
+        fdct_row_mmx(block1, block, table);
+        block1 += 8;
+        table += 32;
+        block += 8;
+    }
+}
+
+#endif /* HAVE_MMX_INLINE */
+
+#if HAVE_MMXEXT_INLINE
+
+void ff_fdct_mmxext(int16_t *block)
+{
+    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+    int16_t *block1= (int16_t*)align_tmp;
+    const int16_t *table= tab_frw_01234567;
+    int i;
+
+    fdct_col_mmx(block, block1, 0);
+    fdct_col_mmx(block, block1, 4);
+
+    for(i=8;i>0;i--) {
+        fdct_row_mmxext(block1, block, table);
+        block1 += 8;
+        table += 32;
+        block += 8;
+    }
+}
+
+#endif /* HAVE_MMXEXT_INLINE */
+
+#if HAVE_SSE2_INLINE
+
+void ff_fdct_sse2(int16_t *block)
+{
+    DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
+    int16_t * const block1= (int16_t*)align_tmp;
+
+    fdct_col_sse2(block, block1, 0);
+    fdct_row_sse2(block1, block);
+}
+
+#endif /* HAVE_SSE2_INLINE */
--- a/externals/ffmpeg/libavcodec/x86/fdct.h
+++ b/externals/ffmpeg/libavcodec/x86/fdct.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_FDCT_H
+#define AVCODEC_X86_FDCT_H
+
+#include <stdint.h>
+
+void ff_fdct_mmx(int16_t *block);
+void ff_fdct_mmxext(int16_t *block);
+void ff_fdct_sse2(int16_t *block);
+
+#endif /* AVCODEC_X86_FDCT_H */
--- a/externals/ffmpeg/libavcodec/x86/fdctdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/fdctdsp_init.c
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fdctdsp.h"
+#include "fdct.h"
+
+av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
+                                 unsigned high_bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+    const int dct_algo = avctx->dct_algo;
+
+    if (!high_bit_depth) {
+        if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) {
+            if (INLINE_MMX(cpu_flags))
+                c->fdct = ff_fdct_mmx;
+
+            if (INLINE_MMXEXT(cpu_flags))
+                c->fdct = ff_fdct_mmxext;
+
+            if (INLINE_SSE2(cpu_flags))
+                c->fdct = ff_fdct_sse2;
+        }
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/fft.asm
+++ b/externals/ffmpeg/libavcodec/x86/fft.asm
--- a/externals/ffmpeg/libavcodec/x86/fft.h
+++ b/externals/ffmpeg/libavcodec/x86/fft.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_FFT_H
+#define AVCODEC_X86_FFT_H
+
+#include "libavcodec/fft.h"
+
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
+
+void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+#endif /* AVCODEC_X86_FFT_H */
--- a/externals/ffmpeg/libavcodec/x86/fft_init.c
+++ b/externals/ffmpeg/libavcodec/x86/fft_init.c
@@ -0,0 +1,61 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+
+#include "fft.h"
+
+av_cold void ff_fft_init_x86(FFTContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (s->nbits > 16)
+        return;
+
+#if ARCH_X86_32
+    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+        s->imdct_calc = ff_imdct_calc_3dnow;
+        s->imdct_half = ff_imdct_half_3dnow;
+        s->fft_calc   = ff_fft_calc_3dnow;
+    }
+
+    if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+        s->imdct_calc = ff_imdct_calc_3dnowext;
+        s->imdct_half = ff_imdct_half_3dnowext;
+        s->fft_calc   = ff_fft_calc_3dnowext;
+    }
+#endif /* ARCH_X86_32 */
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->imdct_calc  = ff_imdct_calc_sse;
+        s->imdct_half  = ff_imdct_half_sse;
+        s->fft_permute = ff_fft_permute_sse;
+        s->fft_calc    = ff_fft_calc_sse;
+        s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+    }
+
+    if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
+        s->imdct_half      = ff_imdct_half_avx;
+        s->fft_calc        = ff_fft_calc_avx;
+        s->fft_permutation = FF_FFT_PERM_AVX;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/flac_dsp_gpl.asm
+++ b/externals/ffmpeg/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd
+%else
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 2, 5
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter]
+    movu [resq+iter],  m0
+    %assign iter iter+mmsize
+%endrep
+
+lea  resq,   [resq+orderq*4]
+lea  smpq,   [smpq+orderq*4]
+lea  coefsq, [coefsq+orderq*4]
+sub  length,  orderd
+movd m3,      r5m
+neg  orderq
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0,   m0
+    pxor m4,   m4
+    pxor m6,   m6
+    mov  posj, orderq
+    xor  negj, negj
+
+    .looporder:
+        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
+        pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
+        paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
+
+        dec    negj
+        inc    posj
+    jnz .looporder
+
+    psrad  m0,     m3              ; p >>= shift
+    psrad  m4,     m3
+    psrad  m6,     m3
+    movu   m1,    [smpq]
+    movu   m5,    [smpq+mmsize]
+    movu   m7,    [smpq+mmsize*2]
+    psubd  m1,     m0              ; smp[i] - p
+    psubd  m5,     m4
+    psubd  m7,     m6
+    movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
+
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4
+jg .looplen
+RET
--- a/externals/ffmpeg/libavcodec/x86/flacdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,313 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro PMACSDQL 5
+%if cpuflag(xop)
+    pmacsdql %1, %2, %3, %1
+%else
+    pmuldq   %2, %3
+    paddq    %1, %2
+%endif
+%endmacro
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+    sub    lend, pred_orderd
+    jle .ret
+    lea    decodedq, [decodedq+pred_orderq*4-8]
+    lea    coeffsq, [coeffsq+pred_orderq*4]
+    neg    pred_orderq
+    movd   m4, qlevelm
+ALIGN 16
+.loop_sample:
+    movd   m0, [decodedq+pred_orderq*4+8]
+    add    decodedq, 8
+    movd   m1, [coeffsq+pred_orderq*4]
+    pxor   m2, m2
+    pxor   m3, m3
+    lea    jq, [pred_orderq+1]
+    test   jq, jq
+    jz .end_order
+.loop_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    movd   m0, [decodedq+jq*4]
+    PMACSDQL m3, m1, m0, m3, m1
+    movd   m1, [coeffsq+jq*4]
+    inc    jq
+    jl .loop_order
+.end_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    psrlq  m2, m4
+    movd   m0, [decodedq]
+    paddd  m0, m2
+    movd   [decodedq], m0
+    sub  lend, 2
+    jl .ret
+    PMACSDQL m3, m1, m0, m3, m1
+    psrlq  m3, m4
+    movd   m1, [decodedq+4]
+    paddd  m1, m3
+    movd   [decodedq+4], m1
+    jg .loop_sample
+.ret:
+    REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+;                                                   int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm
+%endif
+    movd       m3, r4m
+    shl      lend, 2
+    mov      in1q, [in0q + gprsize]
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+    add      in1q, lenq
+    add      in0q, lenq
+    add      outq, lenq
+    neg      lenq
+
+align 16
+.loop:
+    mova       m0, [in0q + lenq]
+    mova       m1, [in1q + lenq]
+%ifidn %1, ms
+    psrad      m2, m1, 1
+    psubd      m0, m2
+%endif
+%ifnidn %1, indep2
+    p%4d       m2, m0, m1
+%endif
+    packssdw  m%2, m%2
+    packssdw  m%3, m%3
+    punpcklwd m%2, m%3
+    psllw     m%2, m3
+    mova [outq + lenq], m%2
+    add      lenq, 16
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+;                                        int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm
+%endif
+    movd       m3, r4m
+    mov      in1q, [in0q + gprsize]
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+    sub      in1q, in0q
+
+align 16
+.loop:
+    mova       m0, [in0q]
+    mova       m1, [in0q + in1q]
+%ifidn %1, ms
+    psrad      m2, m1, 1
+    psubd      m0, m2
+%endif
+    p%5d       m2, m0, m1
+    pslld     m%2, m3
+    pslld     m%3, m3
+
+    SBUTTERFLY dq, %2, %3, %4
+
+    mova  [outq         ], m%2
+    mova  [outq + mmsize], m%3
+
+    add      in0q, mmsize
+    add      outq, mmsize*2
+    sub      lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
+
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+;                                            int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
+    %define  lend  dword r3m
+%else
+    mov      lend, lenm
+%endif
+%endif
+    movd      m%3, r4m
+
+%assign %%i 1
+%rep %2-1
+    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
+%assign %%i %%i+1
+%endrep
+
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+
+%assign %%i 1
+%rep %2-1
+    sub      in %+ %%i %+ q, in0q
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+    mova       m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+    SBUTTERFLY dq, 0, 1, 6
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+
+    punpcklqdq m6, m0, m2
+    punpckhqdq m2, m4
+    shufps     m4, m0, 0xe4
+    punpcklqdq m0, m1, m3
+    punpckhqdq m3, m5
+    shufps     m5, m1, 0xe4
+    SWAP 0,6,1,4,5,3
+%elif %2 == 4
+    TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+    packssdw   m0, [in0q + in4q]
+    packssdw   m1, [in0q + in5q]
+    packssdw   m2, [in0q + in6q]
+    packssdw   m3, [in0q + in7q]
+    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+    packssdw   m0, [in0q + in3q]
+    packssdw   m1, [in0q + in4q]
+    packssdw   m2, [in0q + in5q]
+    pshufd     m3, m0,     q1032
+    punpcklwd  m0, m1
+    punpckhwd  m1, m2
+    punpcklwd  m2, m3
+
+    shufps     m3, m0, m2, q2020
+    shufps     m0, m1,     q2031
+    shufps     m2, m1,     q3131
+    shufps     m1, m2, m3, q3120
+    shufps     m3, m0,     q0220
+    shufps     m0, m2,     q3113
+    SWAP 2, 0, 3
+%else ; %2 == 4
+    packssdw   m0, [in0q + in2q]
+    packssdw   m1, [in0q + in3q]
+    SBUTTERFLY wd, 0, 1, 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+    psll%4   m %+ %%i, m%3
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+    mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+    add      in0q, mmsize
+    add      outq, mmsize*REPCOUNT
+    sub      lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
--- a/externals/ffmpeg/libavcodec/x86/flacdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+                         int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+                        int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+#define DECORRELATE_FUNCS(fmt, opt)                                                      \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                             int len, int shift);                        \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16,  avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32,  avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+                                 int bps)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+        }
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_sse4;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+        }
+    }
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_xop;
+    }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        if (CONFIG_GPL)
+            c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+    }
+#endif
+#endif /* HAVE_X86ASM */
+}
--- a/externals/ffmpeg/libavcodec/x86/fmtconvert.asm
+++ b/externals/ffmpeg/libavcodec/x86/fmtconvert.asm
@@ -0,0 +1,124 @@
+;******************************************************************************
+;* x86 optimized Format Conversion Utils
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
+;                                    int len);
+;------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_SCALAR 1
+%if UNIX64
+cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
+%else
+cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
+%endif
+%if WIN64
+    SWAP 0, 2
+%elif ARCH_X86_32
+    movss   m0, mulm
+%endif
+    SPLATD  m0
+    shl     lend, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+%if cpuflag(sse2)
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     lenq, 32
+    jl .loop
+%if notcpuflag(sse2)
+    ;; cvtpi2ps switches to MMX even if the source is a memory location
+    ;; possible an error in documentation since every tested CPU disagrees with
+    ;; that. Use emms anyway since the vast majority of machines will use the
+    ;; SSE2 variant
+    emms
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_SCALAR 5
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_SCALAR 3
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+;                                    const float *mul, int len);
+;------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+    shl     lend, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+    movss     m0, [mulq]
+    SPLATD    m0
+%if cpuflag(sse2)
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     mulq, 4
+    add     lenq, 32
+    jl .loop
+%if notcpuflag(sse2)
+    ;; cvtpi2ps switches to MMX even if the source is a memory location
+    ;; possible an error in documentation since every tested CPU disagrees with
+    ;; that. Use emms anyway since the vast majority of machines will use the
+    ;; SSE2 variant
+    emms
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
+
--- a/externals/ffmpeg/libavcodec/x86/fmtconvert_init.c
+++ b/externals/ffmpeg/libavcodec/x86/fmtconvert_init.c
@@ -0,0 +1,55 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/fmtconvert.h"
+
+#if HAVE_X86ASM
+
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
+
+#endif /* HAVE_X86ASM */
+
+av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
+    }
+#endif /* HAVE_X86ASM */
+}
--- a/externals/ffmpeg/libavcodec/x86/fpel.asm
+++ b/externals/ffmpeg/libavcodec/x86/fpel.asm
@@ -0,0 +1,106 @@
+;******************************************************************************
+;* SIMD-optimized fullpel functions
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro PAVGB_MMX 4
+    LOAD   %3, %1
+    por    %3, %2
+    pxor   %2, %1
+    pand   %2, %4
+    psrlq  %2, 1
+    psubb  %3, %2
+    SWAP   %2, %3
+%endmacro
+
+; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
+;                        ptrdiff_t line_size, int h)
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN  mmsize
+%else
+%define LOAD movu
+%define SAVE mova
+%define LEN  %2
+%endif
+cglobal %1_pixels%2, 4,5,4
+    lea          r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+    pcmpeqd      m6, m6
+    paddb        m6, m6
+%endif
+%endif
+.loop:
+%assign %%i 0
+%rep LEN/mmsize
+    LOAD         m0, [r1 + %%i]
+    LOAD         m1, [r1+r2 + %%i]
+    LOAD         m2, [r1+r2*2 + %%i]
+    LOAD         m3, [r1+r4 + %%i]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+    PAVGB_MMX    [r0 + %%i], m0, m4, m6
+    PAVGB_MMX    [r0+r2 + %%i], m1, m5, m6
+    PAVGB_MMX    [r0+r2*2 + %%i], m2, m4, m6
+    PAVGB_MMX    [r0+r4 + %%i], m3, m5, m6
+%else
+    pavgb        m0, [r0 + %%i]
+    pavgb        m1, [r0+r2 + %%i]
+    pavgb        m2, [r0+r2*2 + %%i]
+    pavgb        m3, [r0+r4 + %%i]
+%endif
+%endif
+    SAVE       [r0 + %%i], m0
+    SAVE    [r0+r2 + %%i], m1
+    SAVE  [r0+r2*2 + %%i], m2
+    SAVE    [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
+    sub         r3d, 4
+    lea          r1, [r1+r2*4]
+    lea          r0, [r0+r2*4]
+    jne       .loop
+    RET
+%endmacro
+
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
+
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
+
+INIT_XMM sse2
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
--- a/externals/ffmpeg/libavcodec/x86/fpel.h
+++ b/externals/ffmpeg/libavcodec/x86/fpel.h
@@ -0,0 +1,49 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_FPEL_H
+#define AVCODEC_X86_FPEL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);
+void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
+void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+                          ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h);
+void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+                          ptrdiff_t line_size, int h);
+
+
+#endif /* AVCODEC_X86_FPEL_H */
--- a/externals/ffmpeg/libavcodec/x86/g722dsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/g722dsp.asm
@@ -0,0 +1,54 @@
+;******************************************************************************
+;* SIMD optimized DSP functions for G722 coding
+;*
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_qmf_coeffs:  dw   3, -210,  -11, -805,  -11,  951,  53, 3876
+pw_qmf_coeffs2: dw  12, 3876, -156,  951,   32, -805, 362, -210
+pw_qmf_coeffs3: dw 362,    0 ,  32,    0, -156,    0,  12,    0
+pw_qmf_coeffs4: dw  53,    0,  -11,    0,  -11,    0,   3,    0
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal g722_apply_qmf, 2, 2, 5, prev, out
+    movu m0, [prevq+mmsize*0]
+    movu m1, [prevq+mmsize*1]
+    movu m2, [prevq+mmsize*2]
+    punpcklwd m3, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m4, m2, m2
+    punpckhwd m2, m2
+    pmaddwd   m3, [pw_qmf_coeffs ]
+    pmaddwd   m0, [pw_qmf_coeffs2]
+    pmaddwd   m4, [pw_qmf_coeffs3]
+    pmaddwd   m2, [pw_qmf_coeffs4]
+    paddd     m0, m3
+    paddd     m2, m4
+    paddd     m0, m2
+    pshufd    m2, m0, q0032
+    paddd     m0, m2
+    pshufd    m0, m0, q0001
+    movq  [outq], m0
+    RET
--- a/externals/ffmpeg/libavcodec/x86/g722dsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/g722dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/g722dsp.h"
+
+void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
+
+av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
+}
--- a/externals/ffmpeg/libavcodec/x86/h263_loopfilter.asm
+++ b/externals/ffmpeg/libavcodec/x86/h263_loopfilter.asm
@@ -0,0 +1,189 @@
+;******************************************************************************
+;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pb_FC
+cextern h263_loop_filter_strength
+
+SECTION .text
+
+%macro H263_LOOP_FILTER 5
+    pxor         m7, m7
+    mova         m0, [%1]
+    mova         m1, [%1]
+    mova         m2, [%4]
+    mova         m3, [%4]
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    punpcklbw    m2, m7
+    punpckhbw    m3, m7
+    psubw        m0, m2
+    psubw        m1, m3
+    mova         m2, [%2]
+    mova         m3, [%2]
+    mova         m4, [%3]
+    mova         m5, [%3]
+    punpcklbw    m2, m7
+    punpckhbw    m3, m7
+    punpcklbw    m4, m7
+    punpckhbw    m5, m7
+    psubw        m4, m2
+    psubw        m5, m3
+    psllw        m4, 2
+    psllw        m5, 2
+    paddw        m4, m0
+    paddw        m5, m1
+    pxor         m6, m6
+    pcmpgtw      m6, m4
+    pcmpgtw      m7, m5
+    pxor         m4, m6
+    pxor         m5, m7
+    psubw        m4, m6
+    psubw        m5, m7
+    psrlw        m4, 3
+    psrlw        m5, 3
+    packuswb     m4, m5
+    packsswb     m6, m7
+    pxor         m7, m7
+    movd         m2, %5
+    punpcklbw    m2, m2
+    punpcklbw    m2, m2
+    punpcklbw    m2, m2
+    psubusb      m2, m4
+    mova         m3, m2
+    psubusb      m3, m4
+    psubb        m2, m3
+    mova         m3, [%2]
+    mova         m4, [%3]
+    pxor         m3, m6
+    pxor         m4, m6
+    paddusb      m3, m2
+    psubusb      m4, m2
+    pxor         m3, m6
+    pxor         m4, m6
+    paddusb      m2, m2
+    packsswb     m0, m1
+    pcmpgtb      m7, m0
+    pxor         m0, m7
+    psubb        m0, m7
+    mova         m1, m0
+    psubusb      m0, m2
+    psubb        m1, m0
+    pand         m1, [pb_FC]
+    psrlw        m1, 2
+    pxor         m1, m7
+    psubb        m1, m7
+    mova         m5, [%1]
+    mova         m6, [%4]
+    psubb        m5, m1
+    paddb        m6, m1
+%endmacro
+
+INIT_MMX mmx
+; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
+cglobal h263_v_loop_filter, 3,5
+    movsxdifnidn r1, r1d
+    movsxdifnidn r2, r2d
+
+    lea          r4, [h263_loop_filter_strength]
+    movzx       r3d, BYTE [r4+r2]
+    movsx        r2, r3b
+    shl          r2, 1
+
+    mov          r3, r0
+    sub          r3, r1
+    mov          r4, r3
+    sub          r4, r1
+    H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
+
+    mova       [r3], m3
+    mova       [r0], m4
+    mova       [r4], m5
+    mova    [r0+r1], m6
+    RET
+
+%macro TRANSPOSE4X4 2
+    movd      m0, [%1]
+    movd      m1, [%1+r1]
+    movd      m2, [%1+r1*2]
+    movd      m3, [%1+r3]
+    punpcklbw m0, m1
+    punpcklbw m2, m3
+    mova      m1, m0
+    punpcklwd m0, m2
+    punpckhwd m1, m2
+    movd [%2+ 0], m0
+    punpckhdq m0, m0
+    movd [%2+ 8], m0
+    movd [%2+16], m1
+    punpckhdq m1, m1
+    movd [%2+24], m1
+%endmacro
+
+
+; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
+INIT_MMX mmx
+cglobal h263_h_loop_filter, 3,5,0,32
+    movsxdifnidn r1, r1d
+    movsxdifnidn r2, r2d
+
+    lea          r4, [h263_loop_filter_strength]
+    movzx       r3d, BYTE [r4+r2]
+    movsx        r2, r3b
+    shl          r2, 1
+
+    sub          r0, 2
+    lea          r3, [r1*3]
+
+    TRANSPOSE4X4 r0, rsp
+    lea          r4, [r0+r1*4]
+    TRANSPOSE4X4 r4, rsp+4
+
+    H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
+
+    mova         m1, m5
+    mova         m0, m4
+    punpcklbw    m5, m3
+    punpcklbw    m4, m6
+    punpckhbw    m1, m3
+    punpckhbw    m0, m6
+    mova         m3, m5
+    mova         m6, m1
+    punpcklwd    m5, m4
+    punpcklwd    m1, m0
+    punpckhwd    m3, m4
+    punpckhwd    m6, m0
+    movd       [r0], m5
+    punpckhdq    m5, m5
+    movd  [r0+r1*1], m5
+    movd  [r0+r1*2], m3
+    punpckhdq    m3, m3
+    movd    [r0+r3], m3
+    movd       [r4], m1
+    punpckhdq    m1, m1
+    movd  [r4+r1*1], m1
+    movd  [r4+r1*2], m6
+    punpckhdq    m6, m6
+    movd    [r4+r3], m6
+    RET
--- a/externals/ffmpeg/libavcodec/x86/h263dsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/h263dsp_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/h263dsp.h"
+
+void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
+void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
+
+av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
+        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/h264_cabac.c
+++ b/externals/ffmpeg/libavcodec/x86/h264_cabac.c
@@ -0,0 +1,208 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG-4 part10 codec.
+ * non-SIMD x86-specific optimizations for H.264
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include <stddef.h>
+
+#include "libavcodec/cabac.h"
+#include "cabac.h"
+
+#if HAVE_INLINE_ASM
+
+#if ARCH_X86_64
+#define REG64 "r"
+#else
+#define REG64 "m"
+#endif
+
+//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
+//as that would make optimization work hard)
+#if HAVE_7REGS && !BROKEN_COMPILER
+#define decode_significance decode_significance_x86
+static int decode_significance_x86(CABACContext *c, int max_coeff,
+                                   uint8_t *significant_coeff_ctx_base,
+                                   int *index, x86_reg last_off){
+    void *end= significant_coeff_ctx_base + max_coeff - 1;
+    int minusstart= -(intptr_t)significant_coeff_ctx_base;
+    int minusindex= 4-(intptr_t)index;
+    int bit;
+    x86_reg coeff_count;
+
+#ifdef BROKEN_RELOCATIONS
+    void *tables;
+
+    __asm__ volatile(
+        "lea   "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
+        : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
+    );
+#endif
+
+    __asm__ volatile(
+        "3:                                     \n\t"
+
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%c11(%6)", "%c12(%6)",
+                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+                             "%13")
+
+        "test $1, %4                            \n\t"
+        " jz 4f                                 \n\t"
+        "add  %10, %1                           \n\t"
+
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%c11(%6)", "%c12(%6)",
+                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+                             "%13")
+
+        "sub  %10, %1                           \n\t"
+        "mov  %2, %0                            \n\t"
+        "movl %7, %%ecx                         \n\t"
+        "add  %1, %%"FF_REG_c"                  \n\t"
+        "movl %%ecx, (%0)                       \n\t"
+
+        "test $1, %4                            \n\t"
+        " jnz 5f                                \n\t"
+
+        "add"FF_OPSIZE"  $4, %2                 \n\t"
+
+        "4:                                     \n\t"
+        "add  $1, %1                            \n\t"
+        "cmp  %8, %1                            \n\t"
+        " jb 3b                                 \n\t"
+        "mov  %2, %0                            \n\t"
+        "movl %7, %%ecx                         \n\t"
+        "add  %1, %%"FF_REG_c"                  \n\t"
+        "movl %%ecx, (%0)                       \n\t"
+        "5:                                     \n\t"
+        "add  %9, %k0                           \n\t"
+        "shr $2, %k0                            \n\t"
+        : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
+          "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
+        : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, bytestream_end))
+          TABLES_ARG
+        : "%"FF_REG_c, "memory"
+    );
+    return coeff_count;
+}
+
+#define decode_significance_8x8 decode_significance_8x8_x86
+static int decode_significance_8x8_x86(CABACContext *c,
+                                       uint8_t *significant_coeff_ctx_base,
+                                       int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
+    int minusindex= 4-(intptr_t)index;
+    int bit;
+    x86_reg coeff_count;
+    x86_reg last=0;
+    x86_reg state;
+
+#ifdef BROKEN_RELOCATIONS
+    void *tables;
+
+    __asm__ volatile(
+        "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
+        : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
+    );
+#endif
+
+    __asm__ volatile(
+        "mov %1, %6                             \n\t"
+        "3:                                     \n\t"
+
+        "mov %10, %0                            \n\t"
+        "movzb (%0, %6), %6                     \n\t"
+        "add %9, %6                             \n\t"
+
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%c12(%7)", "%c13(%7)",
+                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+                             "%15")
+
+        "mov %1, %6                             \n\t"
+        "test $1, %4                            \n\t"
+        " jz 4f                                 \n\t"
+
+#ifdef BROKEN_RELOCATIONS
+        "movzb %c14(%15, %q6), %6\n\t"
+#else
+        "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
+#endif
+        "add %11, %6                            \n\t"
+
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%c12(%7)", "%c13(%7)",
+                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+                             "%15")
+
+        "mov %2, %0                             \n\t"
+        "mov %1, %6                             \n\t"
+        "mov %k6, (%0)                          \n\t"
+
+        "test $1, %4                            \n\t"
+        " jnz 5f                                \n\t"
+
+        "add"FF_OPSIZE"  $4, %2                 \n\t"
+
+        "4:                                     \n\t"
+        "add $1, %6                             \n\t"
+        "mov %6, %1                             \n\t"
+        "cmp $63, %6                            \n\t"
+        " jb 3b                                 \n\t"
+        "mov %2, %0                             \n\t"
+        "mov %k6, (%0)                          \n\t"
+        "5:                                     \n\t"
+        "addl %8, %k0                           \n\t"
+        "shr $2, %k0                            \n\t"
+        : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
+          "=&r"(bit), "+&r"(c->range), "=&r"(state)
+        : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
+          REG64(sig_off), REG64(last_coeff_ctx_base),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, bytestream_end)),
+          "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
+        : "%"FF_REG_c, "memory"
+    );
+    return coeff_count;
+}
+#endif /* HAVE_7REGS && BROKEN_COMPILER */
+
+#endif /* HAVE_INLINE_ASM */
--- a/externals/ffmpeg/libavcodec/x86/h264_chromamc.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_chromamc.asm
@@ -0,0 +1,663 @@
+;******************************************************************************
+;* MMX/SSSE3-optimized functions for H.264 chroma MC
+;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
+;*               2005-2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+rnd_rv40_2d_tbl: times 4 dw  0
+                 times 4 dw 16
+                 times 4 dw 32
+                 times 4 dw 16
+                 times 4 dw 32
+                 times 4 dw 28
+                 times 4 dw 32
+                 times 4 dw 28
+                 times 4 dw  0
+                 times 4 dw 32
+                 times 4 dw 16
+                 times 4 dw 32
+                 times 4 dw 32
+                 times 4 dw 28
+                 times 4 dw 32
+                 times 4 dw 28
+rnd_rv40_1d_tbl: times 4 dw  0
+                 times 4 dw  2
+                 times 4 dw  4
+                 times 4 dw  2
+                 times 4 dw  4
+                 times 4 dw  3
+                 times 4 dw  4
+                 times 4 dw  3
+                 times 4 dw  0
+                 times 4 dw  4
+                 times 4 dw  2
+                 times 4 dw  4
+                 times 4 dw  4
+                 times 4 dw  3
+                 times 4 dw  4
+                 times 4 dw  3
+
+cextern pw_3
+cextern pw_4
+cextern pw_8
+pw_28: times 8 dw 28
+cextern pw_32
+cextern pw_64
+
+SECTION .text
+
+%macro mv0_pixels_mc8 0
+    lea           r4, [r2*2 ]
+.next4rows:
+    movq         mm0, [r1   ]
+    movq         mm1, [r1+r2]
+    add           r1, r4
+    CHROMAMC_AVG mm0, [r0   ]
+    CHROMAMC_AVG mm1, [r0+r2]
+    movq     [r0   ], mm0
+    movq     [r0+r2], mm1
+    add           r0, r4
+    movq         mm0, [r1   ]
+    movq         mm1, [r1+r2]
+    add           r1, r4
+    CHROMAMC_AVG mm0, [r0   ]
+    CHROMAMC_AVG mm1, [r0+r2]
+    movq     [r0   ], mm0
+    movq     [r0+r2], mm1
+    add           r0, r4
+    sub          r3d, 4
+    jne .next4rows
+%endmacro
+
+%macro chroma_mc8_mmx_func 2-3
+%ifidn %2, rv40
+%ifdef PIC
+%define rnd_1d_rv40 r8
+%define rnd_2d_rv40 r8
+%define extra_regs 2
+%else ; no-PIC
+%define rnd_1d_rv40 rnd_rv40_1d_tbl
+%define rnd_2d_rv40 rnd_rv40_2d_tbl
+%define extra_regs 1
+%endif ; PIC
+%else
+%define extra_regs 0
+%endif ; rv40
+; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
+;                                   uint8_t *src /* align 1 */,
+;                                   ptrdiff_t stride, int h, int mx, int my)
+cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
+    mov          r6d, r5d
+    or           r6d, r4d
+    jne .at_least_one_non_zero
+    ; mx == 0 AND my == 0 - no filter needed
+    mv0_pixels_mc8
+    REP_RET
+
+.at_least_one_non_zero:
+%ifidn %2, rv40
+%if ARCH_X86_64
+    mov           r7, r5
+    and           r7, 6         ; &~1 for mx/my=[0,7]
+    lea           r7, [r7*4+r4]
+    sar          r7d, 1
+%define rnd_bias r7
+%define dest_reg r0
+%else ; x86-32
+    mov           r0, r5
+    and           r0, 6         ; &~1 for mx/my=[0,7]
+    lea           r0, [r0*4+r4]
+    sar          r0d, 1
+%define rnd_bias r0
+%define dest_reg r5
+%endif
+%else ; vc1, h264
+%define rnd_bias  0
+%define dest_reg r0
+%endif
+
+    test         r5d, r5d
+    mov           r6, 1
+    je .my_is_zero
+    test         r4d, r4d
+    mov           r6, r2        ; dxy = x ? 1 : stride
+    jne .both_non_zero
+.my_is_zero:
+    ; mx == 0 XOR my == 0 - 1 dimensional filter only
+    or           r4d, r5d       ; x + y
+
+%ifidn %2, rv40
+%ifdef PIC
+    lea           r8, [rnd_rv40_1d_tbl]
+%endif
+%if ARCH_X86_64 == 0
+    mov           r5, r0m
+%endif
+%endif
+
+    movd          m5, r4d
+    movq          m4, [pw_8]
+    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
+    punpcklwd     m5, m5
+    punpckldq     m5, m5        ; mm5 = B = x
+    pxor          m7, m7
+    psubw         m4, m5        ; mm4 = A = 8-x
+
+.next1drow:
+    movq          m0, [r1   ]   ; mm0 = src[0..7]
+    movq          m2, [r1+r6]   ; mm1 = src[1..8]
+
+    movq          m1, m0
+    movq          m3, m2
+    punpcklbw     m0, m7
+    punpckhbw     m1, m7
+    punpcklbw     m2, m7
+    punpckhbw     m3, m7
+    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
+    pmullw        m1, m4
+    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
+    pmullw        m3, m5
+
+    paddw         m0, m6
+    paddw         m1, m6
+    paddw         m0, m2
+    paddw         m1, m3
+    psrlw         m0, 3
+    psrlw         m1, 3
+    packuswb      m0, m1
+    CHROMAMC_AVG  m0, [dest_reg]
+    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
+
+    add     dest_reg, r2
+    add           r1, r2
+    dec           r3d
+    jne .next1drow
+    REP_RET
+
+.both_non_zero: ; general case, bilinear
+    movd          m4, r4d         ; x
+    movd          m6, r5d         ; y
+%ifidn %2, rv40
+%ifdef PIC
+    lea           r8, [rnd_rv40_2d_tbl]
+%endif
+%if ARCH_X86_64 == 0
+    mov           r5, r0m
+%endif
+%endif
+    mov           r6, rsp         ; backup stack pointer
+    and          rsp, ~(mmsize-1) ; align stack
+    sub          rsp, 16          ; AA and DD
+
+    punpcklwd     m4, m4
+    punpcklwd     m6, m6
+    punpckldq     m4, m4          ; mm4 = x words
+    punpckldq     m6, m6          ; mm6 = y words
+    movq          m5, m4
+    pmullw        m4, m6          ; mm4 = x * y
+    psllw         m5, 3
+    psllw         m6, 3
+    movq          m7, m5
+    paddw         m7, m6
+    movq     [rsp+8], m4          ; DD = x * y
+    psubw         m5, m4          ; mm5 = B = 8x - xy
+    psubw         m6, m4          ; mm6 = C = 8y - xy
+    paddw         m4, [pw_64]
+    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
+    pxor          m7, m7
+    movq     [rsp  ], m4
+
+    movq          m0, [r1  ]      ; mm0 = src[0..7]
+    movq          m1, [r1+1]      ; mm1 = src[1..8]
+.next2drow:
+    add           r1, r2
+
+    movq          m2, m0
+    movq          m3, m1
+    punpckhbw     m0, m7
+    punpcklbw     m1, m7
+    punpcklbw     m2, m7
+    punpckhbw     m3, m7
+    pmullw        m0, [rsp]
+    pmullw        m2, [rsp]
+    pmullw        m1, m5
+    pmullw        m3, m5
+    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
+    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]
+
+    movq          m0, [r1]
+    movq          m1, m0
+    punpcklbw     m0, m7
+    punpckhbw     m1, m7
+    pmullw        m0, m6
+    pmullw        m1, m6
+    paddw         m2, m0
+    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]
+
+    movq          m1, [r1+1]
+    movq          m0, m1
+    movq          m4, m1
+    punpcklbw     m0, m7
+    punpckhbw     m4, m7
+    pmullw        m0, [rsp+8]
+    pmullw        m4, [rsp+8]
+    paddw         m2, m0
+    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
+    movq          m0, [r1]
+
+    paddw         m2, [rnd_2d_%2+rnd_bias*8]
+    paddw         m3, [rnd_2d_%2+rnd_bias*8]
+    psrlw         m2, 6
+    psrlw         m3, 6
+    packuswb      m2, m3
+    CHROMAMC_AVG  m2, [dest_reg]
+    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
+
+    add     dest_reg, r2
+    dec          r3d
+    jne .next2drow
+    mov          rsp, r6          ; restore stack pointer
+    RET
+%endmacro
+
+%macro chroma_mc4_mmx_func 2
+%define extra_regs 0
+%ifidn %2, rv40
+%ifdef PIC
+%define extra_regs 1
+%endif ; PIC
+%endif ; rv40
+cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
+    pxor          m7, m7
+    movd          m2, r4d         ; x
+    movd          m3, r5d         ; y
+    movq          m4, [pw_8]
+    movq          m5, [pw_8]
+    punpcklwd     m2, m2
+    punpcklwd     m3, m3
+    punpcklwd     m2, m2
+    punpcklwd     m3, m3
+    psubw         m4, m2
+    psubw         m5, m3
+
+%ifidn %2, rv40
+%ifdef PIC
+   lea            r6, [rnd_rv40_2d_tbl]
+%define rnd_2d_rv40 r6
+%else
+%define rnd_2d_rv40 rnd_rv40_2d_tbl
+%endif
+    and           r5, 6         ; &~1 for mx/my=[0,7]
+    lea           r5, [r5*4+r4]
+    sar          r5d, 1
+%define rnd_bias r5
+%else ; vc1, h264
+%define rnd_bias 0
+%endif
+
+    movd          m0, [r1  ]
+    movd          m6, [r1+1]
+    add           r1, r2
+    punpcklbw     m0, m7
+    punpcklbw     m6, m7
+    pmullw        m0, m4
+    pmullw        m6, m2
+    paddw         m6, m0
+
+.next2rows:
+    movd          m0, [r1  ]
+    movd          m1, [r1+1]
+    add           r1, r2
+    punpcklbw     m0, m7
+    punpcklbw     m1, m7
+    pmullw        m0, m4
+    pmullw        m1, m2
+    paddw         m1, m0
+    movq          m0, m1
+
+    pmullw        m6, m5
+    pmullw        m1, m3
+    paddw         m6, [rnd_2d_%2+rnd_bias*8]
+    paddw         m1, m6
+    psrlw         m1, 6
+    packuswb      m1, m1
+    CHROMAMC_AVG4 m1, m6, [r0]
+    movd        [r0], m1
+    add           r0, r2
+
+    movd          m6, [r1  ]
+    movd          m1, [r1+1]
+    add           r1, r2
+    punpcklbw     m6, m7
+    punpcklbw     m1, m7
+    pmullw        m6, m4
+    pmullw        m1, m2
+    paddw         m1, m6
+    movq          m6, m1
+    pmullw        m0, m5
+    pmullw        m1, m3
+    paddw         m0, [rnd_2d_%2+rnd_bias*8]
+    paddw         m1, m0
+    psrlw         m1, 6
+    packuswb      m1, m1
+    CHROMAMC_AVG4 m1, m0, [r0]
+    movd        [r0], m1
+    add           r0, r2
+    sub          r3d, 2
+    jnz .next2rows
+    REP_RET
+%endmacro
+
+%macro chroma_mc2_mmx_func 2
+cglobal %1_%2_chroma_mc2, 6, 7, 0
+    mov          r6d, r4d
+    shl          r4d, 16
+    sub          r4d, r6d
+    add          r4d, 8
+    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
+    shl          r4d, 3
+    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
+
+    movd          m5, r4d
+    movd          m6, r5d
+    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
+    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
+    pxor          m7, m7
+    movd          m2, [r1]
+    punpcklbw     m2, m7
+    pshufw        m2, m2, 0x94    ; mm0 = src[0,1,1,2]
+
+.nextrow:
+    add           r1, r2
+    movq          m1, m2
+    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
+    movd          m0, [r1]
+    punpcklbw     m0, m7
+    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
+    movq          m2, m0
+    pmaddwd       m0, m6
+    paddw         m1, [rnd_2d_%2]
+    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
+    psrlw         m1, 6
+    packssdw      m1, m7
+    packuswb      m1, m7
+    CHROMAMC_AVG4 m1, m3, [r0]
+    movd         r5d, m1
+    mov         [r0], r5w
+    add           r0, r2
+    sub          r3d, 1
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+%define rnd_1d_h264 pw_4
+%define rnd_2d_h264 pw_32
+%define rnd_1d_vc1  pw_3
+%define rnd_2d_vc1  pw_28
+
+%macro NOTHING 2-3
+%endmacro
+%macro DIRECT_AVG 2
+    PAVGB         %1, %2
+%endmacro
+%macro COPY_AVG 3
+    movd          %2, %3
+    PAVGB         %1, %2
+%endmacro
+
+INIT_MMX mmx
+%define CHROMAMC_AVG  NOTHING
+%define CHROMAMC_AVG4 NOTHING
+chroma_mc8_mmx_func put, h264, _rnd
+chroma_mc8_mmx_func put, vc1,  _nornd
+chroma_mc8_mmx_func put, rv40
+chroma_mc4_mmx_func put, h264
+chroma_mc4_mmx_func put, rv40
+
+INIT_MMX mmxext
+chroma_mc2_mmx_func put, h264
+
+%define CHROMAMC_AVG  DIRECT_AVG
+%define CHROMAMC_AVG4 COPY_AVG
+chroma_mc8_mmx_func avg, h264, _rnd
+chroma_mc8_mmx_func avg, vc1,  _nornd
+chroma_mc8_mmx_func avg, rv40
+chroma_mc4_mmx_func avg, h264
+chroma_mc4_mmx_func avg, rv40
+chroma_mc2_mmx_func avg, h264
+
+INIT_MMX 3dnow
+chroma_mc8_mmx_func avg, h264, _rnd
+chroma_mc8_mmx_func avg, vc1,  _nornd
+chroma_mc8_mmx_func avg, rv40
+chroma_mc4_mmx_func avg, h264
+chroma_mc4_mmx_func avg, rv40
+
+%macro chroma_mc8_ssse3_func 2-3
+cglobal %1_%2_chroma_mc8%3, 6, 7, 8
+    mov          r6d, r5d
+    or           r6d, r4d
+    jne .at_least_one_non_zero
+    ; mx == 0 AND my == 0 - no filter needed
+    mv0_pixels_mc8
+    REP_RET
+
+.at_least_one_non_zero:
+    test         r5d, r5d
+    je .my_is_zero
+    test         r4d, r4d
+    je .mx_is_zero
+
+    ; general case, bilinear
+    mov          r6d, r4d
+    shl          r4d, 8
+    sub           r4, r6
+    mov           r6, 8
+    add           r4, 8           ; x*288+8 = x<<8 | (8-x)
+    sub          r6d, r5d
+    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
+    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
+
+    movd          m7, r6d
+    movd          m6, r4d
+    movdqa        m5, [rnd_2d_%2]
+    movq          m0, [r1  ]
+    movq          m1, [r1+1]
+    pshuflw       m7, m7, 0
+    pshuflw       m6, m6, 0
+    punpcklbw     m0, m1
+    movlhps       m7, m7
+    movlhps       m6, m6
+
+.next2rows:
+    movq          m1, [r1+r2*1   ]
+    movq          m2, [r1+r2*1+1]
+    movq          m3, [r1+r2*2  ]
+    movq          m4, [r1+r2*2+1]
+    lea           r1, [r1+r2*2]
+    punpcklbw     m1, m2
+    movdqa        m2, m1
+    punpcklbw     m3, m4
+    movdqa        m4, m3
+    pmaddubsw     m0, m7
+    pmaddubsw     m1, m6
+    pmaddubsw     m2, m7
+    pmaddubsw     m3, m6
+    paddw         m0, m5
+    paddw         m2, m5
+    paddw         m1, m0
+    paddw         m3, m2
+    psrlw         m1, 6
+    movdqa        m0, m4
+    psrlw         m3, 6
+%ifidn %1, avg
+    movq          m2, [r0   ]
+    movhps        m2, [r0+r2]
+%endif
+    packuswb      m1, m3
+    CHROMAMC_AVG  m1, m2
+    movq     [r0   ], m1
+    movhps   [r0+r2], m1
+    sub          r3d, 2
+    lea           r0, [r0+r2*2]
+    jg .next2rows
+    REP_RET
+
+.my_is_zero:
+    mov          r5d, r4d
+    shl          r4d, 8
+    add           r4, 8
+    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
+    movd          m7, r4d
+    movdqa        m6, [rnd_1d_%2]
+    pshuflw       m7, m7, 0
+    movlhps       m7, m7
+
+.next2xrows:
+    movq          m0, [r1     ]
+    movq          m1, [r1   +1]
+    movq          m2, [r1+r2  ]
+    movq          m3, [r1+r2+1]
+    punpcklbw     m0, m1
+    punpcklbw     m2, m3
+    pmaddubsw     m0, m7
+    pmaddubsw     m2, m7
+%ifidn %1, avg
+    movq          m4, [r0   ]
+    movhps        m4, [r0+r2]
+%endif
+    paddw         m0, m6
+    paddw         m2, m6
+    psrlw         m0, 3
+    psrlw         m2, 3
+    packuswb      m0, m2
+    CHROMAMC_AVG  m0, m4
+    movq     [r0   ], m0
+    movhps   [r0+r2], m0
+    sub          r3d, 2
+    lea           r0, [r0+r2*2]
+    lea           r1, [r1+r2*2]
+    jg .next2xrows
+    REP_RET
+
+.mx_is_zero:
+    mov          r4d, r5d
+    shl          r5d, 8
+    add           r5, 8
+    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
+    movd          m7, r5d
+    movdqa        m6, [rnd_1d_%2]
+    pshuflw       m7, m7, 0
+    movlhps       m7, m7
+
+.next2yrows:
+    movq          m0, [r1     ]
+    movq          m1, [r1+r2  ]
+    movdqa        m2, m1
+    movq          m3, [r1+r2*2]
+    lea           r1, [r1+r2*2]
+    punpcklbw     m0, m1
+    punpcklbw     m2, m3
+    pmaddubsw     m0, m7
+    pmaddubsw     m2, m7
+%ifidn %1, avg
+    movq          m4, [r0   ]
+    movhps        m4, [r0+r2]
+%endif
+    paddw         m0, m6
+    paddw         m2, m6
+    psrlw         m0, 3
+    psrlw         m2, 3
+    packuswb      m0, m2
+    CHROMAMC_AVG  m0, m4
+    movq     [r0   ], m0
+    movhps   [r0+r2], m0
+    sub          r3d, 2
+    lea           r0, [r0+r2*2]
+    jg .next2yrows
+    REP_RET
+%endmacro
+
+%macro chroma_mc4_ssse3_func 2
+cglobal %1_%2_chroma_mc4, 6, 7, 0
+    mov           r6, r4
+    shl          r4d, 8
+    sub          r4d, r6d
+    mov           r6, 8
+    add          r4d, 8           ; x*288+8
+    sub          r6d, r5d
+    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
+    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
+
+    movd          m7, r6d
+    movd          m6, r4d
+    movq          m5, [pw_32]
+    movd          m0, [r1  ]
+    pshufw        m7, m7, 0
+    punpcklbw     m0, [r1+1]
+    pshufw        m6, m6, 0
+
+.next2rows:
+    movd          m1, [r1+r2*1  ]
+    movd          m3, [r1+r2*2  ]
+    punpcklbw     m1, [r1+r2*1+1]
+    punpcklbw     m3, [r1+r2*2+1]
+    lea           r1, [r1+r2*2]
+    movq          m2, m1
+    movq          m4, m3
+    pmaddubsw     m0, m7
+    pmaddubsw     m1, m6
+    pmaddubsw     m2, m7
+    pmaddubsw     m3, m6
+    paddw         m0, m5
+    paddw         m2, m5
+    paddw         m1, m0
+    paddw         m3, m2
+    psrlw         m1, 6
+    movq          m0, m4
+    psrlw         m3, 6
+    packuswb      m1, m1
+    packuswb      m3, m3
+    CHROMAMC_AVG  m1, [r0  ]
+    CHROMAMC_AVG  m3, [r0+r2]
+    movd     [r0   ], m1
+    movd     [r0+r2], m3
+    sub          r3d, 2
+    lea           r0, [r0+r2*2]
+    jg .next2rows
+    REP_RET
+%endmacro
+
+%define CHROMAMC_AVG NOTHING
+INIT_XMM ssse3
+chroma_mc8_ssse3_func put, h264, _rnd
+chroma_mc8_ssse3_func put, vc1,  _nornd
+INIT_MMX ssse3
+chroma_mc4_ssse3_func put, h264
+
+%define CHROMAMC_AVG DIRECT_AVG
+INIT_XMM ssse3
+chroma_mc8_ssse3_func avg, h264, _rnd
+chroma_mc8_ssse3_func avg, vc1,  _nornd
+INIT_MMX ssse3
+chroma_mc4_ssse3_func avg, h264
--- a/externals/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
@@ -0,0 +1,269 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_4
+cextern pw_8
+cextern pw_32
+cextern pw_64
+
+SECTION .text
+
+
+%macro MV0_PIXELS_MC8 0
+    lea           r4, [r2*3   ]
+    lea           r5, [r2*4   ]
+.next4rows:
+    movu          m0, [r1     ]
+    movu          m1, [r1+r2  ]
+    CHROMAMC_AVG  m0, [r0     ]
+    CHROMAMC_AVG  m1, [r0+r2  ]
+    mova   [r0     ], m0
+    mova   [r0+r2  ], m1
+    movu          m0, [r1+r2*2]
+    movu          m1, [r1+r4  ]
+    CHROMAMC_AVG  m0, [r0+r2*2]
+    CHROMAMC_AVG  m1, [r0+r4  ]
+    mova   [r0+r2*2], m0
+    mova   [r0+r4  ], m1
+    add           r1, r5
+    add           r0, r5
+    sub          r3d, 4
+    jne .next4rows
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, ptrdiff_t stride,
+;                                 int h, int mx, int my)
+;-----------------------------------------------------------------------------
+%macro CHROMA_MC8 1
+cglobal %1_h264_chroma_mc8_10, 6,7,8
+    mov          r6d, r5d
+    or           r6d, r4d
+    jne .at_least_one_non_zero
+    ; mx == 0 AND my == 0 - no filter needed
+    MV0_PIXELS_MC8
+    REP_RET
+
+.at_least_one_non_zero:
+    mov          r6d, 2
+    test         r5d, r5d
+    je .x_interpolation
+    mov           r6, r2        ; dxy = x ? 1 : stride
+    test         r4d, r4d
+    jne .xy_interpolation
+.x_interpolation:
+    ; mx == 0 XOR my == 0 - 1 dimensional filter only
+    or           r4d, r5d       ; x + y
+    movd          m5, r4d
+    mova          m4, [pw_8]
+    mova          m6, [pw_4]    ; mm6 = rnd >> 3
+    SPLATW        m5, m5        ; mm5 = B = x
+    psubw         m4, m5        ; mm4 = A = 8-x
+
+.next1drow:
+    movu          m0, [r1   ]   ; mm0 = src[0..7]
+    movu          m2, [r1+r6]   ; mm2 = src[1..8]
+
+    pmullw        m0, m4        ; mm0 = A * src[0..7]
+    pmullw        m2, m5        ; mm2 = B * src[1..8]
+
+    paddw         m0, m6
+    paddw         m0, m2
+    psrlw         m0, 3
+    CHROMAMC_AVG  m0, [r0]
+    mova        [r0], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
+
+    add           r0, r2
+    add           r1, r2
+    dec           r3d
+    jne .next1drow
+    REP_RET
+
+.xy_interpolation: ; general case, bilinear
+    movd          m4, r4m         ; x
+    movd          m6, r5m         ; y
+
+    SPLATW        m4, m4          ; mm4 = x words
+    SPLATW        m6, m6          ; mm6 = y words
+    psllw         m5, m4, 3       ; mm5 = 8x
+    pmullw        m4, m6          ; mm4 = x * y
+    psllw         m6, 3           ; mm6 = 8y
+    paddw         m1, m5, m6      ; mm7 = 8x+8y
+    mova          m7, m4          ; DD = x * y
+    psubw         m5, m4          ; mm5 = B = 8x - xy
+    psubw         m6, m4          ; mm6 = C = 8y - xy
+    paddw         m4, [pw_64]
+    psubw         m4, m1          ; mm4 = A = xy - (8x+8y) + 64
+
+    movu          m0, [r1  ]      ; mm0 = src[0..7]
+    movu          m1, [r1+2]      ; mm1 = src[1..8]
+.next2drow:
+    add           r1, r2
+
+    pmullw        m2, m0, m4
+    pmullw        m1, m5
+    paddw         m2, m1          ; mm2 = A * src[0..7] + B * src[1..8]
+
+    movu          m0, [r1]
+    movu          m1, [r1+2]
+    pmullw        m3, m0, m6
+    paddw         m2, m3          ; mm2 += C * src[0..7+strde]
+    pmullw        m3, m1, m7
+    paddw         m2, m3          ; mm2 += D * src[1..8+strde]
+
+    paddw         m2, [pw_32]
+    psrlw         m2, 6
+    CHROMAMC_AVG  m2, [r0]
+    mova        [r0], m2          ; dst[0..7] = (mm2 + 32) >> 6
+
+    add           r0, r2
+    dec          r3d
+    jne .next2drow
+    REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, ptrdiff_t stride,
+;                                 int h, int mx, int my)
+;-----------------------------------------------------------------------------
+;TODO: xmm mc4
+%macro MC4_OP 2
+    movq          %1, [r1  ]
+    movq          m1, [r1+2]
+    add           r1, r2
+    pmullw        %1, m4
+    pmullw        m1, m2
+    paddw         m1, %1
+    mova          %1, m1
+
+    pmullw        %2, m5
+    pmullw        m1, m3
+    paddw         %2, [pw_32]
+    paddw         m1, %2
+    psrlw         m1, 6
+    CHROMAMC_AVG  m1, %2, [r0]
+    movq        [r0], m1
+    add           r0, r2
+%endmacro
+
+%macro CHROMA_MC4 1
+cglobal %1_h264_chroma_mc4_10, 6,6,7
+    movd          m2, r4m         ; x
+    movd          m3, r5m         ; y
+    mova          m4, [pw_8]
+    mova          m5, m4
+    SPLATW        m2, m2
+    SPLATW        m3, m3
+    psubw         m4, m2
+    psubw         m5, m3
+
+    movq          m0, [r1  ]
+    movq          m6, [r1+2]
+    add           r1, r2
+    pmullw        m0, m4
+    pmullw        m6, m2
+    paddw         m6, m0
+
+.next2rows:
+    MC4_OP m0, m6
+    MC4_OP m6, m0
+    sub   r3d, 2
+    jnz .next2rows
+    REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, ptrdiff_t stride,
+;                                 int h, int mx, int my)
+;-----------------------------------------------------------------------------
+%macro CHROMA_MC2 1
+cglobal %1_h264_chroma_mc2_10, 6,7
+    mov          r6d, r4d
+    shl          r4d, 16
+    sub          r4d, r6d
+    add          r4d, 8
+    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
+    shl          r4d, 3
+    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
+
+    movd          m5, r4d
+    movd          m6, r5d
+    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
+    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
+    pxor          m7, m7
+    pshufw        m2, [r1], 0x94    ; mm0 = src[0,1,1,2]
+
+.nextrow:
+    add           r1, r2
+    movq          m1, m2
+    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
+    pshufw        m0, [r1], 0x94    ; mm0 = src[0,1,1,2]
+    movq          m2, m0
+    pmaddwd       m0, m6
+    paddw         m1, [pw_32]
+    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
+    psrlw         m1, 6
+    packssdw      m1, m7
+    CHROMAMC_AVG  m1, m3, [r0]
+    movd        [r0], m1
+    add           r0, r2
+    dec          r3d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+%macro NOTHING 2-3
+%endmacro
+%macro AVG 2-3
+%if %0==3
+    movq          %2, %3
+%endif
+    pavgw         %1, %2
+%endmacro
+
+%define CHROMAMC_AVG  NOTHING
+INIT_XMM sse2
+CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+CHROMA_MC8 put
+%endif
+INIT_MMX mmxext
+CHROMA_MC4 put
+CHROMA_MC2 put
+
+%define CHROMAMC_AVG  AVG
+INIT_XMM sse2
+CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+CHROMA_MC8 avg
+%endif
+INIT_MMX mmxext
+CHROMA_MC4 avg
+CHROMA_MC2 avg
--- a/externals/ffmpeg/libavcodec/x86/h264_deblock.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_deblock.asm
--- a/externals/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
--- a/externals/ffmpeg/libavcodec/x86/h264_idct.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_idct.asm
--- a/externals/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
@@ -0,0 +1,657 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pd_32
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro STORE_DIFFx2 6
+    psrad       %1, 6
+    psrad       %2, 6
+    packssdw    %1, %2
+    movq        %3, [%5]
+    movhps      %3, [%5+%6]
+    paddsw      %1, %3
+    CLIPW       %1, %4, [pw_pixel_max]
+    movq      [%5], %1
+    movhps [%5+%6], %1
+%endmacro
+
+%macro STORE_DIFF16 5
+    psrad       %1, 6
+    psrad       %2, 6
+    packssdw    %1, %2
+    paddsw      %1, [%5]
+    CLIPW       %1, %3, %4
+    mova      [%5], %1
+%endmacro
+
+;dst, in, stride
+%macro IDCT4_ADD_10 3
+    mova  m0, [%2+ 0]
+    mova  m1, [%2+16]
+    mova  m2, [%2+32]
+    mova  m3, [%2+48]
+    IDCT4_1D d,0,1,2,3,4,5
+    TRANSPOSE4x4D 0,1,2,3,4
+    paddd m0, [pd_32]
+    IDCT4_1D d,0,1,2,3,4,5
+    pxor  m5, m5
+    mova [%2+ 0], m5
+    mova [%2+16], m5
+    mova [%2+32], m5
+    mova [%2+48], m5
+    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
+    lea   %1, [%1+%3*2]
+    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
+%endmacro
+
+%macro IDCT_ADD_10 0
+cglobal h264_idct_add_10, 3,3
+    movsxdifnidn r2, r2d
+    IDCT4_ADD_10 r0, r1, r2
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD_10
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
+;                            int16_t *block, int stride,
+;                            const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+;;;;;;; NO FATE SAMPLES TRIGGER THIS
+%macro ADD4x4IDCT 0
+add4x4_idct %+ SUFFIX:
+    add   r5, r0
+    mova  m0, [r2+ 0]
+    mova  m1, [r2+16]
+    mova  m2, [r2+32]
+    mova  m3, [r2+48]
+    IDCT4_1D d,0,1,2,3,4,5
+    TRANSPOSE4x4D 0,1,2,3,4
+    paddd m0, [pd_32]
+    IDCT4_1D d,0,1,2,3,4,5
+    pxor  m5, m5
+    mova  [r2+ 0], m5
+    mova  [r2+16], m5
+    mova  [r2+32], m5
+    mova  [r2+48], m5
+    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
+    lea   r5, [r5+r3*2]
+    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
+    ret
+%endmacro
+
+INIT_XMM sse2
+ALIGN 16
+ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+ALIGN 16
+ADD4x4IDCT
+%endif
+
+%macro ADD16_OP 2
+    cmp          byte [r4+%2], 0
+    jz .skipblock%1
+    mov         r5d, [r1+%1*4]
+    call add4x4_idct %+ SUFFIX
+.skipblock%1:
+%if %1<15
+    add          r2, 64
+%endif
+%endmacro
+
+%macro IDCT_ADD16_10 0
+cglobal h264_idct_add16_10, 5,6
+    movsxdifnidn r3, r3d
+    ADD16_OP 0, 4+1*8
+    ADD16_OP 1, 5+1*8
+    ADD16_OP 2, 4+2*8
+    ADD16_OP 3, 5+2*8
+    ADD16_OP 4, 6+1*8
+    ADD16_OP 5, 7+1*8
+    ADD16_OP 6, 6+2*8
+    ADD16_OP 7, 7+2*8
+    ADD16_OP 8, 4+3*8
+    ADD16_OP 9, 5+3*8
+    ADD16_OP 10, 4+4*8
+    ADD16_OP 11, 5+4*8
+    ADD16_OP 12, 6+3*8
+    ADD16_OP 13, 7+3*8
+    ADD16_OP 14, 6+4*8
+    ADD16_OP 15, 7+4*8
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD16_10
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT_DC_ADD_OP_10 3
+    pxor      m5, m5
+%if avx_enabled
+    paddw     m1, m0, [%1+0   ]
+    paddw     m2, m0, [%1+%2  ]
+    paddw     m3, m0, [%1+%2*2]
+    paddw     m4, m0, [%1+%3  ]
+%else
+    mova      m1, [%1+0   ]
+    mova      m2, [%1+%2  ]
+    mova      m3, [%1+%2*2]
+    mova      m4, [%1+%3  ]
+    paddw     m1, m0
+    paddw     m2, m0
+    paddw     m3, m0
+    paddw     m4, m0
+%endif
+    CLIPW     m1, m5, m6
+    CLIPW     m2, m5, m6
+    CLIPW     m3, m5, m6
+    CLIPW     m4, m5, m6
+    mova [%1+0   ], m1
+    mova [%1+%2  ], m2
+    mova [%1+%2*2], m3
+    mova [%1+%3  ], m4
+%endmacro
+
+INIT_MMX mmxext
+cglobal h264_idct_dc_add_10,3,3
+    movsxdifnidn r2, r2d
+    movd      m0, [r1]
+    mov dword [r1], 0
+    paddd     m0, [pd_32]
+    psrad     m0, 6
+    lea       r1, [r2*3]
+    pshufw    m0, m0, 0
+    mova      m6, [pw_pixel_max]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT8_DC_ADD 0
+cglobal h264_idct8_dc_add_10,3,4,7
+    movsxdifnidn r2, r2d
+    movd      m0, [r1]
+    mov dword[r1], 0
+    paddd     m0, [pd_32]
+    psrad     m0, 6
+    lea       r1, [r2*3]
+    SPLATW    m0, m0, 0
+    mova      m6, [pw_pixel_max]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea       r0, [r0+r2*4]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_DC_ADD
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
+;                                 int16_t *block, int stride,
+;                                 const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%macro AC 1
+.ac%1:
+    mov  r5d, [r1+(%1+0)*4]
+    call add4x4_idct %+ SUFFIX
+    mov  r5d, [r1+(%1+1)*4]
+    add  r2, 64
+    call add4x4_idct %+ SUFFIX
+    add  r2, 64
+    jmp .skipadd%1
+%endmacro
+
+%assign last_block 16
+%macro ADD16_OP_INTRA 2
+    cmp      word [r4+%2], 0
+    jnz .ac%1
+    mov      r5d, [r2+ 0]
+    or       r5d, [r2+64]
+    jz .skipblock%1
+    mov      r5d, [r1+(%1+0)*4]
+    call idct_dc_add %+ SUFFIX
+.skipblock%1:
+%if %1<last_block-2
+    add       r2, 128
+%endif
+.skipadd%1:
+%endmacro
+
+%macro IDCT_ADD16INTRA_10 0
+idct_dc_add %+ SUFFIX:
+    add       r5, r0
+    movq      m0, [r2+ 0]
+    movhps    m0, [r2+64]
+    mov dword [r2+ 0], 0
+    mov dword [r2+64], 0
+    paddd     m0, [pd_32]
+    psrad     m0, 6
+    pshufhw   m0, m0, 0
+    pshuflw   m0, m0, 0
+    lea       r6, [r3*3]
+    mova      m6, [pw_pixel_max]
+    IDCT_DC_ADD_OP_10 r5, r3, r6
+    ret
+
+cglobal h264_idct_add16intra_10,5,7,8
+    movsxdifnidn r3, r3d
+    ADD16_OP_INTRA 0, 4+1*8
+    ADD16_OP_INTRA 2, 4+2*8
+    ADD16_OP_INTRA 4, 6+1*8
+    ADD16_OP_INTRA 6, 6+2*8
+    ADD16_OP_INTRA 8, 4+3*8
+    ADD16_OP_INTRA 10, 4+4*8
+    ADD16_OP_INTRA 12, 6+3*8
+    ADD16_OP_INTRA 14, 6+4*8
+    REP_RET
+    AC 8
+    AC 10
+    AC 12
+    AC 14
+    AC 0
+    AC 2
+    AC 4
+    AC 6
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD16INTRA_10
+%endif
+
+%assign last_block 36
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
+;                           int16_t *block, int stride,
+;                           const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%macro IDCT_ADD8 0
+cglobal h264_idct_add8_10,5,8,7
+    movsxdifnidn r3, r3d
+%if ARCH_X86_64
+    mov      r7, r0
+%endif
+    add      r2, 1024
+    mov      r0, [r0]
+    ADD16_OP_INTRA 16, 4+ 6*8
+    ADD16_OP_INTRA 18, 4+ 7*8
+    add      r2, 1024-128*2
+%if ARCH_X86_64
+    mov      r0, [r7+gprsize]
+%else
+    mov      r0, r0m
+    mov      r0, [r0+gprsize]
+%endif
+    ADD16_OP_INTRA 32, 4+11*8
+    ADD16_OP_INTRA 34, 4+12*8
+    REP_RET
+    AC 16
+    AC 18
+    AC 32
+    AC 34
+
+%endmacro ; IDCT_ADD8
+
+INIT_XMM sse2
+IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD8
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
+;                               int16_t *block, int stride,
+;                               const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%assign last_block 44
+
+%macro IDCT_ADD8_422 0
+
+cglobal h264_idct_add8_422_10, 5, 8, 7
+    movsxdifnidn r3, r3d
+%if ARCH_X86_64
+    mov      r7, r0
+%endif
+
+    add      r2, 1024
+    mov      r0, [r0]
+    ADD16_OP_INTRA 16, 4+ 6*8
+    ADD16_OP_INTRA 18, 4+ 7*8
+    ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
+    ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
+    add      r2, 1024-128*4
+
+%if ARCH_X86_64
+    mov      r0, [r7+gprsize]
+%else
+    mov      r0, r0m
+    mov      r0, [r0+gprsize]
+%endif
+
+    ADD16_OP_INTRA 32, 4+11*8
+    ADD16_OP_INTRA 34, 4+12*8
+    ADD16_OP_INTRA 40, 4+13*8 ; i+4
+    ADD16_OP_INTRA 42, 4+14*8 ; i+4
+REP_RET
+    AC 16
+    AC 18
+    AC 24 ; i+4
+    AC 26 ; i+4
+    AC 32
+    AC 34
+    AC 40 ; i+4
+    AC 42 ; i+4
+
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD8_422
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD8_422
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT8_1D 2
+    SWAP      0, 1
+    psrad     m4, m5, 1
+    psrad     m1, m0, 1
+    paddd     m4, m5
+    paddd     m1, m0
+    paddd     m4, m7
+    paddd     m1, m5
+    psubd     m4, m0
+    paddd     m1, m3
+
+    psubd     m0, m3
+    psubd     m5, m3
+    paddd     m0, m7
+    psubd     m5, m7
+    psrad     m3, 1
+    psrad     m7, 1
+    psubd     m0, m3
+    psubd     m5, m7
+
+    SWAP      1, 7
+    psrad     m1, m7, 2
+    psrad     m3, m4, 2
+    paddd     m3, m0
+    psrad     m0, 2
+    paddd     m1, m5
+    psrad     m5, 2
+    psubd     m0, m4
+    psubd     m7, m5
+
+    SWAP      5, 6
+    psrad     m4, m2, 1
+    psrad     m6, m5, 1
+    psubd     m4, m5
+    paddd     m6, m2
+
+    mova      m2, %1
+    mova      m5, %2
+    SUMSUB_BA d, 5, 2
+    SUMSUB_BA d, 6, 5
+    SUMSUB_BA d, 4, 2
+    SUMSUB_BA d, 7, 6
+    SUMSUB_BA d, 0, 4
+    SUMSUB_BA d, 3, 2
+    SUMSUB_BA d, 1, 5
+    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
+%endmacro
+
+%macro IDCT8_1D_FULL 1
+    mova         m7, [%1+112*2]
+    mova         m6, [%1+ 96*2]
+    mova         m5, [%1+ 80*2]
+    mova         m3, [%1+ 48*2]
+    mova         m2, [%1+ 32*2]
+    mova         m1, [%1+ 16*2]
+    IDCT8_1D   [%1], [%1+ 64*2]
+%endmacro
+
+; %1=int16_t *block, %2=int16_t *dstblock
+%macro IDCT8_ADD_SSE_START 2
+    IDCT8_1D_FULL %1
+%if ARCH_X86_64
+    TRANSPOSE4x4D  0,1,2,3,8
+    mova    [%2    ], m0
+    TRANSPOSE4x4D  4,5,6,7,8
+    mova    [%2+8*2], m4
+%else
+    mova         [%1], m7
+    TRANSPOSE4x4D   0,1,2,3,7
+    mova           m7, [%1]
+    mova    [%2     ], m0
+    mova    [%2+16*2], m1
+    mova    [%2+32*2], m2
+    mova    [%2+48*2], m3
+    TRANSPOSE4x4D   4,5,6,7,3
+    mova    [%2+ 8*2], m4
+    mova    [%2+24*2], m5
+    mova    [%2+40*2], m6
+    mova    [%2+56*2], m7
+%endif
+%endmacro
+
+; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+%macro IDCT8_ADD_SSE_END 3
+    IDCT8_1D_FULL %2
+    mova  [%2     ], m6
+    mova  [%2+16*2], m7
+
+    pxor         m7, m7
+    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
+    lea          %1, [%1+%3*2]
+    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
+    mova         m0, [%2     ]
+    mova         m1, [%2+16*2]
+    lea          %1, [%1+%3*2]
+    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
+    lea          %1, [%1+%3*2]
+    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
+%endmacro
+
+%macro IDCT8_ADD 0
+cglobal h264_idct8_add_10, 3,4,16
+    movsxdifnidn r2, r2d
+%if UNIX64 == 0
+    %assign pad 16-gprsize-(stack_offset&15)
+    sub  rsp, pad
+    call h264_idct8_add1_10 %+ SUFFIX
+    add  rsp, pad
+    RET
+%endif
+
+ALIGN 16
+; TODO: does not need to use stack
+h264_idct8_add1_10 %+ SUFFIX:
+%assign pad 256+16-gprsize
+    sub          rsp, pad
+    add   dword [r1], 32
+
+%if ARCH_X86_64
+    IDCT8_ADD_SSE_START r1, rsp
+    SWAP 1,  9
+    SWAP 2, 10
+    SWAP 3, 11
+    SWAP 5, 13
+    SWAP 6, 14
+    SWAP 7, 15
+    IDCT8_ADD_SSE_START r1+16, rsp+128
+    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
+    IDCT8_1D [rsp], [rsp+128]
+    SWAP 0,  8
+    SWAP 1,  9
+    SWAP 2, 10
+    SWAP 3, 11
+    SWAP 4, 12
+    SWAP 5, 13
+    SWAP 6, 14
+    SWAP 7, 15
+    IDCT8_1D [rsp+16], [rsp+144]
+    psrad         m8, 6
+    psrad         m0, 6
+    packssdw      m8, m0
+    paddsw        m8, [r0]
+    pxor          m0, m0
+    mova    [r1+  0], m0
+    mova    [r1+ 16], m0
+    mova    [r1+ 32], m0
+    mova    [r1+ 48], m0
+    mova    [r1+ 64], m0
+    mova    [r1+ 80], m0
+    mova    [r1+ 96], m0
+    mova    [r1+112], m0
+    mova    [r1+128], m0
+    mova    [r1+144], m0
+    mova    [r1+160], m0
+    mova    [r1+176], m0
+    mova    [r1+192], m0
+    mova    [r1+208], m0
+    mova    [r1+224], m0
+    mova    [r1+240], m0
+    CLIPW         m8, m0, [pw_pixel_max]
+    mova        [r0], m8
+    mova          m8, [pw_pixel_max]
+    STORE_DIFF16  m9, m1, m0, m8, r0+r2
+    lea           r0, [r0+r2*2]
+    STORE_DIFF16 m10, m2, m0, m8, r0
+    STORE_DIFF16 m11, m3, m0, m8, r0+r2
+    lea           r0, [r0+r2*2]
+    STORE_DIFF16 m12, m4, m0, m8, r0
+    STORE_DIFF16 m13, m5, m0, m8, r0+r2
+    lea           r0, [r0+r2*2]
+    STORE_DIFF16 m14, m6, m0, m8, r0
+    STORE_DIFF16 m15, m7, m0, m8, r0+r2
+%else
+    IDCT8_ADD_SSE_START r1,    rsp
+    IDCT8_ADD_SSE_START r1+16, rsp+128
+    lea           r3, [r0+8]
+    IDCT8_ADD_SSE_END r0, rsp,    r2
+    IDCT8_ADD_SSE_END r3, rsp+16, r2
+    mova    [r1+  0], m7
+    mova    [r1+ 16], m7
+    mova    [r1+ 32], m7
+    mova    [r1+ 48], m7
+    mova    [r1+ 64], m7
+    mova    [r1+ 80], m7
+    mova    [r1+ 96], m7
+    mova    [r1+112], m7
+    mova    [r1+128], m7
+    mova    [r1+144], m7
+    mova    [r1+160], m7
+    mova    [r1+176], m7
+    mova    [r1+192], m7
+    mova    [r1+208], m7
+    mova    [r1+224], m7
+    mova    [r1+240], m7
+%endif ; ARCH_X86_64
+
+    add          rsp, pad
+    ret
+%endmacro
+
+INIT_XMM sse2
+IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_ADD
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
+;                            int16_t *block, int stride,
+;                            const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+;;;;;;; NO FATE SAMPLES TRIGGER THIS
+%macro IDCT8_ADD4_OP 2
+    cmp       byte [r4+%2], 0
+    jz .skipblock%1
+    mov      r0d, [r6+%1*4]
+    add       r0, r5
+    call h264_idct8_add1_10 %+ SUFFIX
+.skipblock%1:
+%if %1<12
+    add       r1, 256
+%endif
+%endmacro
+
+%macro IDCT8_ADD4 0
+cglobal h264_idct8_add4_10, 0,7,16
+    movsxdifnidn r3, r3d
+    %assign pad 16-gprsize-(stack_offset&15)
+    SUB      rsp, pad
+    mov       r5, r0mp
+    mov       r6, r1mp
+    mov       r1, r2mp
+    mov      r2d, r3m
+    movifnidn r4, r4mp
+    IDCT8_ADD4_OP  0, 4+1*8
+    IDCT8_ADD4_OP  4, 6+1*8
+    IDCT8_ADD4_OP  8, 4+3*8
+    IDCT8_ADD4_OP 12, 6+3*8
+    ADD       rsp, pad
+    RET
+%endmacro ; IDCT8_ADD4
+
+INIT_XMM sse2
+IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_ADD4
+%endif
--- a/externals/ffmpeg/libavcodec/x86/h264_intrapred.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_intrapred.asm
--- a/externals/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
--- a/externals/ffmpeg/libavcodec/x86/h264_intrapred_init.c
+++ b/externals/ffmpeg/libavcodec/x86/h264_intrapred_init.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+
+#define PRED4x4(TYPE, DEPTH, OPT) \
+void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+                                                    const uint8_t *topright, \
+                                                    ptrdiff_t stride);
+
+PRED4x4(dc, 10, mmxext)
+PRED4x4(down_left, 10, sse2)
+PRED4x4(down_left, 10, avx)
+PRED4x4(down_right, 10, sse2)
+PRED4x4(down_right, 10, ssse3)
+PRED4x4(down_right, 10, avx)
+PRED4x4(vertical_left, 10, sse2)
+PRED4x4(vertical_left, 10, avx)
+PRED4x4(vertical_right, 10, sse2)
+PRED4x4(vertical_right, 10, ssse3)
+PRED4x4(vertical_right, 10, avx)
+PRED4x4(horizontal_up, 10, mmxext)
+PRED4x4(horizontal_down, 10, sse2)
+PRED4x4(horizontal_down, 10, ssse3)
+PRED4x4(horizontal_down, 10, avx)
+
+#define PRED8x8(TYPE, DEPTH, OPT) \
+void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+                                                    ptrdiff_t stride);
+
+PRED8x8(dc, 10, mmxext)
+PRED8x8(dc, 10, sse2)
+PRED8x8(top_dc, 10, sse2)
+PRED8x8(plane, 10, sse2)
+PRED8x8(vertical, 10, sse2)
+PRED8x8(horizontal, 10, sse2)
+
+#define PRED8x8L(TYPE, DEPTH, OPT)\
+void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+                                                     int has_topleft, \
+                                                     int has_topright, \
+                                                     ptrdiff_t stride);
+
+PRED8x8L(dc, 10, sse2)
+PRED8x8L(dc, 10, avx)
+PRED8x8L(128_dc, 10, mmxext)
+PRED8x8L(128_dc, 10, sse2)
+PRED8x8L(top_dc, 10, sse2)
+PRED8x8L(top_dc, 10, avx)
+PRED8x8L(vertical, 10, sse2)
+PRED8x8L(vertical, 10, avx)
+PRED8x8L(horizontal, 10, sse2)
+PRED8x8L(horizontal, 10, ssse3)
+PRED8x8L(horizontal, 10, avx)
+PRED8x8L(down_left, 10, sse2)
+PRED8x8L(down_left, 10, ssse3)
+PRED8x8L(down_left, 10, avx)
+PRED8x8L(down_right, 10, sse2)
+PRED8x8L(down_right, 10, ssse3)
+PRED8x8L(down_right, 10, avx)
+PRED8x8L(vertical_right, 10, sse2)
+PRED8x8L(vertical_right, 10, ssse3)
+PRED8x8L(vertical_right, 10, avx)
+PRED8x8L(horizontal_up, 10, sse2)
+PRED8x8L(horizontal_up, 10, ssse3)
+PRED8x8L(horizontal_up, 10, avx)
+
+#define PRED16x16(TYPE, DEPTH, OPT)\
+void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+                                                      ptrdiff_t stride);
+
+PRED16x16(dc, 10, mmxext)
+PRED16x16(dc, 10, sse2)
+PRED16x16(top_dc, 10, mmxext)
+PRED16x16(top_dc, 10, sse2)
+PRED16x16(128_dc, 10, mmxext)
+PRED16x16(128_dc, 10, sse2)
+PRED16x16(left_dc, 10, mmxext)
+PRED16x16(left_dc, 10, sse2)
+PRED16x16(vertical, 10, mmxext)
+PRED16x16(vertical, 10, sse2)
+PRED16x16(horizontal, 10, mmxext)
+PRED16x16(horizontal, 10, sse2)
+
+/* 8-bit versions */
+PRED16x16(vertical, 8, mmx)
+PRED16x16(vertical, 8, sse)
+PRED16x16(horizontal, 8, mmx)
+PRED16x16(horizontal, 8, mmxext)
+PRED16x16(horizontal, 8, ssse3)
+PRED16x16(dc, 8, mmxext)
+PRED16x16(dc, 8, sse2)
+PRED16x16(dc, 8, ssse3)
+PRED16x16(plane_h264, 8, mmx)
+PRED16x16(plane_h264, 8, mmxext)
+PRED16x16(plane_h264, 8, sse2)
+PRED16x16(plane_h264, 8, ssse3)
+PRED16x16(plane_rv40, 8, mmx)
+PRED16x16(plane_rv40, 8, mmxext)
+PRED16x16(plane_rv40, 8, sse2)
+PRED16x16(plane_rv40, 8, ssse3)
+PRED16x16(plane_svq3, 8, mmx)
+PRED16x16(plane_svq3, 8, mmxext)
+PRED16x16(plane_svq3, 8, sse2)
+PRED16x16(plane_svq3, 8, ssse3)
+PRED16x16(tm_vp8, 8, mmx)
+PRED16x16(tm_vp8, 8, mmxext)
+PRED16x16(tm_vp8, 8, sse2)
+PRED16x16(tm_vp8, 8, avx2)
+
+PRED8x8(top_dc, 8, mmxext)
+PRED8x8(dc_rv40, 8, mmxext)
+PRED8x8(dc, 8, mmxext)
+PRED8x8(vertical, 8, mmx)
+PRED8x8(horizontal, 8, mmx)
+PRED8x8(horizontal, 8, mmxext)
+PRED8x8(horizontal, 8, ssse3)
+PRED8x8(plane, 8, mmx)
+PRED8x8(plane, 8, mmxext)
+PRED8x8(plane, 8, sse2)
+PRED8x8(plane, 8, ssse3)
+PRED8x8(tm_vp8, 8, mmx)
+PRED8x8(tm_vp8, 8, mmxext)
+PRED8x8(tm_vp8, 8, sse2)
+PRED8x8(tm_vp8, 8, ssse3)
+
+PRED8x8L(top_dc, 8, mmxext)
+PRED8x8L(top_dc, 8, ssse3)
+PRED8x8L(dc, 8, mmxext)
+PRED8x8L(dc, 8, ssse3)
+PRED8x8L(horizontal, 8, mmxext)
+PRED8x8L(horizontal, 8, ssse3)
+PRED8x8L(vertical, 8, mmxext)
+PRED8x8L(vertical, 8, ssse3)
+PRED8x8L(down_left, 8, mmxext)
+PRED8x8L(down_left, 8, sse2)
+PRED8x8L(down_left, 8, ssse3)
+PRED8x8L(down_right, 8, mmxext)
+PRED8x8L(down_right, 8, sse2)
+PRED8x8L(down_right, 8, ssse3)
+PRED8x8L(vertical_right, 8, mmxext)
+PRED8x8L(vertical_right, 8, sse2)
+PRED8x8L(vertical_right, 8, ssse3)
+PRED8x8L(vertical_left, 8, sse2)
+PRED8x8L(vertical_left, 8, ssse3)
+PRED8x8L(horizontal_up, 8, mmxext)
+PRED8x8L(horizontal_up, 8, ssse3)
+PRED8x8L(horizontal_down, 8, mmxext)
+PRED8x8L(horizontal_down, 8, sse2)
+PRED8x8L(horizontal_down, 8, ssse3)
+
+PRED4x4(dc, 8, mmxext)
+PRED4x4(down_left, 8, mmxext)
+PRED4x4(down_right, 8, mmxext)
+PRED4x4(vertical_left, 8, mmxext)
+PRED4x4(vertical_right, 8, mmxext)
+PRED4x4(horizontal_up, 8, mmxext)
+PRED4x4(horizontal_down, 8, mmxext)
+PRED4x4(tm_vp8, 8, mmx)
+PRED4x4(tm_vp8, 8, mmxext)
+PRED4x4(tm_vp8, 8, ssse3)
+PRED4x4(vertical_vp8, 8, mmxext)
+
+av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
+                                   const int bit_depth,
+                                   const int chroma_format_idc)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (bit_depth == 8) {
+        if (EXTERNAL_MMX(cpu_flags)) {
+            h->pred16x16[VERT_PRED8x8         ] = ff_pred16x16_vertical_8_mmx;
+            h->pred16x16[HOR_PRED8x8          ] = ff_pred16x16_horizontal_8_mmx;
+            if (chroma_format_idc <= 1) {
+                h->pred8x8  [VERT_PRED8x8     ] = ff_pred8x8_vertical_8_mmx;
+                h->pred8x8  [HOR_PRED8x8      ] = ff_pred8x8_horizontal_8_mmx;
+            }
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+                h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_8_mmx;
+                h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_tm_vp8_8_mmx;
+                h->pred4x4  [TM_VP8_PRED      ] = ff_pred4x4_tm_vp8_8_mmx;
+            } else {
+                if (chroma_format_idc <= 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
+                if (codec_id == AV_CODEC_ID_SVQ3) {
+                    if (cpu_flags & AV_CPU_FLAG_CMOV)
+                        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
+                } else if (codec_id == AV_CODEC_ID_RV40) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
+                } else {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
+                }
+            }
+        }
+
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            h->pred16x16[HOR_PRED8x8            ] = ff_pred16x16_horizontal_8_mmxext;
+            h->pred16x16[DC_PRED8x8             ] = ff_pred16x16_dc_8_mmxext;
+            if (chroma_format_idc <= 1)
+                h->pred8x8[HOR_PRED8x8          ] = ff_pred8x8_horizontal_8_mmxext;
+            h->pred8x8l [TOP_DC_PRED            ] = ff_pred8x8l_top_dc_8_mmxext;
+            h->pred8x8l [DC_PRED                ] = ff_pred8x8l_dc_8_mmxext;
+            h->pred8x8l [HOR_PRED               ] = ff_pred8x8l_horizontal_8_mmxext;
+            h->pred8x8l [VERT_PRED              ] = ff_pred8x8l_vertical_8_mmxext;
+            h->pred8x8l [DIAG_DOWN_RIGHT_PRED   ] = ff_pred8x8l_down_right_8_mmxext;
+            h->pred8x8l [VERT_RIGHT_PRED        ] = ff_pred8x8l_vertical_right_8_mmxext;
+            h->pred8x8l [HOR_UP_PRED            ] = ff_pred8x8l_horizontal_up_8_mmxext;
+            h->pred8x8l [DIAG_DOWN_LEFT_PRED    ] = ff_pred8x8l_down_left_8_mmxext;
+            h->pred8x8l [HOR_DOWN_PRED          ] = ff_pred8x8l_horizontal_down_8_mmxext;
+            h->pred4x4  [DIAG_DOWN_RIGHT_PRED   ] = ff_pred4x4_down_right_8_mmxext;
+            h->pred4x4  [VERT_RIGHT_PRED        ] = ff_pred4x4_vertical_right_8_mmxext;
+            h->pred4x4  [HOR_DOWN_PRED          ] = ff_pred4x4_horizontal_down_8_mmxext;
+            h->pred4x4  [DC_PRED                ] = ff_pred4x4_dc_8_mmxext;
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 ||
+                codec_id == AV_CODEC_ID_H264) {
+                h->pred4x4  [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
+            }
+            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+                h->pred4x4  [VERT_LEFT_PRED     ] = ff_pred4x4_vertical_left_8_mmxext;
+            }
+            if (codec_id != AV_CODEC_ID_RV40) {
+                h->pred4x4  [HOR_UP_PRED        ] = ff_pred4x4_horizontal_up_8_mmxext;
+            }
+            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+                if (chroma_format_idc <= 1) {
+                    h->pred8x8[TOP_DC_PRED8x8   ] = ff_pred8x8_top_dc_8_mmxext;
+                    h->pred8x8[DC_PRED8x8       ] = ff_pred8x8_dc_8_mmxext;
+                }
+            }
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+                h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_tm_vp8_8_mmxext;
+                h->pred8x8  [DC_PRED8x8         ] = ff_pred8x8_dc_rv40_8_mmxext;
+                h->pred8x8  [PLANE_PRED8x8      ] = ff_pred8x8_tm_vp8_8_mmxext;
+                h->pred4x4  [TM_VP8_PRED        ] = ff_pred4x4_tm_vp8_8_mmxext;
+                h->pred4x4  [VERT_PRED          ] = ff_pred4x4_vertical_vp8_8_mmxext;
+            } else {
+                if (chroma_format_idc <= 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
+                if (codec_id == AV_CODEC_ID_SVQ3) {
+                    h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_svq3_8_mmxext;
+                } else if (codec_id == AV_CODEC_ID_RV40) {
+                    h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_rv40_8_mmxext;
+                } else {
+                    h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_h264_8_mmxext;
+                }
+            }
+        }
+
+        if (EXTERNAL_SSE(cpu_flags)) {
+            h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
+        }
+
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            h->pred16x16[DC_PRED8x8           ] = ff_pred16x16_dc_8_sse2;
+            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_sse2;
+            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
+            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_sse2;
+            h->pred8x8l [VERT_LEFT_PRED       ] = ff_pred8x8l_vertical_left_8_sse2;
+            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_sse2;
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+                h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_8_sse2;
+                h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_tm_vp8_8_sse2;
+            } else {
+                if (chroma_format_idc <= 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
+                if (codec_id == AV_CODEC_ID_SVQ3) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
+                } else if (codec_id == AV_CODEC_ID_RV40) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
+                } else {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
+                }
+            }
+        }
+
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            h->pred16x16[HOR_PRED8x8          ] = ff_pred16x16_horizontal_8_ssse3;
+            h->pred16x16[DC_PRED8x8           ] = ff_pred16x16_dc_8_ssse3;
+            if (chroma_format_idc <= 1)
+                h->pred8x8  [HOR_PRED8x8      ] = ff_pred8x8_horizontal_8_ssse3;
+            h->pred8x8l [TOP_DC_PRED          ] = ff_pred8x8l_top_dc_8_ssse3;
+            h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_8_ssse3;
+            h->pred8x8l [HOR_PRED             ] = ff_pred8x8l_horizontal_8_ssse3;
+            h->pred8x8l [VERT_PRED            ] = ff_pred8x8l_vertical_8_ssse3;
+            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_ssse3;
+            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
+            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_ssse3;
+            h->pred8x8l [VERT_LEFT_PRED       ] = ff_pred8x8l_vertical_left_8_ssse3;
+            h->pred8x8l [HOR_UP_PRED          ] = ff_pred8x8l_horizontal_up_8_ssse3;
+            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_ssse3;
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+                h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_tm_vp8_8_ssse3;
+                h->pred4x4  [TM_VP8_PRED      ] = ff_pred4x4_tm_vp8_8_ssse3;
+            } else {
+                if (chroma_format_idc <= 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
+                if (codec_id == AV_CODEC_ID_SVQ3) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
+                } else if (codec_id == AV_CODEC_ID_RV40) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
+                } else {
+                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
+                }
+            }
+        }
+
+        if(EXTERNAL_AVX2(cpu_flags)){
+            if (codec_id == AV_CODEC_ID_VP8) {
+                h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_8_avx2;
+            }
+        }
+    } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
+            h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;
+
+            if (chroma_format_idc <= 1)
+                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_mmxext;
+
+            h->pred8x8l[DC_128_PRED        ] = ff_pred8x8l_128_dc_10_mmxext;
+
+            h->pred16x16[DC_PRED8x8        ] = ff_pred16x16_dc_10_mmxext;
+            h->pred16x16[TOP_DC_PRED8x8    ] = ff_pred16x16_top_dc_10_mmxext;
+            h->pred16x16[DC_128_PRED8x8    ] = ff_pred16x16_128_dc_10_mmxext;
+            h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_mmxext;
+            h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_mmxext;
+            h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_mmxext;
+        }
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
+            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
+            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_sse2;
+            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_sse2;
+            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_sse2;
+
+            if (chroma_format_idc <= 1) {
+                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_sse2;
+                h->pred8x8[TOP_DC_PRED8x8  ] = ff_pred8x8_top_dc_10_sse2;
+                h->pred8x8[PLANE_PRED8x8   ] = ff_pred8x8_plane_10_sse2;
+                h->pred8x8[VERT_PRED8x8    ] = ff_pred8x8_vertical_10_sse2;
+                h->pred8x8[HOR_PRED8x8     ] = ff_pred8x8_horizontal_10_sse2;
+            }
+
+            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_sse2;
+            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_sse2;
+            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_sse2;
+            h->pred8x8l[DC_128_PRED         ] = ff_pred8x8l_128_dc_10_sse2;
+            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_sse2;
+            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
+            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
+            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_sse2;
+            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_sse2;
+
+            h->pred16x16[DC_PRED8x8        ] = ff_pred16x16_dc_10_sse2;
+            h->pred16x16[TOP_DC_PRED8x8    ] = ff_pred16x16_top_dc_10_sse2;
+            h->pred16x16[DC_128_PRED8x8    ] = ff_pred16x16_128_dc_10_sse2;
+            h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_sse2;
+            h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_sse2;
+            h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_sse2;
+        }
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
+            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_ssse3;
+            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_ssse3;
+
+            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_ssse3;
+            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
+            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
+            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_ssse3;
+            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_ssse3;
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
+            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
+            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_avx;
+            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_avx;
+            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_avx;
+
+            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_avx;
+            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_avx;
+            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_avx;
+            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_avx;
+            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
+            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
+            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_avx;
+            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_avx;
+        }
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/h264_qpel.c
+++ b/externals/ffmpeg/libavcodec/x86/h264_qpel.c
@@ -0,0 +1,634 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ * Copyright (c) 2011 Daniel Kang
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/h264dec.h"
+#include "libavcodec/h264qpel.h"
+#include "libavcodec/pixels.h"
+#include "fpel.h"
+
+#if HAVE_X86ASM
+void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                              int dstStride, int src1Stride, int h);
+void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                              int dstStride, int src1Stride, int h);
+void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                              int dstStride, int src1Stride, int h);
+void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                              int dstStride, int src1Stride, int h);
+void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                               int dstStride, int src1Stride, int h);
+void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                               int dstStride, int src1Stride, int h);
+#define ff_put_pixels8_l2_sse2  ff_put_pixels8_l2_mmxext
+#define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
+#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
+#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
+#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx
+
+#define DEF_QPEL(OPNAME)\
+void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
+void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
+void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);
+
+DEF_QPEL(avg)
+DEF_QPEL(put)
+
+static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_mmxext(int16_t *tmp, const uint8_t *src, int tmpStride, int srcStride, int size)
+{
+    int w = (size + 8) >> 2;
+    src -= 2 * srcStride + 2;
+    while (w--) {
+        ff_put_h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);
+        tmp += 4;
+        src += 4;
+    }
+}
+
+#define QPEL_H264(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    int w=3;\
+    src -= 2*srcStride+2;\
+    while(w--){\
+        ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
+        tmp += 4;\
+        src += 4;\
+    }\
+    tmp -= 3*4;\
+    ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
+    src -= 2*srcStride;\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
+    src += 4;\
+    dst += 4;\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+    int w = size>>4;\
+    do{\
+    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
+    tmp += 8;\
+    dst += 8;\
+    }while(w--);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+    src += 8*dstStride;\
+    dst += 8*dstStride;\
+    src2 += 8*src2Stride;\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
+    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
+    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
+}\
+
+
+#if ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+
+void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
+void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
+
+#else // ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+    src += 8*dstStride;\
+    dst += 8*dstStride;\
+    src2 += 8*src2Stride;\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}
+#endif // ARCH_X86_64
+
+#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
+QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}
+
+static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
+                                                                 const uint8_t *src,
+                                                                 int tmpStride,
+                                                                 int srcStride,
+                                                                 int size)
+{
+    int w = (size+8)>>3;
+    src -= 2*srcStride+2;
+    while(w--){
+        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
+        tmp += 8;
+        src += 8;
+    }
+}
+
+#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
+    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
+}\
+
+#define ff_put_h264_qpel8_h_lowpass_l2_sse2  ff_put_h264_qpel8_h_lowpass_l2_mmxext
+#define ff_avg_h264_qpel8_h_lowpass_l2_sse2  ff_avg_h264_qpel8_h_lowpass_l2_mmxext
+#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
+#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
+
+#define ff_put_h264_qpel8_v_lowpass_ssse3  ff_put_h264_qpel8_v_lowpass_sse2
+#define ff_avg_h264_qpel8_v_lowpass_ssse3  ff_avg_h264_qpel8_v_lowpass_sse2
+#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
+#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
+
+#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
+#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
+
+#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+
+static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
+#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
+
+#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
+}\
+
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
+}\
+
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
+    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
+    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
+}\
+
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
+    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
+    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
+    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
+    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    av_assert2(((int)temp & 7) == 0);\
+    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    av_assert2(((int)temp & 7) == 0);\
+    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    av_assert2(((int)temp & 7) == 0);\
+    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
+{\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    av_assert2(((int)temp & 7) == 0);\
+    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
+}\
+
+#define H264_MC_4816(MMX)\
+H264_MC(put_, 4, MMX, 8)\
+H264_MC(put_, 8, MMX, 8)\
+H264_MC(put_, 16,MMX, 8)\
+H264_MC(avg_, 4, MMX, 8)\
+H264_MC(avg_, 8, MMX, 8)\
+H264_MC(avg_, 16,MMX, 8)\
+
+#define H264_MC_816(QPEL, XMM)\
+QPEL(put_, 8, XMM, 16)\
+QPEL(put_, 16,XMM, 16)\
+QPEL(avg_, 8, XMM, 16)\
+QPEL(avg_, 16,XMM, 16)\
+
+QPEL_H264(put_,        PUT_OP, mmxext)
+QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
+QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
+QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
+QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
+QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
+QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
+QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
+
+H264_MC_4816(mmxext)
+H264_MC_816(H264_MC_V, sse2)
+H264_MC_816(H264_MC_HV, sse2)
+H264_MC_816(H264_MC_H, ssse3)
+H264_MC_816(H264_MC_HV, ssse3)
+
+
+//10bit
+#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
+void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
+
+#define LUMA_MC_816(DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
+    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
+
+LUMA_MC_ALL(10, mc00, mmxext)
+LUMA_MC_ALL(10, mc10, mmxext)
+LUMA_MC_ALL(10, mc20, mmxext)
+LUMA_MC_ALL(10, mc30, mmxext)
+LUMA_MC_ALL(10, mc01, mmxext)
+LUMA_MC_ALL(10, mc11, mmxext)
+LUMA_MC_ALL(10, mc21, mmxext)
+LUMA_MC_ALL(10, mc31, mmxext)
+LUMA_MC_ALL(10, mc02, mmxext)
+LUMA_MC_ALL(10, mc12, mmxext)
+LUMA_MC_ALL(10, mc22, mmxext)
+LUMA_MC_ALL(10, mc32, mmxext)
+LUMA_MC_ALL(10, mc03, mmxext)
+LUMA_MC_ALL(10, mc13, mmxext)
+LUMA_MC_ALL(10, mc23, mmxext)
+LUMA_MC_ALL(10, mc33, mmxext)
+
+LUMA_MC_816(10, mc00, sse2)
+LUMA_MC_816(10, mc10, sse2)
+LUMA_MC_816(10, mc10, sse2_cache64)
+LUMA_MC_816(10, mc10, ssse3_cache64)
+LUMA_MC_816(10, mc20, sse2)
+LUMA_MC_816(10, mc20, sse2_cache64)
+LUMA_MC_816(10, mc20, ssse3_cache64)
+LUMA_MC_816(10, mc30, sse2)
+LUMA_MC_816(10, mc30, sse2_cache64)
+LUMA_MC_816(10, mc30, ssse3_cache64)
+LUMA_MC_816(10, mc01, sse2)
+LUMA_MC_816(10, mc11, sse2)
+LUMA_MC_816(10, mc21, sse2)
+LUMA_MC_816(10, mc31, sse2)
+LUMA_MC_816(10, mc02, sse2)
+LUMA_MC_816(10, mc12, sse2)
+LUMA_MC_816(10, mc22, sse2)
+LUMA_MC_816(10, mc32, sse2)
+LUMA_MC_816(10, mc03, sse2)
+LUMA_MC_816(10, mc13, sse2)
+LUMA_MC_816(10, mc23, sse2)
+LUMA_MC_816(10, mc33, sse2)
+
+#define QPEL16_OPMC(OP, MC, MMX)\
+void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride){\
+    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
+    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
+    src += 8*stride;\
+    dst += 8*stride;\
+    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
+    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
+}
+
+#define QPEL16_OP(MC, MMX)\
+QPEL16_OPMC(put, MC, MMX)\
+QPEL16_OPMC(avg, MC, MMX)
+
+#define QPEL16(MMX)\
+QPEL16_OP(mc00, MMX)\
+QPEL16_OP(mc01, MMX)\
+QPEL16_OP(mc02, MMX)\
+QPEL16_OP(mc03, MMX)\
+QPEL16_OP(mc10, MMX)\
+QPEL16_OP(mc11, MMX)\
+QPEL16_OP(mc12, MMX)\
+QPEL16_OP(mc13, MMX)\
+QPEL16_OP(mc20, MMX)\
+QPEL16_OP(mc21, MMX)\
+QPEL16_OP(mc22, MMX)\
+QPEL16_OP(mc23, MMX)\
+QPEL16_OP(mc30, MMX)\
+QPEL16_OP(mc31, MMX)\
+QPEL16_OP(mc32, MMX)\
+QPEL16_OP(mc33, MMX)
+
+#if ARCH_X86_32 // ARCH_X86_64 implies SSE2+
+QPEL16(mmxext)
+#endif
+
+#endif /* HAVE_X86ASM */
+
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
+    do {                                                                     \
+    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
+    } while (0)
+
+#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
+    do {                                                                                      \
+        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
+        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
+    } while (0)
+
+#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
+    do {                                                                                            \
+        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
+        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
+    } while (0)
+
+av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
+{
+#if HAVE_X86ASM
+    int high_bit_depth = bit_depth > 8;
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        if (!high_bit_depth) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
+            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
+            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
+        } else if (bit_depth == 10) {
+#if ARCH_X86_32
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
+#endif
+            SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4,  10_mmxext, ff_);
+        }
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        if (!high_bit_depth) {
+            H264_QPEL_FUNCS(0, 1, sse2);
+            H264_QPEL_FUNCS(0, 2, sse2);
+            H264_QPEL_FUNCS(0, 3, sse2);
+            H264_QPEL_FUNCS(1, 1, sse2);
+            H264_QPEL_FUNCS(1, 2, sse2);
+            H264_QPEL_FUNCS(1, 3, sse2);
+            H264_QPEL_FUNCS(2, 1, sse2);
+            H264_QPEL_FUNCS(2, 2, sse2);
+            H264_QPEL_FUNCS(2, 3, sse2);
+            H264_QPEL_FUNCS(3, 1, sse2);
+            H264_QPEL_FUNCS(3, 2, sse2);
+            H264_QPEL_FUNCS(3, 3, sse2);
+        }
+
+        if (bit_depth == 10) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
+            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
+            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
+            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
+        }
+    }
+
+    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
+        if (!high_bit_depth) {
+            H264_QPEL_FUNCS(0, 0, sse2);
+        }
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        if (!high_bit_depth) {
+            H264_QPEL_FUNCS(1, 0, ssse3);
+            H264_QPEL_FUNCS(1, 1, ssse3);
+            H264_QPEL_FUNCS(1, 2, ssse3);
+            H264_QPEL_FUNCS(1, 3, ssse3);
+            H264_QPEL_FUNCS(2, 0, ssse3);
+            H264_QPEL_FUNCS(2, 1, ssse3);
+            H264_QPEL_FUNCS(2, 2, ssse3);
+            H264_QPEL_FUNCS(2, 3, ssse3);
+            H264_QPEL_FUNCS(3, 0, ssse3);
+            H264_QPEL_FUNCS(3, 1, ssse3);
+            H264_QPEL_FUNCS(3, 2, ssse3);
+            H264_QPEL_FUNCS(3, 3, ssse3);
+        }
+
+        if (bit_depth == 10) {
+            H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
+            H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
+            H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
+        }
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        /* AVX implies 64 byte cache lines without the need to avoid unaligned
+         * memory accesses that cross the boundary between two cache lines.
+         * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
+         * having to treat SSE2 functions with such properties as AVX. */
+        if (bit_depth == 10) {
+            H264_QPEL_FUNCS_10(1, 0, sse2);
+            H264_QPEL_FUNCS_10(2, 0, sse2);
+            H264_QPEL_FUNCS_10(3, 0, sse2);
+        }
+    }
+#endif
+}
--- a/externals/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
@@ -0,0 +1,884 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
+;*****************************************************************************
+;* Copyright (C) 2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pd_65535
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pw_16
+cextern pw_1
+cextern pb_0
+
+pad10: times 8 dw 10*1023
+pad20: times 8 dw 20*1023
+pad30: times 8 dw 30*1023
+depad: times 4 dd 32*20*1023 + 512
+depad2: times 8 dw 20*1023 + 16*1022 + 16
+unpad: times 8 dw 16*1022/32 ; needs to be mod 16
+
+tap1: times 4 dw  1, -5
+tap2: times 4 dw 20, 20
+tap3: times 4 dw -5,  1
+
+SECTION .text
+
+
+%macro AVG_MOV 2
+    pavgw %2, %1
+    mova  %1, %2
+%endmacro
+
+%macro ADDW 3
+%if mmsize == 8
+    paddw %1, %2
+%else
+    movu  %3, %2
+    paddw %1, %3
+%endif
+%endmacro
+
+%macro FILT_H 4
+    paddw  %1, %4
+    psubw  %1, %2  ; a-b
+    psraw  %1, 2   ; (a-b)/4
+    psubw  %1, %2  ; (a-b)/4-b
+    paddw  %1, %3  ; (a-b)/4-b+c
+    psraw  %1, 2   ; ((a-b)/4-b+c)/4
+    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+%endmacro
+
+%macro PRELOAD_V 0
+    lea      r3, [r2*3]
+    sub      r1, r3
+    movu     m0, [r1+r2]
+    movu     m1, [r1+r2*2]
+    add      r1, r3
+    movu     m2, [r1]
+    movu     m3, [r1+r2]
+    movu     m4, [r1+r2*2]
+    add      r1, r3
+%endmacro
+
+%macro FILT_V 8
+    movu     %6, [r1]
+    paddw    %1, %6
+    mova     %7, %2
+    paddw    %7, %5
+    mova     %8, %3
+    paddw    %8, %4
+    FILT_H   %1, %7, %8, [pw_16]
+    psraw    %1, 1
+    CLIPW    %1, [pb_0], [pw_pixel_max]
+%endmacro
+
+%macro MC 1
+%define OP_MOV mova
+INIT_MMX mmxext
+%1 put, 4
+INIT_XMM sse2
+%1 put, 8
+
+%define OP_MOV AVG_MOV
+INIT_MMX mmxext
+%1 avg, 4
+INIT_XMM sse2
+%1 avg, 8
+%endmacro
+
+%macro MCAxA_OP 7
+%if ARCH_X86_32
+cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    mov  r0, r0m
+    mov  r1, r1m
+    add  r0, %3*2
+    add  r1, %3*2
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    mov  r0, r0m
+    mov  r1, r1m
+    lea  r0, [r0+r2*%3]
+    lea  r1, [r1+r2*%3]
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    mov  r0, r0m
+    mov  r1, r1m
+    lea  r0, [r0+r2*%3+%3*2]
+    lea  r1, [r1+r2*%3+%3*2]
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    RET
+%else ; ARCH_X86_64
+cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
+    mov r%6, r0
+%assign p1 %6+1
+    mov r %+ p1, r1
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    lea  r0, [r%6+%3*2]
+    lea  r1, [r %+ p1+%3*2]
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    lea  r0, [r%6+r2*%3]
+    lea  r1, [r %+ p1+r2*%3]
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    lea  r0, [r%6+r2*%3+%3*2]
+    lea  r1, [r %+ p1+r2*%3+%3*2]
+%if UNIX64 == 0 ; fall through to function
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    RET
+%endif
+%endif
+%endmacro
+
+;cpu, put/avg, mc, 4/8, ...
+%macro cglobal_mc 6
+%assign i %3*2
+%if ARCH_X86_32 || cpuflag(sse2)
+MCAxA_OP %1, %2, %3, i, %4,%5,%6
+%endif
+
+cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
+%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
+    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+    RET
+%endif
+
+stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro COPY4 0
+    movu          m0, [r1     ]
+    OP_MOV [r0     ], m0
+    movu          m0, [r1+r2  ]
+    OP_MOV [r0+r2  ], m0
+    movu          m0, [r1+r2*2]
+    OP_MOV [r0+r2*2], m0
+    movu          m0, [r1+r3  ]
+    OP_MOV [r0+r3  ], m0
+%endmacro
+
+%macro MC00 1
+INIT_MMX mmxext
+cglobal_mc %1, mc00, 4, 3,4,0
+    lea           r3, [r2*3]
+    COPY4
+    ret
+
+INIT_XMM sse2
+cglobal %1_h264_qpel8_mc00_10, 3,4
+    lea  r3, [r2*3]
+    COPY4
+    lea  r0, [r0+r2*4]
+    lea  r1, [r1+r2*4]
+    COPY4
+    RET
+
+cglobal %1_h264_qpel16_mc00_10, 3,4
+    mov r3d, 8
+.loop:
+    movu           m0, [r1      ]
+    movu           m1, [r1   +16]
+    OP_MOV [r0      ], m0
+    OP_MOV [r0   +16], m1
+    movu           m0, [r1+r2   ]
+    movu           m1, [r1+r2+16]
+    OP_MOV [r0+r2   ], m0
+    OP_MOV [r0+r2+16], m1
+    lea            r0, [r0+r2*2]
+    lea            r1, [r1+r2*2]
+    dec r3d
+    jg .loop
+    REP_RET
+%endmacro
+
+%define OP_MOV mova
+MC00 put
+
+%define OP_MOV AVG_MOV
+MC00 avg
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC_CACHE 1
+%define OP_MOV mova
+INIT_MMX mmxext
+%1 put, 4
+INIT_XMM sse2, cache64
+%1 put, 8
+INIT_XMM ssse3, cache64
+%1 put, 8
+INIT_XMM sse2
+%1 put, 8
+
+%define OP_MOV AVG_MOV
+INIT_MMX mmxext
+%1 avg, 4
+INIT_XMM sse2, cache64
+%1 avg, 8
+INIT_XMM ssse3, cache64
+%1 avg, 8
+INIT_XMM sse2
+%1 avg, 8
+%endmacro
+
+%macro MC20 2
+cglobal_mc %1, mc20, %2, 3,4,9
+    mov     r3d, %2
+    mova     m1, [pw_pixel_max]
+%if num_mmregs > 8
+    mova     m8, [pw_16]
+    %define p16 m8
+%else
+    %define p16 [pw_16]
+%endif
+.nextrow:
+%if %0 == 4
+    movu     m2, [r1-4]
+    movu     m3, [r1-2]
+    movu     m4, [r1+0]
+    ADDW     m2, [r1+6], m5
+    ADDW     m3, [r1+4], m5
+    ADDW     m4, [r1+2], m5
+%else ; movu is slow on these processors
+%if mmsize==16
+    movu     m2, [r1-4]
+    movu     m0, [r1+6]
+    mova     m6, m0
+    psrldq   m0, 6
+
+    paddw    m6, m2
+    PALIGNR  m3, m0, m2, 2, m5
+    PALIGNR  m7, m0, m2, 8, m5
+    paddw    m3, m7
+    PALIGNR  m4, m0, m2, 4, m5
+    PALIGNR  m7, m0, m2, 6, m5
+    paddw    m4, m7
+    SWAP      2, 6
+%else
+    movu     m2, [r1-4]
+    movu     m6, [r1+4]
+    PALIGNR  m3, m6, m2, 2, m5
+    paddw    m3, m6
+    PALIGNR  m4, m6, m2, 4, m5
+    PALIGNR  m7, m6, m2, 6, m5
+    paddw    m4, m7
+    paddw    m2, [r1+6]
+%endif
+%endif
+
+    FILT_H   m2, m3, m4, p16
+    psraw    m2, 1
+    pxor     m0, m0
+    CLIPW    m2, m0, m1
+    OP_MOV [r0], m2
+    add      r0, r2
+    add      r1, r2
+    dec     r3d
+    jg .nextrow
+    rep ret
+%endmacro
+
+MC_CACHE MC20
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC30 2
+cglobal_mc %1, mc30, %2, 3,5,9
+    lea r4, [r1+2]
+    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC_CACHE MC30
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC10 2
+cglobal_mc %1, mc10, %2, 3,5,9
+    mov      r4, r1
+.body:
+    mov     r3d, %2
+    mova     m1, [pw_pixel_max]
+%if num_mmregs > 8
+    mova     m8, [pw_16]
+    %define p16 m8
+%else
+    %define p16 [pw_16]
+%endif
+.nextrow:
+%if %0 == 4
+    movu     m2, [r1-4]
+    movu     m3, [r1-2]
+    movu     m4, [r1+0]
+    ADDW     m2, [r1+6], m5
+    ADDW     m3, [r1+4], m5
+    ADDW     m4, [r1+2], m5
+%else ; movu is slow on these processors
+%if mmsize==16
+    movu     m2, [r1-4]
+    movu     m0, [r1+6]
+    mova     m6, m0
+    psrldq   m0, 6
+
+    paddw    m6, m2
+    PALIGNR  m3, m0, m2, 2, m5
+    PALIGNR  m7, m0, m2, 8, m5
+    paddw    m3, m7
+    PALIGNR  m4, m0, m2, 4, m5
+    PALIGNR  m7, m0, m2, 6, m5
+    paddw    m4, m7
+    SWAP      2, 6
+%else
+    movu     m2, [r1-4]
+    movu     m6, [r1+4]
+    PALIGNR  m3, m6, m2, 2, m5
+    paddw    m3, m6
+    PALIGNR  m4, m6, m2, 4, m5
+    PALIGNR  m7, m6, m2, 6, m5
+    paddw    m4, m7
+    paddw    m2, [r1+6]
+%endif
+%endif
+
+    FILT_H   m2, m3, m4, p16
+    psraw    m2, 1
+    pxor     m0, m0
+    CLIPW    m2, m0, m1
+    movu     m3, [r4]
+    pavgw    m2, m3
+    OP_MOV [r0], m2
+    add      r0, r2
+    add      r1, r2
+    add      r4, r2
+    dec     r3d
+    jg .nextrow
+    rep ret
+%endmacro
+
+MC_CACHE MC10
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro V_FILT 10
+v_filt%9_%10_10:
+    add    r4, r2
+.no_addr4:
+    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
+    add    r1, r2
+    add    r0, r2
+    ret
+%endmacro
+
+INIT_MMX mmxext
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 4
+V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+
+INIT_XMM sse2
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 6
+V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+
+%macro MC02 2
+cglobal_mc %1, mc02, %2, 3,4,8
+    PRELOAD_V
+
+    sub      r0, r2
+%assign j 0
+%rep %2
+    %assign i (j % 6)
+    call v_filt%2_ %+ i %+ _10.no_addr4
+    OP_MOV [r0], m0
+    SWAP 0,1,2,3,4,5
+    %assign j j+1
+%endrep
+    ret
+%endmacro
+
+MC MC02
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC01 2
+cglobal_mc %1, mc01, %2, 3,5,8
+    mov      r4, r1
+.body:
+    PRELOAD_V
+
+    sub      r4, r2
+    sub      r0, r2
+%assign j 0
+%rep %2
+    %assign i (j % 6)
+    call v_filt%2_ %+ i %+ _10
+    movu     m7, [r4]
+    pavgw    m0, m7
+    OP_MOV [r0], m0
+    SWAP 0,1,2,3,4,5
+    %assign j j+1
+%endrep
+    ret
+%endmacro
+
+MC MC01
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC03 2
+cglobal_mc %1, mc03, %2, 3,5,8
+    lea r4, [r1+r2]
+    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC03
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro H_FILT_AVG 2-3
+h_filt%1_%2_10:
+;FILT_H with fewer registers and averaged with the FILT_V result
+;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
+;unfortunately I need three registers, so m5 will have to be re-read from memory
+    movu     m5, [r4-4]
+    ADDW     m5, [r4+6], m7
+    movu     m6, [r4-2]
+    ADDW     m6, [r4+4], m7
+    paddw    m5, [pw_16]
+    psubw    m5, m6  ; a-b
+    psraw    m5, 2   ; (a-b)/4
+    psubw    m5, m6  ; (a-b)/4-b
+    movu     m6, [r4+0]
+    ADDW     m6, [r4+2], m7
+    paddw    m5, m6  ; (a-b)/4-b+c
+    psraw    m5, 2   ; ((a-b)/4-b+c)/4
+    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    psraw    m5, 1
+    CLIPW    m5, [pb_0], [pw_pixel_max]
+;avg FILT_V, FILT_H
+    pavgw    m0, m5
+%if %0!=4
+    movu     m5, [r1+r5]
+%endif
+    ret
+%endmacro
+
+INIT_MMX mmxext
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 3
+H_FILT_AVG 4, i
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+H_FILT_AVG 4, i, 0
+
+INIT_XMM sse2
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 6
+%if i==1
+H_FILT_AVG 8, i, 0
+%else
+H_FILT_AVG 8, i
+%endif
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+
+%macro MC11 2
+; this REALLY needs x86_64
+cglobal_mc %1, mc11, %2, 3,6,8
+    mov      r4, r1
+.body:
+    PRELOAD_V
+
+    sub      r0, r2
+    sub      r4, r2
+    mov      r5, r2
+    neg      r5
+%assign j 0
+%rep %2
+    %assign i (j % 6)
+    call v_filt%2_ %+ i %+ _10
+    call h_filt%2_ %+ i %+ _10
+%if %2==8 && i==1
+    movu     m5, [r1+r5]
+%endif
+    OP_MOV [r0], m0
+    SWAP 0,1,2,3,4,5
+    %assign j j+1
+%endrep
+    ret
+%endmacro
+
+MC MC11
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC31 2
+cglobal_mc %1, mc31, %2, 3,6,8
+    mov r4, r1
+    add r1, 2
+    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC31
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC13 2
+cglobal_mc %1, mc13, %2, 3,7,12
+    lea r4, [r1+r2]
+    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC13
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC33 2
+cglobal_mc %1, mc33, %2, 3,6,8
+    lea r4, [r1+r2]
+    add r1, 2
+    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC33
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro FILT_H2 3
+    psubw  %1, %2  ; a-b
+    psubw  %2, %3  ; b-c
+    psllw  %2, 2
+    psubw  %1, %2  ; a-5*b+4*c
+    psllw  %3, 4
+    paddw  %1, %3  ; a-5*b+20*c
+%endmacro
+
+%macro FILT_VNRD 8
+    movu     %6, [r1]
+    paddw    %1, %6
+    mova     %7, %2
+    paddw    %7, %5
+    mova     %8, %3
+    paddw    %8, %4
+    FILT_H2  %1, %7, %8
+%endmacro
+
+%macro HV 1
+%if mmsize==16
+%define PAD 12
+%define COUNT 2
+%else
+%define PAD 4
+%define COUNT 3
+%endif
+put_hv%1_10:
+    neg      r2           ; This actually saves instructions
+    lea      r1, [r1+r2*2-mmsize+PAD]
+    lea      r4, [rsp+PAD+gprsize]
+    mov     r3d, COUNT
+.v_loop:
+    movu     m0, [r1]
+    sub      r1, r2
+    movu     m1, [r1]
+    sub      r1, r2
+    movu     m2, [r1]
+    sub      r1, r2
+    movu     m3, [r1]
+    sub      r1, r2
+    movu     m4, [r1]
+    sub      r1, r2
+%assign i 0
+%rep %1-1
+    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
+    psubw    m0, [pad20]
+    movu     [r4+i*mmsize*3], m0
+    sub      r1, r2
+    SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
+    psubw    m0, [pad20]
+    movu     [r4+i*mmsize*3], m0
+    add      r4, mmsize
+    lea      r1, [r1+r2*8+mmsize]
+%if %1==8
+    lea      r1, [r1+r2*4]
+%endif
+    dec      r3d
+    jg .v_loop
+    neg      r2
+    ret
+%endmacro
+
+INIT_MMX mmxext
+HV 4
+INIT_XMM sse2
+HV 8
+
+%macro H_LOOP 1
+%if num_mmregs > 8
+    %define s1 m8
+    %define s2 m9
+    %define s3 m10
+    %define d1 m11
+%else
+    %define s1 [tap1]
+    %define s2 [tap2]
+    %define s3 [tap3]
+    %define d1 [depad]
+%endif
+h%1_loop_op:
+    movu       m1, [r1+mmsize-4]
+    movu       m2, [r1+mmsize-2]
+    mova       m3, [r1+mmsize+0]
+    movu       m4, [r1+mmsize+2]
+    movu       m5, [r1+mmsize+4]
+    movu       m6, [r1+mmsize+6]
+%if num_mmregs > 8
+    pmaddwd    m1, s1
+    pmaddwd    m2, s1
+    pmaddwd    m3, s2
+    pmaddwd    m4, s2
+    pmaddwd    m5, s3
+    pmaddwd    m6, s3
+    paddd      m1, d1
+    paddd      m2, d1
+%else
+    mova       m0, s1
+    pmaddwd    m1, m0
+    pmaddwd    m2, m0
+    mova       m0, s2
+    pmaddwd    m3, m0
+    pmaddwd    m4, m0
+    mova       m0, s3
+    pmaddwd    m5, m0
+    pmaddwd    m6, m0
+    mova       m0, d1
+    paddd      m1, m0
+    paddd      m2, m0
+%endif
+    paddd      m3, m5
+    paddd      m4, m6
+    paddd      m1, m3
+    paddd      m2, m4
+    psrad      m1, 10
+    psrad      m2, 10
+    pslld      m2, 16
+    pand       m1, [pd_65535]
+    por        m1, m2
+%if num_mmregs <= 8
+    pxor       m0, m0
+%endif
+    CLIPW      m1, m0, m7
+    add        r1, mmsize*3
+    ret
+%endmacro
+
+INIT_MMX mmxext
+H_LOOP 4
+INIT_XMM sse2
+H_LOOP 8
+
+%macro MC22 2
+cglobal_mc %1, mc22, %2, 3,7,12
+%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
+    mov      r6, rsp          ; backup stack pointer
+    and     rsp, ~(mmsize-1)  ; align stack
+    sub     rsp, PAD
+
+    call put_hv%2_10
+
+    mov       r3d, %2
+    mova       m7, [pw_pixel_max]
+%if num_mmregs > 8
+    pxor       m0, m0
+    mova       m8, [tap1]
+    mova       m9, [tap2]
+    mova      m10, [tap3]
+    mova      m11, [depad]
+%endif
+    mov        r1, rsp
+.h_loop:
+    call h%2_loop_op
+
+    OP_MOV   [r0], m1
+    add        r0, r2
+    dec       r3d
+    jg .h_loop
+
+    mov     rsp, r6          ; restore stack pointer
+    ret
+%endmacro
+
+MC MC22
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC12 2
+cglobal_mc %1, mc12, %2, 3,7,12
+%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
+    mov        r6, rsp          ; backup stack pointer
+    and       rsp, ~(mmsize-1)  ; align stack
+    sub       rsp, PAD
+
+    call put_hv%2_10
+
+    xor       r4d, r4d
+.body:
+    mov       r3d, %2
+    pxor       m0, m0
+    mova       m7, [pw_pixel_max]
+%if num_mmregs > 8
+    mova       m8, [tap1]
+    mova       m9, [tap2]
+    mova      m10, [tap3]
+    mova      m11, [depad]
+%endif
+    mov        r1, rsp
+.h_loop:
+    call h%2_loop_op
+
+    movu       m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
+    paddw      m3, [depad2]
+    psrlw      m3, 5
+    psubw      m3, [unpad]
+    CLIPW      m3, m0, m7
+    pavgw      m1, m3
+
+    OP_MOV   [r0], m1
+    add        r0, r2
+    dec       r3d
+    jg .h_loop
+
+    mov     rsp, r6          ; restore stack pointer
+    ret
+%endmacro
+
+MC MC12
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC32 2
+cglobal_mc %1, mc32, %2, 3,7,12
+%define PAD mmsize*8*3*2  ; SIZE*16*4*sizeof(pixel)
+    mov  r6, rsp          ; backup stack pointer
+    and rsp, ~(mmsize-1)  ; align stack
+    sub rsp, PAD
+
+    call put_hv%2_10
+
+    mov r4d, 2            ; sizeof(pixel)
+    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC32
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro H_NRD 1
+put_h%1_10:
+    add       rsp, gprsize
+    mov       r3d, %1
+    xor       r4d, r4d
+    mova       m6, [pad20]
+.nextrow:
+    movu       m2, [r5-4]
+    movu       m3, [r5-2]
+    movu       m4, [r5+0]
+    ADDW       m2, [r5+6], m5
+    ADDW       m3, [r5+4], m5
+    ADDW       m4, [r5+2], m5
+
+    FILT_H2    m2, m3, m4
+    psubw      m2, m6
+    mova [rsp+r4], m2
+    add       r4d, mmsize*3
+    add        r5, r2
+    dec       r3d
+    jg .nextrow
+    sub       rsp, gprsize
+    ret
+%endmacro
+
+INIT_MMX mmxext
+H_NRD 4
+INIT_XMM sse2
+H_NRD 8
+
+%macro MC21 2
+cglobal_mc %1, mc21, %2, 3,7,12
+    mov   r5, r1
+.body:
+%define PAD mmsize*8*3*2   ; SIZE*16*4*sizeof(pixel)
+    mov   r6, rsp          ; backup stack pointer
+    and  rsp, ~(mmsize-1)  ; align stack
+
+    sub  rsp, PAD
+    call put_h%2_10
+
+    sub  rsp, PAD
+    call put_hv%2_10
+
+    mov r4d, PAD-mmsize    ; H buffer
+    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC21
+
+;-----------------------------------------------------------------------------
+; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC23 2
+cglobal_mc %1, mc23, %2, 3,7,12
+    lea   r5, [r1+r2]
+    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC23
--- a/externals/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
@@ -0,0 +1,862 @@
+;*****************************************************************************
+;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
+;*****************************************************************************
+;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+;* Copyright (C) 2012 Daniel Kang
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pw_16
+cextern pw_5
+cextern pb_0
+
+SECTION .text
+
+
+%macro op_avgh 3
+    movh   %3, %2
+    pavgb  %1, %3
+    movh   %2, %1
+%endmacro
+
+%macro op_avg 2-3
+    pavgb  %1, %2
+    mova   %2, %1
+%endmacro
+
+%macro op_puth 2-3
+    movh   %2, %1
+%endmacro
+
+%macro op_put 2-3
+    mova   %2, %1
+%endmacro
+
+%macro QPEL4_H_LOWPASS_OP 1
+cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
+    movsxdifnidn  r2, r2d
+    movsxdifnidn  r3, r3d
+    pxor          m7, m7
+    mova          m4, [pw_5]
+    mova          m5, [pw_16]
+    mov          r4d, 4
+.loop:
+    movh          m1, [r1-1]
+    movh          m2, [r1+0]
+    movh          m3, [r1+1]
+    movh          m0, [r1+2]
+    punpcklbw     m1, m7
+    punpcklbw     m2, m7
+    punpcklbw     m3, m7
+    punpcklbw     m0, m7
+    paddw         m1, m0
+    paddw         m2, m3
+    movh          m0, [r1-2]
+    movh          m3, [r1+3]
+    punpcklbw     m0, m7
+    punpcklbw     m3, m7
+    paddw         m0, m3
+    psllw         m2, 2
+    psubw         m2, m1
+    pmullw        m2, m4
+    paddw         m0, m5
+    paddw         m0, m2
+    psraw         m0, 5
+    packuswb      m0, m0
+    op_%1h        m0, [r0], m6
+    add           r0, r2
+    add           r1, r3
+    dec          r4d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_H_LOWPASS_OP put
+QPEL4_H_LOWPASS_OP avg
+
+%macro QPEL8_H_LOWPASS_OP 1
+cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
+    movsxdifnidn  r2, r2d
+    movsxdifnidn  r3, r3d
+    mov          r4d, 8
+    pxor          m7, m7
+    mova          m6, [pw_5]
+.loop:
+    mova          m0, [r1]
+    mova          m2, [r1+1]
+    mova          m1, m0
+    mova          m3, m2
+    punpcklbw     m0, m7
+    punpckhbw     m1, m7
+    punpcklbw     m2, m7
+    punpckhbw     m3, m7
+    paddw         m0, m2
+    paddw         m1, m3
+    psllw         m0, 2
+    psllw         m1, 2
+    mova          m2, [r1-1]
+    mova          m4, [r1+2]
+    mova          m3, m2
+    mova          m5, m4
+    punpcklbw     m2, m7
+    punpckhbw     m3, m7
+    punpcklbw     m4, m7
+    punpckhbw     m5, m7
+    paddw         m2, m4
+    paddw         m5, m3
+    psubw         m0, m2
+    psubw         m1, m5
+    pmullw        m0, m6
+    pmullw        m1, m6
+    movd          m2, [r1-2]
+    movd          m5, [r1+7]
+    punpcklbw     m2, m7
+    punpcklbw     m5, m7
+    paddw         m2, m3
+    paddw         m4, m5
+    mova          m5, [pw_16]
+    paddw         m2, m5
+    paddw         m4, m5
+    paddw         m0, m2
+    paddw         m1, m4
+    psraw         m0, 5
+    psraw         m1, 5
+    packuswb      m0, m1
+    op_%1         m0, [r0], m4
+    add           r0, r2
+    add           r1, r3
+    dec          r4d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8_H_LOWPASS_OP put
+QPEL8_H_LOWPASS_OP avg
+
+%macro QPEL8_H_LOWPASS_OP_XMM 1
+cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
+    movsxdifnidn  r2, r2d
+    movsxdifnidn  r3, r3d
+    mov          r4d, 8
+    pxor          m7, m7
+    mova          m6, [pw_5]
+.loop:
+    movu          m1, [r1-2]
+    mova          m0, m1
+    punpckhbw     m1, m7
+    punpcklbw     m0, m7
+    mova          m2, m1
+    mova          m3, m1
+    mova          m4, m1
+    mova          m5, m1
+    palignr       m4, m0, 2
+    palignr       m3, m0, 4
+    palignr       m2, m0, 6
+    palignr       m1, m0, 8
+    palignr       m5, m0, 10
+    paddw         m0, m5
+    paddw         m2, m3
+    paddw         m1, m4
+    psllw         m2, 2
+    psubw         m2, m1
+    paddw         m0, [pw_16]
+    pmullw        m2, m6
+    paddw         m2, m0
+    psraw         m2, 5
+    packuswb      m2, m2
+    op_%1h        m2, [r0], m4
+    add           r1, r3
+    add           r0, r2
+    dec          r4d
+    jne        .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8_H_LOWPASS_OP_XMM put
+QPEL8_H_LOWPASS_OP_XMM avg
+
+
+%macro QPEL4_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
+    movsxdifnidn  r3, r3d
+    movsxdifnidn  r4, r4d
+    pxor          m7, m7
+    mova          m4, [pw_5]
+    mova          m5, [pw_16]
+    mov          r5d, 4
+.loop:
+    movh          m1, [r1-1]
+    movh          m2, [r1+0]
+    movh          m3, [r1+1]
+    movh          m0, [r1+2]
+    punpcklbw     m1, m7
+    punpcklbw     m2, m7
+    punpcklbw     m3, m7
+    punpcklbw     m0, m7
+    paddw         m1, m0
+    paddw         m2, m3
+    movh          m0, [r1-2]
+    movh          m3, [r1+3]
+    punpcklbw     m0, m7
+    punpcklbw     m3, m7
+    paddw         m0, m3
+    psllw         m2, 2
+    psubw         m2, m1
+    pmullw        m2, m4
+    paddw         m0, m5
+    paddw         m0, m2
+    movh          m3, [r2]
+    psraw         m0, 5
+    packuswb      m0, m0
+    pavgb         m0, m3
+    op_%1h        m0, [r0], m6
+    add           r0, r3
+    add           r1, r3
+    add           r2, r4
+    dec          r5d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_H_LOWPASS_L2_OP put
+QPEL4_H_LOWPASS_L2_OP avg
+
+
+%macro QPEL8_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
+    movsxdifnidn  r3, r3d
+    movsxdifnidn  r4, r4d
+    mov          r5d, 8
+    pxor          m7, m7
+    mova          m6, [pw_5]
+.loop:
+    mova          m0, [r1]
+    mova          m2, [r1+1]
+    mova          m1, m0
+    mova          m3, m2
+    punpcklbw     m0, m7
+    punpckhbw     m1, m7
+    punpcklbw     m2, m7
+    punpckhbw     m3, m7
+    paddw         m0, m2
+    paddw         m1, m3
+    psllw         m0, 2
+    psllw         m1, 2
+    mova          m2, [r1-1]
+    mova          m4, [r1+2]
+    mova          m3, m2
+    mova          m5, m4
+    punpcklbw     m2, m7
+    punpckhbw     m3, m7
+    punpcklbw     m4, m7
+    punpckhbw     m5, m7
+    paddw         m2, m4
+    paddw         m5, m3
+    psubw         m0, m2
+    psubw         m1, m5
+    pmullw        m0, m6
+    pmullw        m1, m6
+    movd          m2, [r1-2]
+    movd          m5, [r1+7]
+    punpcklbw     m2, m7
+    punpcklbw     m5, m7
+    paddw         m2, m3
+    paddw         m4, m5
+    mova          m5, [pw_16]
+    paddw         m2, m5
+    paddw         m4, m5
+    paddw         m0, m2
+    paddw         m1, m4
+    psraw         m0, 5
+    psraw         m1, 5
+    mova          m4, [r2]
+    packuswb      m0, m1
+    pavgb         m0, m4
+    op_%1         m0, [r0], m4
+    add           r0, r3
+    add           r1, r3
+    add           r2, r4
+    dec          r5d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8_H_LOWPASS_L2_OP put
+QPEL8_H_LOWPASS_L2_OP avg
+
+
+%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
+cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
+    movsxdifnidn  r3, r3d
+    movsxdifnidn  r4, r4d
+    mov          r5d, 8
+    pxor          m7, m7
+    mova          m6, [pw_5]
+.loop:
+    lddqu         m1, [r1-2]
+    mova          m0, m1
+    punpckhbw     m1, m7
+    punpcklbw     m0, m7
+    mova          m2, m1
+    mova          m3, m1
+    mova          m4, m1
+    mova          m5, m1
+    palignr       m4, m0, 2
+    palignr       m3, m0, 4
+    palignr       m2, m0, 6
+    palignr       m1, m0, 8
+    palignr       m5, m0, 10
+    paddw         m0, m5
+    paddw         m2, m3
+    paddw         m1, m4
+    psllw         m2, 2
+    movh          m3, [r2]
+    psubw         m2, m1
+    paddw         m0, [pw_16]
+    pmullw        m2, m6
+    paddw         m2, m0
+    psraw         m2, 5
+    packuswb      m2, m2
+    pavgb         m2, m3
+    op_%1h        m2, [r0], m4
+    add           r1, r3
+    add           r0, r3
+    add           r2, r4
+    dec          r5d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8_H_LOWPASS_L2_OP_XMM put
+QPEL8_H_LOWPASS_L2_OP_XMM avg
+
+
+; All functions that call this are required to have function arguments of
+; dst, src, dstStride, srcStride
+%macro FILT_V 1
+    mova      m6, m2
+    movh      m5, [r1]
+    paddw     m6, m3
+    psllw     m6, 2
+    psubw     m6, m1
+    psubw     m6, m4
+    punpcklbw m5, m7
+    pmullw    m6, [pw_5]
+    paddw     m0, [pw_16]
+    add       r1, r3
+    paddw     m0, m5
+    paddw     m6, m0
+    psraw     m6, 5
+    packuswb  m6, m6
+    op_%1h    m6, [r0], m0 ; 1
+    add       r0, r2
+    SWAP       0, 1, 2, 3, 4, 5
+%endmacro
+
+%macro QPEL4_V_LOWPASS_OP 1
+cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
+    movsxdifnidn  r2, r2d
+    movsxdifnidn  r3, r3d
+    sub           r1, r3
+    sub           r1, r3
+    pxor          m7, m7
+    movh          m0, [r1]
+    movh          m1, [r1+r3]
+    lea           r1, [r1+2*r3]
+    movh          m2, [r1]
+    movh          m3, [r1+r3]
+    lea           r1, [r1+2*r3]
+    movh          m4, [r1]
+    add           r1, r3
+    punpcklbw     m0, m7
+    punpcklbw     m1, m7
+    punpcklbw     m2, m7
+    punpcklbw     m3, m7
+    punpcklbw     m4, m7
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_V_LOWPASS_OP put
+QPEL4_V_LOWPASS_OP avg
+
+
+
+%macro QPEL8OR16_V_LOWPASS_OP 1
+%if cpuflag(sse2)
+cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
+    movsxdifnidn  r2, r2d
+    movsxdifnidn  r3, r3d
+    sub           r1, r3
+    sub           r1, r3
+%else
+cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
+    movsxdifnidn  r2, r2d
+    movsxdifnidn  r3, r3d
+%endif
+    pxor          m7, m7
+    movh          m0, [r1]
+    movh          m1, [r1+r3]
+    lea           r1, [r1+2*r3]
+    movh          m2, [r1]
+    movh          m3, [r1+r3]
+    lea           r1, [r1+2*r3]
+    movh          m4, [r1]
+    add           r1, r3
+    punpcklbw     m0, m7
+    punpcklbw     m1, m7
+    punpcklbw     m2, m7
+    punpcklbw     m3, m7
+    punpcklbw     m4, m7
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    cmp          r4d, 16
+    jne         .end
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+    FILT_V        %1
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_V_LOWPASS_OP put
+QPEL8OR16_V_LOWPASS_OP avg
+
+INIT_XMM sse2
+QPEL8OR16_V_LOWPASS_OP put
+QPEL8OR16_V_LOWPASS_OP avg
+
+
+; All functions that use this are required to have args:
+; src, tmp, srcSize
+%macro FILT_HV 1 ; offset
+    mova           m6, m2
+    movh           m5, [r0]
+    paddw          m6, m3
+    psllw          m6, 2
+    paddw          m0, [pw_16]
+    psubw          m6, m1
+    psubw          m6, m4
+    punpcklbw      m5, m7
+    pmullw         m6, [pw_5]
+    paddw          m0, m5
+    add            r0, r2
+    paddw          m6, m0
+    mova      [r1+%1], m6
+    SWAP            0, 1, 2, 3, 4, 5
+%endmacro
+
+%macro QPEL4_HV1_LOWPASS_OP 1
+cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
+    movsxdifnidn  r2, r2d
+    pxor          m7, m7
+    movh          m0, [r0]
+    movh          m1, [r0+r2]
+    lea           r0, [r0+2*r2]
+    movh          m2, [r0]
+    movh          m3, [r0+r2]
+    lea           r0, [r0+2*r2]
+    movh          m4, [r0]
+    add           r0, r2
+    punpcklbw     m0, m7
+    punpcklbw     m1, m7
+    punpcklbw     m2, m7
+    punpcklbw     m3, m7
+    punpcklbw     m4, m7
+    FILT_HV       0*24
+    FILT_HV       1*24
+    FILT_HV       2*24
+    FILT_HV       3*24
+    RET
+
+cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
+    movsxdifnidn  r2, r2d
+    mov          r3d, 4
+.loop:
+    mova          m0, [r0]
+    paddw         m0, [r0+10]
+    mova          m1, [r0+2]
+    paddw         m1, [r0+8]
+    mova          m2, [r0+4]
+    paddw         m2, [r0+6]
+    psubw         m0, m1
+    psraw         m0, 2
+    psubw         m0, m1
+    paddsw        m0, m2
+    psraw         m0, 2
+    paddw         m0, m2
+    psraw         m0, 6
+    packuswb      m0, m0
+    op_%1h        m0, [r1], m7
+    add           r0, 24
+    add           r1, r2
+    dec          r3d
+    jnz        .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_HV1_LOWPASS_OP put
+QPEL4_HV1_LOWPASS_OP avg
+
+%macro QPEL8OR16_HV1_LOWPASS_OP 1
+cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
+    movsxdifnidn  r2, r2d
+    pxor          m7, m7
+    movh          m0, [r0]
+    movh          m1, [r0+r2]
+    lea           r0, [r0+2*r2]
+    movh          m2, [r0]
+    movh          m3, [r0+r2]
+    lea           r0, [r0+2*r2]
+    movh          m4, [r0]
+    add           r0, r2
+    punpcklbw     m0, m7
+    punpcklbw     m1, m7
+    punpcklbw     m2, m7
+    punpcklbw     m3, m7
+    punpcklbw     m4, m7
+    FILT_HV     0*48
+    FILT_HV     1*48
+    FILT_HV     2*48
+    FILT_HV     3*48
+    FILT_HV     4*48
+    FILT_HV     5*48
+    FILT_HV     6*48
+    FILT_HV     7*48
+    cmp          r3d, 16
+    jne         .end
+    FILT_HV     8*48
+    FILT_HV     9*48
+    FILT_HV    10*48
+    FILT_HV    11*48
+    FILT_HV    12*48
+    FILT_HV    13*48
+    FILT_HV    14*48
+    FILT_HV    15*48
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_HV1_LOWPASS_OP put
+QPEL8OR16_HV1_LOWPASS_OP avg
+
+INIT_XMM sse2
+QPEL8OR16_HV1_LOWPASS_OP put
+
+
+
+%macro QPEL8OR16_HV2_LOWPASS_OP 1
+; unused is to match ssse3 and mmxext args
+cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
+    movsxdifnidn  r2, r2d
+.loop:
+    mova          m0, [r1]
+    mova          m3, [r1+8]
+    mova          m1, [r1+2]
+    mova          m4, [r1+10]
+    paddw         m0, m4
+    paddw         m1, m3
+    paddw         m3, [r1+18]
+    paddw         m4, [r1+16]
+    mova          m2, [r1+4]
+    mova          m5, [r1+12]
+    paddw         m2, [r1+6]
+    paddw         m5, [r1+14]
+    psubw         m0, m1
+    psubw         m3, m4
+    psraw         m0, 2
+    psraw         m3, 2
+    psubw         m0, m1
+    psubw         m3, m4
+    paddsw        m0, m2
+    paddsw        m3, m5
+    psraw         m0, 2
+    psraw         m3, 2
+    paddw         m0, m2
+    paddw         m3, m5
+    psraw         m0, 6
+    psraw         m3, 6
+    packuswb      m0, m3
+    op_%1         m0, [r0], m7
+    add           r1, 48
+    add           r0, r2
+    dec          r4d
+    jne        .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_HV2_LOWPASS_OP put
+QPEL8OR16_HV2_LOWPASS_OP avg
+
+%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
+cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
+    movsxdifnidn  r2, r2d
+    movsxdifnidn  r3, r3d
+    cmp          r4d, 16
+    je         .op16
+.loop8:
+    mova          m1, [r1+16]
+    mova          m0, [r1]
+    mova          m2, m1
+    mova          m3, m1
+    mova          m4, m1
+    mova          m5, m1
+    palignr       m5, m0, 10
+    palignr       m4, m0, 8
+    palignr       m3, m0, 6
+    palignr       m2, m0, 4
+    palignr       m1, m0, 2
+    paddw         m0, m5
+    paddw         m1, m4
+    paddw         m2, m3
+    psubw         m0, m1
+    psraw         m0, 2
+    psubw         m0, m1
+    paddw         m0, m2
+    psraw         m0, 2
+    paddw         m0, m2
+    psraw         m0, 6
+    packuswb      m0, m0
+    op_%1h        m0, [r0], m7
+    add           r1, 48
+    add           r0, r2
+    dec          r4d
+    jne       .loop8
+    jmp        .done
+.op16:
+    mova          m4, [r1+32]
+    mova          m5, [r1+16]
+    mova          m7, [r1]
+    mova          m3, m4
+    mova          m2, m4
+    mova          m1, m4
+    mova          m0, m4
+    palignr       m0, m5, 10
+    palignr       m1, m5, 8
+    palignr       m2, m5, 6
+    palignr       m3, m5, 4
+    palignr       m4, m5, 2
+    paddw         m0, m5
+    paddw         m1, m4
+    paddw         m2, m3
+    mova          m6, m5
+    mova          m4, m5
+    mova          m3, m5
+    palignr       m4, m7, 8
+    palignr       m6, m7, 2
+    palignr       m3, m7, 10
+    paddw         m4, m6
+    mova          m6, m5
+    palignr       m5, m7, 6
+    palignr       m6, m7, 4
+    paddw         m3, m7
+    paddw         m5, m6
+    psubw         m0, m1
+    psubw         m3, m4
+    psraw         m0, 2
+    psraw         m3, 2
+    psubw         m0, m1
+    psubw         m3, m4
+    paddw         m0, m2
+    paddw         m3, m5
+    psraw         m0, 2
+    psraw         m3, 2
+    paddw         m0, m2
+    paddw         m3, m5
+    psraw         m0, 6
+    psraw         m3, 6
+    packuswb      m3, m0
+    op_%1         m3, [r0], m7
+    add           r1, 48
+    add           r0, r2
+    dec          r4d
+    jne        .op16
+.done:
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8OR16_HV2_LOWPASS_OP_XMM put
+QPEL8OR16_HV2_LOWPASS_OP_XMM avg
+
+
+%macro PIXELS4_L2_SHIFT5 1
+cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
+    movsxdifnidn  r3, r3d
+    movsxdifnidn  r4, r4d
+    mova          m0, [r1]
+    mova          m1, [r1+24]
+    psraw         m0, 5
+    psraw         m1, 5
+    packuswb      m0, m0
+    packuswb      m1, m1
+    pavgb         m0, [r2]
+    pavgb         m1, [r2+r4]
+    op_%1h        m0, [r0], m4
+    op_%1h        m1, [r0+r3], m5
+    lea           r2, [r2+r4*2]
+    lea           r0, [r0+r3*2]
+    mova          m0, [r1+48]
+    mova          m1, [r1+72]
+    psraw         m0, 5
+    psraw         m1, 5
+    packuswb      m0, m0
+    packuswb      m1, m1
+    pavgb         m0, [r2]
+    pavgb         m1, [r2+r4]
+    op_%1h        m0, [r0], m4
+    op_%1h        m1, [r0+r3], m5
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS4_L2_SHIFT5 put
+PIXELS4_L2_SHIFT5 avg
+
+
+%macro PIXELS8_L2_SHIFT5 1
+cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
+    movsxdifnidn  r3, r3d
+    movsxdifnidn  r4, r4d
+.loop:
+    mova          m0, [r1]
+    mova          m1, [r1+8]
+    mova          m2, [r1+48]
+    mova          m3, [r1+48+8]
+    psraw         m0, 5
+    psraw         m1, 5
+    psraw         m2, 5
+    psraw         m3, 5
+    packuswb      m0, m1
+    packuswb      m2, m3
+    pavgb         m0, [r2]
+    pavgb         m2, [r2+r4]
+    op_%1         m0, [r0], m4
+    op_%1         m2, [r0+r3], m5
+    lea           r2, [r2+2*r4]
+    add           r1, 48*2
+    lea           r0, [r0+2*r3]
+    sub          r5d, 2
+    jne        .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS8_L2_SHIFT5 put
+PIXELS8_L2_SHIFT5 avg
+
+
+%if ARCH_X86_64
+%macro QPEL16_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
+    movsxdifnidn  r3, r3d
+    movsxdifnidn  r4, r4d
+    mov          r5d, 16
+    pxor         m15, m15
+    mova         m14, [pw_5]
+    mova         m13, [pw_16]
+.loop:
+    lddqu         m1, [r1+6]
+    lddqu         m7, [r1-2]
+    mova          m0, m1
+    punpckhbw     m1, m15
+    punpcklbw     m0, m15
+    punpcklbw     m7, m15
+    mova          m2, m1
+    mova          m6, m0
+    mova          m3, m1
+    mova          m8, m0
+    mova          m4, m1
+    mova          m9, m0
+    mova         m12, m0
+    mova         m11, m1
+    palignr      m11, m0, 10
+    palignr      m12, m7, 10
+    palignr       m4, m0, 2
+    palignr       m9, m7, 2
+    palignr       m3, m0, 4
+    palignr       m8, m7, 4
+    palignr       m2, m0, 6
+    palignr       m6, m7, 6
+    paddw        m11, m0
+    palignr       m1, m0, 8
+    palignr       m0, m7, 8
+    paddw         m7, m12
+    paddw         m2, m3
+    paddw         m6, m8
+    paddw         m1, m4
+    paddw         m0, m9
+    psllw         m2, 2
+    psllw         m6, 2
+    psubw         m2, m1
+    psubw         m6, m0
+    paddw        m11, m13
+    paddw         m7, m13
+    pmullw        m2, m14
+    pmullw        m6, m14
+    lddqu         m3, [r2]
+    paddw         m2, m11
+    paddw         m6, m7
+    psraw         m2, 5
+    psraw         m6, 5
+    packuswb      m6, m2
+    pavgb         m6, m3
+    op_%1         m6, [r0], m11
+    add           r1, r3
+    add           r0, r3
+    add           r2, r4
+    dec          r5d
+    jg         .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL16_H_LOWPASS_L2_OP put
+QPEL16_H_LOWPASS_L2_OP avg
+%endif
--- a/externals/ffmpeg/libavcodec/x86/h264_weight.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_weight.asm
@@ -0,0 +1,320 @@
+;*****************************************************************************
+;* SSE2-optimized weighted prediction code
+;*****************************************************************************
+;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; biweight pred:
+;
+; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
+;                               int height, int log2_denom, int weightd,
+;                               int weights, int offset);
+; and
+; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
+;                             int log2_denom, int weight, int offset);
+;-----------------------------------------------------------------------------
+
+%macro WEIGHT_SETUP 0
+    add        r5, r5
+    inc        r5
+    movd       m3, r4d
+    movd       m5, r5d
+    movd       m6, r3d
+    pslld      m5, m6
+    psrld      m5, 1
+%if mmsize == 16
+    pshuflw    m3, m3, 0
+    pshuflw    m5, m5, 0
+    punpcklqdq m3, m3
+    punpcklqdq m5, m5
+%else
+    pshufw     m3, m3, 0
+    pshufw     m5, m5, 0
+%endif
+    pxor       m7, m7
+%endmacro
+
+%macro WEIGHT_OP 2
+    movh          m0, [r0+%1]
+    movh          m1, [r0+%2]
+    punpcklbw     m0, m7
+    punpcklbw     m1, m7
+    pmullw        m0, m3
+    pmullw        m1, m3
+    paddsw        m0, m5
+    paddsw        m1, m5
+    psraw         m0, m6
+    psraw         m1, m6
+    packuswb      m0, m1
+%endmacro
+
+INIT_MMX mmxext
+cglobal h264_weight_16, 6, 6, 0
+    WEIGHT_SETUP
+.nextrow:
+    WEIGHT_OP 0,  4
+    mova     [r0  ], m0
+    WEIGHT_OP 8, 12
+    mova     [r0+8], m0
+    add        r0, r1
+    dec        r2d
+    jnz .nextrow
+    REP_RET
+
+%macro WEIGHT_FUNC_MM 2
+cglobal h264_weight_%1, 6, 6, %2
+    WEIGHT_SETUP
+.nextrow:
+    WEIGHT_OP 0, mmsize/2
+    mova     [r0], m0
+    add        r0, r1
+    dec        r2d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+WEIGHT_FUNC_MM  8, 0
+INIT_XMM sse2
+WEIGHT_FUNC_MM 16, 8
+
+%macro WEIGHT_FUNC_HALF_MM 2
+cglobal h264_weight_%1, 6, 6, %2
+    WEIGHT_SETUP
+    sar       r2d, 1
+    lea        r3, [r1*2]
+.nextrow:
+    WEIGHT_OP 0, r1
+    movh     [r0], m0
+%if mmsize == 16
+    movhps   [r0+r1], m0
+%else
+    psrlq      m0, 32
+    movh     [r0+r1], m0
+%endif
+    add        r0, r3
+    dec        r2d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+WEIGHT_FUNC_HALF_MM 4, 0
+INIT_XMM sse2
+WEIGHT_FUNC_HALF_MM 8, 8
+
+%macro BIWEIGHT_SETUP 0
+%if ARCH_X86_64
+%define off_regd r7d
+%else
+%define off_regd r3d
+%endif
+    mov  off_regd, r7m
+    add  off_regd, 1
+    or   off_regd, 1
+    add       r4d, 1
+    cmp       r6d, 128
+    je .nonnormal
+    cmp       r5d, 128
+    jne .normal
+.nonnormal:
+    sar       r5d, 1
+    sar       r6d, 1
+    sar  off_regd, 1
+    sub       r4d, 1
+.normal:
+%if cpuflag(ssse3)
+    movd       m4, r5d
+    movd       m0, r6d
+%else
+    movd       m3, r5d
+    movd       m4, r6d
+%endif
+    movd       m5, off_regd
+    movd       m6, r4d
+    pslld      m5, m6
+    psrld      m5, 1
+%if cpuflag(ssse3)
+    punpcklbw  m4, m0
+    pshuflw    m4, m4, 0
+    pshuflw    m5, m5, 0
+    punpcklqdq m4, m4
+    punpcklqdq m5, m5
+
+%else
+%if mmsize == 16
+    pshuflw    m3, m3, 0
+    pshuflw    m4, m4, 0
+    pshuflw    m5, m5, 0
+    punpcklqdq m3, m3
+    punpcklqdq m4, m4
+    punpcklqdq m5, m5
+%else
+    pshufw     m3, m3, 0
+    pshufw     m4, m4, 0
+    pshufw     m5, m5, 0
+%endif
+    pxor       m7, m7
+%endif
+%endmacro
+
+%macro BIWEIGHT_STEPA 3
+    movh       m%1, [r0+%3]
+    movh       m%2, [r1+%3]
+    punpcklbw  m%1, m7
+    punpcklbw  m%2, m7
+    pmullw     m%1, m3
+    pmullw     m%2, m4
+    paddsw     m%1, m%2
+%endmacro
+
+%macro BIWEIGHT_STEPB 0
+    paddsw     m0, m5
+    paddsw     m1, m5
+    psraw      m0, m6
+    psraw      m1, m6
+    packuswb   m0, m1
+%endmacro
+
+INIT_MMX mmxext
+cglobal h264_biweight_16, 7, 8, 0
+    BIWEIGHT_SETUP
+    movifnidn r3d, r3m
+.nextrow:
+    BIWEIGHT_STEPA 0, 1, 0
+    BIWEIGHT_STEPA 1, 2, 4
+    BIWEIGHT_STEPB
+    mova       [r0], m0
+    BIWEIGHT_STEPA 0, 1, 8
+    BIWEIGHT_STEPA 1, 2, 12
+    BIWEIGHT_STEPB
+    mova     [r0+8], m0
+    add        r0, r2
+    add        r1, r2
+    dec        r3d
+    jnz .nextrow
+    REP_RET
+
+%macro BIWEIGHT_FUNC_MM 2
+cglobal h264_biweight_%1, 7, 8, %2
+    BIWEIGHT_SETUP
+    movifnidn r3d, r3m
+.nextrow:
+    BIWEIGHT_STEPA 0, 1, 0
+    BIWEIGHT_STEPA 1, 2, mmsize/2
+    BIWEIGHT_STEPB
+    mova       [r0], m0
+    add        r0, r2
+    add        r1, r2
+    dec        r3d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+BIWEIGHT_FUNC_MM  8, 0
+INIT_XMM sse2
+BIWEIGHT_FUNC_MM 16, 8
+
+%macro BIWEIGHT_FUNC_HALF_MM 2
+cglobal h264_biweight_%1, 7, 8, %2
+    BIWEIGHT_SETUP
+    movifnidn r3d, r3m
+    sar        r3, 1
+    lea        r4, [r2*2]
+.nextrow:
+    BIWEIGHT_STEPA 0, 1, 0
+    BIWEIGHT_STEPA 1, 2, r2
+    BIWEIGHT_STEPB
+    movh       [r0], m0
+%if mmsize == 16
+    movhps     [r0+r2], m0
+%else
+    psrlq      m0, 32
+    movh       [r0+r2], m0
+%endif
+    add        r0, r4
+    add        r1, r4
+    dec        r3d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+BIWEIGHT_FUNC_HALF_MM 4, 0
+INIT_XMM sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8
+
+%macro BIWEIGHT_SSSE3_OP 0
+    pmaddubsw  m0, m4
+    pmaddubsw  m2, m4
+    paddsw     m0, m5
+    paddsw     m2, m5
+    psraw      m0, m6
+    psraw      m2, m6
+    packuswb   m0, m2
+%endmacro
+
+INIT_XMM ssse3
+cglobal h264_biweight_16, 7, 8, 8
+    BIWEIGHT_SETUP
+    movifnidn r3d, r3m
+
+.nextrow:
+    movh       m0, [r0]
+    movh       m2, [r0+8]
+    movh       m3, [r1+8]
+    punpcklbw  m0, [r1]
+    punpcklbw  m2, m3
+    BIWEIGHT_SSSE3_OP
+    mova       [r0], m0
+    add        r0, r2
+    add        r1, r2
+    dec        r3d
+    jnz .nextrow
+    REP_RET
+
+INIT_XMM ssse3
+cglobal h264_biweight_8, 7, 8, 8
+    BIWEIGHT_SETUP
+    movifnidn r3d, r3m
+    sar        r3, 1
+    lea        r4, [r2*2]
+
+.nextrow:
+    movh       m0, [r0]
+    movh       m1, [r1]
+    movh       m2, [r0+r2]
+    movh       m3, [r1+r2]
+    punpcklbw  m0, m1
+    punpcklbw  m2, m3
+    BIWEIGHT_SSSE3_OP
+    movh       [r0], m0
+    movhps     [r0+r2], m0
+    add        r0, r4
+    add        r1, r4
+    dec        r3d
+    jnz .nextrow
+    REP_RET
--- a/externals/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
@@ -0,0 +1,284 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+sq_1: dq 1
+      dq 0
+
+cextern pw_1
+cextern pw_1023
+%define pw_pixel_max pw_1023
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
+;                           int log2_denom, int weight, int offset);
+;-----------------------------------------------------------------------------
+%macro WEIGHT_PROLOGUE 0
+.prologue:
+    PROLOGUE 0,6,8
+    movifnidn  r0, r0mp
+    movifnidn r1d, r1m
+    movifnidn r2d, r2m
+    movifnidn r4d, r4m
+    movifnidn r5d, r5m
+%endmacro
+
+%macro WEIGHT_SETUP 0
+    mova       m0, [pw_1]
+    movd       m2, r3m
+    pslld      m0, m2       ; 1<<log2_denom
+    SPLATW     m0, m0
+    shl        r5, 19       ; *8, move to upper half of dword
+    lea        r5, [r5+r4*2+0x10000]
+    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
+    pshufd     m3, m3, 0
+    mova       m4, [pw_pixel_max]
+    paddw      m2, [sq_1]   ; log2_denom+1
+%if notcpuflag(sse4)
+    pxor       m7, m7
+%endif
+%endmacro
+
+%macro WEIGHT_OP 1-2
+%if %0==1
+    mova        m5, [r0+%1]
+    punpckhwd   m6, m5, m0
+    punpcklwd   m5, m0
+%else
+    movq        m5, [r0+%1]
+    movq        m6, [r0+%2]
+    punpcklwd   m5, m0
+    punpcklwd   m6, m0
+%endif
+    pmaddwd     m5, m3
+    pmaddwd     m6, m3
+    psrad       m5, m2
+    psrad       m6, m2
+%if cpuflag(sse4)
+    packusdw    m5, m6
+    pminsw      m5, m4
+%else
+    packssdw    m5, m6
+    CLIPW       m5, m7, m4
+%endif
+%endmacro
+
+%macro WEIGHT_FUNC_DBL 0
+cglobal h264_weight_16_10
+    WEIGHT_PROLOGUE
+    WEIGHT_SETUP
+.nextrow:
+    WEIGHT_OP  0
+    mova [r0   ], m5
+    WEIGHT_OP 16
+    mova [r0+16], m5
+    add       r0, r1
+    dec       r2d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+WEIGHT_FUNC_DBL
+INIT_XMM sse4
+WEIGHT_FUNC_DBL
+
+
+%macro WEIGHT_FUNC_MM 0
+cglobal h264_weight_8_10
+    WEIGHT_PROLOGUE
+    WEIGHT_SETUP
+.nextrow:
+    WEIGHT_OP   0
+    mova     [r0], m5
+    add        r0, r1
+    dec        r2d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+WEIGHT_FUNC_MM
+INIT_XMM sse4
+WEIGHT_FUNC_MM
+
+
+%macro WEIGHT_FUNC_HALF_MM 0
+cglobal h264_weight_4_10
+    WEIGHT_PROLOGUE
+    sar         r2d, 1
+    WEIGHT_SETUP
+    lea         r3, [r1*2]
+.nextrow:
+    WEIGHT_OP    0, r1
+    movh      [r0], m5
+    movhps [r0+r1], m5
+    add         r0, r3
+    dec         r2d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+WEIGHT_FUNC_HALF_MM
+INIT_XMM sse4
+WEIGHT_FUNC_HALF_MM
+
+
+;-----------------------------------------------------------------------------
+; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
+;                             int height, int log2_denom, int weightd,
+;                             int weights, int offset);
+;-----------------------------------------------------------------------------
+%if ARCH_X86_32
+DECLARE_REG_TMP 3
+%else
+DECLARE_REG_TMP 7
+%endif
+
+%macro BIWEIGHT_PROLOGUE 0
+.prologue:
+    PROLOGUE 0,8,8
+    movifnidn  r0, r0mp
+    movifnidn  r1, r1mp
+    movifnidn r2d, r2m
+    movifnidn r5d, r5m
+    movifnidn r6d, r6m
+    movifnidn t0d, r7m
+%endmacro
+
+%macro BIWEIGHT_SETUP 0
+    lea        t0, [t0*4+1] ; (offset<<2)+1
+    or         t0, 1
+    shl        r6, 16
+    or         r5, r6
+    movd       m4, r5d      ; weightd | weights
+    movd       m5, t0d      ; (offset+1)|1
+    movd       m6, r4m      ; log2_denom
+    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
+    paddd      m6, [sq_1]
+    pshufd     m4, m4, 0
+    pshufd     m5, m5, 0
+    mova       m3, [pw_pixel_max]
+    movifnidn r3d, r3m
+%if notcpuflag(sse4)
+    pxor       m7, m7
+%endif
+%endmacro
+
+%macro BIWEIGHT 1-2
+%if %0==1
+    mova       m0, [r0+%1]
+    mova       m1, [r1+%1]
+    punpckhwd  m2, m0, m1
+    punpcklwd  m0, m1
+%else
+    movq       m0, [r0+%1]
+    movq       m1, [r1+%1]
+    punpcklwd  m0, m1
+    movq       m2, [r0+%2]
+    movq       m1, [r1+%2]
+    punpcklwd  m2, m1
+%endif
+    pmaddwd    m0, m4
+    pmaddwd    m2, m4
+    paddd      m0, m5
+    paddd      m2, m5
+    psrad      m0, m6
+    psrad      m2, m6
+%if cpuflag(sse4)
+    packusdw   m0, m2
+    pminsw     m0, m3
+%else
+    packssdw   m0, m2
+    CLIPW      m0, m7, m3
+%endif
+%endmacro
+
+%macro BIWEIGHT_FUNC_DBL 0
+cglobal h264_biweight_16_10
+    BIWEIGHT_PROLOGUE
+    BIWEIGHT_SETUP
+.nextrow:
+    BIWEIGHT   0
+    mova [r0   ], m0
+    BIWEIGHT  16
+    mova [r0+16], m0
+    add       r0, r2
+    add       r1, r2
+    dec       r3d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+BIWEIGHT_FUNC_DBL
+INIT_XMM sse4
+BIWEIGHT_FUNC_DBL
+
+%macro BIWEIGHT_FUNC 0
+cglobal h264_biweight_8_10
+    BIWEIGHT_PROLOGUE
+    BIWEIGHT_SETUP
+.nextrow:
+    BIWEIGHT  0
+    mova   [r0], m0
+    add      r0, r2
+    add      r1, r2
+    dec      r3d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+BIWEIGHT_FUNC
+INIT_XMM sse4
+BIWEIGHT_FUNC
+
+%macro BIWEIGHT_FUNC_HALF 0
+cglobal h264_biweight_4_10
+    BIWEIGHT_PROLOGUE
+    BIWEIGHT_SETUP
+    sar        r3d, 1
+    lea        r4, [r2*2]
+.nextrow:
+    BIWEIGHT     0, r2
+    movh   [r0   ], m0
+    movhps [r0+r2], m0
+    add         r0, r4
+    add         r1, r4
+    dec         r3d
+    jnz .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+BIWEIGHT_FUNC_HALF
+INIT_XMM sse4
+BIWEIGHT_FUNC_HALF
--- a/externals/ffmpeg/libavcodec/x86/h264chroma_init.c
+++ b/externals/ffmpeg/libavcodec/x86/h264chroma_init.c
@@ -0,0 +1,117 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/h264chroma.h"
+
+void ff_put_h264_chroma_mc8_rnd_mmx  (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
+                                       ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_mmxext   (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc2_mmxext   (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc2_mmxext   (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+
+void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
+                                      ptrdiff_t stride, int h, int x, int y);
+
+#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
+void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
+                                      (uint8_t *dst, uint8_t *src,      \
+                                       ptrdiff_t stride, int h, int x, int y);
+
+CHROMA_MC(put, 2, 10, mmxext)
+CHROMA_MC(avg, 2, 10, mmxext)
+CHROMA_MC(put, 4, 10, mmxext)
+CHROMA_MC(avg, 4, 10, mmxext)
+CHROMA_MC(put, 8, 10, sse2)
+CHROMA_MC(avg, 8, 10, sse2)
+CHROMA_MC(put, 8, 10, avx)
+CHROMA_MC(avg, 8, 10, avx)
+
+av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
+{
+    int high_bit_depth = bit_depth > 8;
+    int cpu_flags      = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
+    }
+
+    if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
+    }
+
+    if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
+        // AVX implies !cache64.
+        // TODO: Port cache(32|64) detection from x264.
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/h264dsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/h264dsp_init.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/h264dsp.h"
+
+/***********************************/
+/* IDCT */
+#define IDCT_ADD_FUNC(NUM, DEPTH, OPT)                                  \
+void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
+                                                       int16_t *block,  \
+                                                       int stride);
+
+IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, sse2)
+IDCT_ADD_FUNC(, 8, avx)
+IDCT_ADD_FUNC(, 10, sse2)
+IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, sse2)
+IDCT_ADD_FUNC(_dc, 8, avx)
+IDCT_ADD_FUNC(_dc, 10, mmxext)
+IDCT_ADD_FUNC(8_dc, 8, mmxext)
+IDCT_ADD_FUNC(8_dc, 10, sse2)
+IDCT_ADD_FUNC(8, 8, mmx)
+IDCT_ADD_FUNC(8, 8, sse2)
+IDCT_ADD_FUNC(8, 10, sse2)
+IDCT_ADD_FUNC(, 10, avx)
+IDCT_ADD_FUNC(8_dc, 10, avx)
+IDCT_ADD_FUNC(8, 10, avx)
+
+
+#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)                         \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT       \
+    (uint8_t *dst, const int *block_offset,                             \
+     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
+
+IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
+IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
+IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
+IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
+IDCT_ADD_REP_FUNC(8, 4, 10, avx)
+IDCT_ADD_REP_FUNC(, 16, 8, mmx)
+IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
+IDCT_ADD_REP_FUNC(, 16, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16, 10, sse2)
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
+IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
+IDCT_ADD_REP_FUNC(, 16, 10, avx)
+IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
+
+
+#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)                      \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT     \
+    (uint8_t **dst, const int *block_offset,                          \
+     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
+
+IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
+IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
+IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
+IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
+IDCT_ADD_REP_FUNC2(, 8, 10, avx)
+
+IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
+
+IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
+IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
+
+void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
+void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
+
+/***********************************/
+/* deblocking */
+
+void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
+                                         int8_t ref[2][40],
+                                         int16_t mv[2][40][2],
+                                         int bidir, int edges, int step,
+                                         int mask_mv0, int mask_mv1, int field);
+
+#define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
+                                                               ptrdiff_t stride, \
+                                                               int alpha,     \
+                                                               int beta,      \
+                                                               int8_t *tc0);
+#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
+                                                               ptrdiff_t stride, \
+                                                               int alpha,     \
+                                                               int beta);
+
+#define LF_FUNCS(type, depth)                   \
+LF_FUNC(h,  chroma,          depth, mmxext)     \
+LF_IFUNC(h, chroma_intra,    depth, mmxext)     \
+LF_FUNC(h,  chroma422,       depth, mmxext)     \
+LF_IFUNC(h, chroma422_intra, depth, mmxext)     \
+LF_FUNC(v,  chroma,          depth, mmxext)     \
+LF_IFUNC(v, chroma_intra,    depth, mmxext)     \
+LF_FUNC(h,  luma,            depth, mmxext)     \
+LF_IFUNC(h, luma_intra,      depth, mmxext)     \
+LF_FUNC(h,  luma,            depth, sse2)       \
+LF_IFUNC(h, luma_intra,      depth, sse2)       \
+LF_FUNC(v,  luma,            depth, sse2)       \
+LF_IFUNC(v, luma_intra,      depth, sse2)       \
+LF_FUNC(h,  chroma,          depth, sse2)       \
+LF_IFUNC(h, chroma_intra,    depth, sse2)       \
+LF_FUNC(h,  chroma422,       depth, sse2)       \
+LF_IFUNC(h, chroma422_intra, depth, sse2)       \
+LF_FUNC(v,  chroma,          depth, sse2)       \
+LF_IFUNC(v, chroma_intra,    depth, sse2)       \
+LF_FUNC(h,  luma,            depth, avx)        \
+LF_IFUNC(h, luma_intra,      depth, avx)        \
+LF_FUNC(v,  luma,            depth, avx)        \
+LF_IFUNC(v, luma_intra,      depth, avx)        \
+LF_FUNC(h,  chroma,          depth, avx)        \
+LF_IFUNC(h, chroma_intra,    depth, avx)        \
+LF_FUNC(h,  chroma422,       depth, avx)        \
+LF_IFUNC(h, chroma422_intra, depth, avx)        \
+LF_FUNC(v,  chroma,          depth, avx)        \
+LF_IFUNC(v, chroma_intra,    depth, avx)
+
+LF_FUNC(h, luma_mbaff, 8, sse2)
+LF_FUNC(h, luma_mbaff, 8, avx)
+
+LF_FUNCS(uint8_t,   8)
+LF_FUNCS(uint16_t, 10)
+
+#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+LF_FUNC(v8, luma, 8, mmxext)
+static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
+                                    int beta, int8_t *tc0)
+{
+    if ((tc0[0] & tc0[1]) >= 0)
+        ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
+    if ((tc0[2] & tc0[3]) >= 0)
+        ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
+}
+LF_IFUNC(v8, luma_intra, 8, mmxext)
+static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
+                                          int alpha, int beta)
+{
+    ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
+}
+#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+
+LF_FUNC(v,  luma,       10, mmxext)
+LF_IFUNC(v, luma_intra, 10, mmxext)
+
+/***********************************/
+/* weighted prediction */
+
+#define H264_WEIGHT(W, OPT)                                             \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride,   \
+                                      int height, int log2_denom,       \
+                                      int weight, int offset);
+
+#define H264_BIWEIGHT(W, OPT)                                           \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
+                                        ptrdiff_t stride, int height,   \
+                                        int log2_denom, int weightd,    \
+                                        int weights, int offset);
+
+#define H264_BIWEIGHT_MMX(W)                    \
+    H264_WEIGHT(W, mmxext)                      \
+    H264_BIWEIGHT(W, mmxext)
+
+#define H264_BIWEIGHT_MMX_SSE(W)                \
+    H264_BIWEIGHT_MMX(W)                        \
+    H264_WEIGHT(W, sse2)                        \
+    H264_BIWEIGHT(W, sse2)                      \
+    H264_BIWEIGHT(W, ssse3)
+
+H264_BIWEIGHT_MMX_SSE(16)
+H264_BIWEIGHT_MMX_SSE(8)
+H264_BIWEIGHT_MMX(4)
+
+#define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
+                                                    ptrdiff_t stride,   \
+                                                    int height,         \
+                                                    int log2_denom,     \
+                                                    int weight,         \
+                                                    int offset);
+
+#define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
+                                                      uint8_t *src,     \
+                                                      ptrdiff_t stride, \
+                                                      int height,       \
+                                                      int log2_denom,   \
+                                                      int weightd,      \
+                                                      int weights,      \
+                                                      int offset);
+
+#define H264_BIWEIGHT_10_SSE(W, DEPTH)          \
+    H264_WEIGHT_10(W, DEPTH, sse2)              \
+    H264_WEIGHT_10(W, DEPTH, sse4)              \
+    H264_BIWEIGHT_10(W, DEPTH, sse2)            \
+    H264_BIWEIGHT_10(W, DEPTH, sse4)
+
+H264_BIWEIGHT_10_SSE(16, 10)
+H264_BIWEIGHT_10_SSE(8,  10)
+H264_BIWEIGHT_10_SSE(4,  10)
+
+av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
+                                 const int chroma_format_idc)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
+        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
+
+    if (bit_depth == 8) {
+        if (EXTERNAL_MMX(cpu_flags)) {
+            c->h264_idct_dc_add   =
+            c->h264_idct_add      = ff_h264_idct_add_8_mmx;
+            c->h264_idct8_dc_add  =
+            c->h264_idct8_add     = ff_h264_idct8_add_8_mmx;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
+            if (chroma_format_idc <= 1) {
+                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+            } else {
+                c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
+            }
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
+            if (cpu_flags & AV_CPU_FLAG_CMOV)
+                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
+        }
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
+            c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
+            c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
+            if (chroma_format_idc <= 1)
+                c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
+
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
+            }
+#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+            c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
+            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
+            c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
+#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
+            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
+            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
+
+            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
+            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
+            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
+        }
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            c->h264_idct8_add  = ff_h264_idct8_add_8_sse2;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
+            if (chroma_format_idc <= 1)
+                c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
+            c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
+            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
+
+            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
+            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
+
+            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
+            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
+
+            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
+            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+
+#if ARCH_X86_64
+            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
+#endif
+
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_sse2;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_sse2;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
+            } else {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_sse2;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
+            }
+
+            c->h264_idct_add        = ff_h264_idct_add_8_sse2;
+            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_sse2;
+        }
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
+            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
+            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#if ARCH_X86_64
+            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
+#endif
+
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_avx;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_avx;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
+            } else {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
+            }
+
+            c->h264_idct_add        = ff_h264_idct_add_8_avx;
+            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
+        }
+    } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+#if ARCH_X86_32
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
+            }
+            c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
+            c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
+            c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
+            c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmxext;
+#endif /* ARCH_X86_32 */
+            c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
+        }
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            c->h264_idct_add     = ff_h264_idct_add_10_sse2;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
+            if (chroma_format_idc <= 1) {
+                c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
+            } else {
+                c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
+            }
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
+#if HAVE_ALIGNED_STACK
+            c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+
+            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+
+            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
+
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
+            }
+#if HAVE_ALIGNED_STACK
+            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
+            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+        }
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+
+            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->h264_idct_dc_add  =
+            c->h264_idct_add     = ff_h264_idct_add_10_avx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
+            if (chroma_format_idc <= 1) {
+                c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
+            } else {
+                c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
+            }
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
+#if HAVE_ALIGNED_STACK
+            c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
+
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
+            }
+#if HAVE_ALIGNED_STACK
+            c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
+            c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;
+            c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_avx;
+            c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
+        }
+    }
+#endif
+}
--- a/externals/ffmpeg/libavcodec/x86/hevc_add_res.asm
+++ b/externals/ffmpeg/libavcodec/x86/hevc_add_res.asm
@@ -0,0 +1,369 @@
+; *****************************************************************************
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; ******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
+    mova              m0, [r1]
+    mova              m2, [r1+8]
+
+    movd              m1, [r0]
+    movd              m3, [r0+r2]
+    punpcklbw         m1, m4
+    punpcklbw         m3, m4
+
+    paddsw            m0, m1
+    paddsw            m2, m3
+    packuswb          m0, m4
+    packuswb          m2, m4
+
+    movd            [r0], m0
+    movd         [r0+r2], m2
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 3, 6
+    pxor              m4, m4
+    ADD_RES_MMX_4_8
+    add               r1, 16
+    lea               r0, [r0+r2*2]
+    ADD_RES_MMX_4_8
+    RET
+
+%macro ADD_RES_SSE_8_8 0
+    movq              m0, [r0]
+    movq              m1, [r0+r2]
+    punpcklbw         m0, m4
+    punpcklbw         m1, m4
+    mova              m2, [r1]
+    mova              m3, [r1+16]
+    paddsw            m0, m2
+    paddsw            m1, m3
+    packuswb          m0, m1
+
+    movq              m2, [r0+r2*2]
+    movq              m3, [r0+r3]
+    punpcklbw         m2, m4
+    punpcklbw         m3, m4
+    mova              m6, [r1+32]
+    mova              m7, [r1+48]
+    paddsw            m2, m6
+    paddsw            m3, m7
+    packuswb          m2, m3
+
+    movq            [r0], m0
+    movhps       [r0+r2], m0
+    movq       [r0+r2*2], m2
+    movhps       [r0+r3], m2
+%endmacro
+
+%macro ADD_RES_SSE_16_32_8 3
+    mova              m1, [%2]
+    mova              m2, m1
+    punpcklbw         m1, m0
+    punpckhbw         m2, m0
+    mova             xm5, [r1+%1]
+    mova             xm6, [r1+%1+16]
+%if cpuflag(avx2)
+    vinserti128       m5, m5, [r1+%1+32], 1
+    vinserti128       m6, m6, [r1+%1+48], 1
+%endif
+    paddsw            m1, m5
+    paddsw            m2, m6
+
+    mova              m3, [%3]
+    mova              m4, m3
+    punpcklbw         m3, m0
+    punpckhbw         m4, m0
+    mova             xm5, [r1+%1+mmsize*2]
+    mova             xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m5, m5, [r1+%1+96], 1
+    vinserti128       m6, m6, [r1+%1+112], 1
+%endif
+    paddsw            m3, m5
+    paddsw            m4, m6
+
+    packuswb          m1, m2
+    packuswb          m3, m4
+    mova            [%2], m1
+    mova            [%3], m3
+%endmacro
+
+
+%macro TRANSFORM_ADD_8 0
+; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_8_8, 3, 4, 8
+    pxor              m4, m4
+    lea               r3, [r2*3]
+    ADD_RES_SSE_8_8
+    add               r1, 64
+    lea               r0, [r0+r2*4]
+    ADD_RES_SSE_8_8
+    RET
+
+; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_16_8, 3, 5, 7
+    pxor                m0, m0
+    lea                 r3, [r2*3]
+    mov                r4d, 4
+.loop:
+    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
+    add                 r1, 128
+    lea                 r0, [r0+r2*4]
+    dec                r4d
+    jg .loop
+    RET
+
+; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 5, 7
+    pxor                m0, m0
+    mov                r4d, 16
+.loop:
+    ADD_RES_SSE_16_32_8  0, r0,    r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
+    add                 r1, 128
+    lea                 r0, [r0+r2*2]
+    dec                r4d
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+TRANSFORM_ADD_8
+INIT_XMM avx
+TRANSFORM_ADD_8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 5, 7
+    pxor                 m0, m0
+    lea                  r3, [r2*3]
+    mov                 r4d, 8
+.loop:
+    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
+    add                  r1, 256
+    lea                  r0, [r0+r2*4]
+    dec                 r4d
+    jg .loop
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
+
+%macro ADD_RES_SSE_8_10 4
+    mova              m0, [%4]
+    mova              m1, [%4+16]
+    mova              m2, [%4+32]
+    mova              m3, [%4+48]
+    paddw             m0, [%1+0]
+    paddw             m1, [%1+%2]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
+    mova       [%1+%2*2], m2
+    mova         [%1+%3], m3
+%endmacro
+
+%macro ADD_RES_MMX_4_10 3
+    mova              m0, [%1+0]
+    mova              m1, [%1+%2]
+    paddw             m0, [%3]
+    paddw             m1, [%3+8]
+    CLIPW             m0, m2, m3
+    CLIPW             m1, m2, m3
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_10 3
+    mova              m0, [%3]
+    mova              m1, [%3+16]
+    mova              m2, [%3+32]
+    mova              m3, [%3+48]
+    paddw             m0, [%1]
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+%2]
+    paddw             m3, [%1+%2+16]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova            [%1], m0
+    mova         [%1+16], m1
+    mova         [%1+%2], m2
+    mova      [%1+%2+16], m3
+%endmacro
+
+%macro ADD_RES_SSE_32_10 2
+    mova              m0, [%2]
+    mova              m1, [%2+16]
+    mova              m2, [%2+32]
+    mova              m3, [%2+48]
+
+    paddw             m0, [%1]
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+32]
+    paddw             m3, [%1+48]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova            [%1], m0
+    mova         [%1+16], m1
+    mova         [%1+32], m2
+    mova         [%1+48], m3
+%endmacro
+
+%macro ADD_RES_AVX2_16_10 4
+    mova              m0, [%4]
+    mova              m1, [%4+32]
+    mova              m2, [%4+64]
+    mova              m3, [%4+96]
+
+    paddw             m0, [%1+0]
+    paddw             m1, [%1+%2]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
+    mova       [%1+%2*2], m2
+    mova         [%1+%3], m3
+%endmacro
+
+%macro ADD_RES_AVX2_32_10 3
+    mova              m0, [%3]
+    mova              m1, [%3+32]
+    mova              m2, [%3+64]
+    mova              m3, [%3+96]
+
+    paddw             m0, [%1]
+    paddw             m1, [%1+32]
+    paddw             m2, [%1+%2]
+    paddw             m3, [%1+%2+32]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova            [%1], m0
+    mova         [%1+32], m1
+    mova         [%1+%2], m2
+    mova      [%1+%2+32], m3
+%endmacro
+
+; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
+INIT_MMX mmxext
+cglobal hevc_add_residual_4_10, 3, 3, 6
+    pxor              m2, m2
+    mova              m3, [max_pixels_10]
+    ADD_RES_MMX_4_10  r0, r2, r1
+    add               r1, 16
+    lea               r0, [r0+2*r2]
+    ADD_RES_MMX_4_10  r0, r2, r1
+    RET
+
+INIT_XMM sse2
+cglobal hevc_add_residual_8_10, 3, 4, 6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+    lea               r3, [r2*3]
+
+    ADD_RES_SSE_8_10  r0, r2, r3, r1
+    lea               r0, [r0+r2*4]
+    add               r1, 64
+    ADD_RES_SSE_8_10  r0, r2, r3, r1
+    RET
+
+cglobal hevc_add_residual_16_10, 3, 5, 6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    mov              r4d, 8
+.loop:
+    ADD_RES_SSE_16_10 r0, r2, r1
+    lea               r0, [r0+r2*2]
+    add               r1, 64
+    dec              r4d
+    jg .loop
+    RET
+
+cglobal hevc_add_residual_32_10, 3, 5, 6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    mov              r4d, 32
+.loop:
+    ADD_RES_SSE_32_10 r0, r1
+    lea               r0, [r0+r2]
+    add               r1, 64
+    dec              r4d
+    jg .loop
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal hevc_add_residual_16_10, 3, 5, 6
+    pxor               m4, m4
+    mova               m5, [max_pixels_10]
+    lea                r3, [r2*3]
+
+    mov               r4d, 4
+.loop:
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
+    lea                r0, [r0+r2*4]
+    add                r1, 128
+    dec               r4d
+    jg .loop
+    RET
+
+cglobal hevc_add_residual_32_10, 3, 5, 6
+    pxor               m4, m4
+    mova               m5, [max_pixels_10]
+
+    mov               r4d, 16
+.loop:
+    ADD_RES_AVX2_32_10 r0, r2, r1
+    lea                r0, [r0+r2*2]
+    add                r1, 128
+    dec               r4d
+    jg .loop
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
--- a/externals/ffmpeg/libavcodec/x86/hevc_deblock.asm
+++ b/externals/ffmpeg/libavcodec/x86/hevc_deblock.asm
@@ -0,0 +1,871 @@
+;*****************************************************************************
+;* SSE2-optimized HEVC deblocking code
+;*****************************************************************************
+;* Copyright (C) 2013 VTT
+;*
+;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_1023
+%define pw_pixel_max_10 pw_1023
+pw_pixel_max_12: times 8 dw ((1 << 12)-1)
+pw_m2:           times 8 dw -2
+pd_1 :           times 4 dd  1
+
+cextern pw_4
+cextern pw_8
+cextern pw_m1
+
+SECTION .text
+INIT_XMM sse2
+
+; in: 8 rows of 4 bytes in %4..%11
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8B_LOAD 8
+    movd             m0, %1
+    movd             m2, %2
+    movd             m1, %3
+    movd             m3, %4
+
+    punpcklbw        m0, m2
+    punpcklbw        m1, m3
+    punpcklwd        m0, m1
+
+    movd             m4, %5
+    movd             m6, %6
+    movd             m5, %7
+    movd             m3, %8
+
+    punpcklbw        m4, m6
+    punpcklbw        m5, m3
+    punpcklwd        m4, m5
+
+    punpckhdq        m2, m0, m4
+    punpckldq        m0, m4
+
+    pxor             m5, m5
+    punpckhbw        m1, m0, m5
+    punpcklbw        m0, m5
+    punpckhbw        m3, m2, m5
+    punpcklbw        m2, m5
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 bytes in %1..%8
+%macro TRANSPOSE8x4B_STORE 8
+    packuswb         m0, m2
+    packuswb         m1, m3
+    SBUTTERFLY bw, 0, 1, 2
+    SBUTTERFLY wd, 0, 1, 2
+
+    movd             %1, m0
+    pshufd           m0, m0, 0x39
+    movd             %2, m0
+    pshufd           m0, m0, 0x39
+    movd             %3, m0
+    pshufd           m0, m0, 0x39
+    movd             %4, m0
+
+    movd             %5, m1
+    pshufd           m1, m1, 0x39
+    movd             %6, m1
+    pshufd           m1, m1, 0x39
+    movd             %7, m1
+    pshufd           m1, m1, 0x39
+    movd             %8, m1
+%endmacro
+
+; in: 8 rows of 4 words in %4..%11
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8W_LOAD 8
+    movq             m0, %1
+    movq             m2, %2
+    movq             m1, %3
+    movq             m3, %4
+
+    punpcklwd        m0, m2
+    punpcklwd        m1, m3
+    punpckhdq        m2, m0, m1
+    punpckldq        m0, m1
+
+    movq             m4, %5
+    movq             m6, %6
+    movq             m5, %7
+    movq             m3, %8
+
+    punpcklwd        m4, m6
+    punpcklwd        m5, m3
+    punpckhdq        m6, m4, m5
+    punpckldq        m4, m5
+
+    punpckhqdq       m1, m0, m4
+    punpcklqdq       m0, m4
+    punpckhqdq       m3, m2, m6
+    punpcklqdq       m2, m6
+
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 words in %1..%8
+%macro TRANSPOSE8x4W_STORE 9
+    TRANSPOSE4x4W     0, 1, 2, 3, 4
+
+    pxor             m5, m5; zeros reg
+    CLIPW            m0, m5, %9
+    CLIPW            m1, m5, %9
+    CLIPW            m2, m5, %9
+    CLIPW            m3, m5, %9
+
+    movq             %1, m0
+    movhps           %2, m0
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
+%endmacro
+
+; in: 8 rows of 8 bytes in %1..%8
+; out: 8 rows of 8 words in m0..m7
+%macro TRANSPOSE8x8B_LOAD 8
+    movq             m7, %1
+    movq             m2, %2
+    movq             m1, %3
+    movq             m3, %4
+
+    punpcklbw        m7, m2
+    punpcklbw        m1, m3
+    punpcklwd        m3, m7, m1
+    punpckhwd        m7, m1
+
+    movq             m4, %5
+    movq             m6, %6
+    movq             m5, %7
+    movq            m15, %8
+
+    punpcklbw        m4, m6
+    punpcklbw        m5, m15
+    punpcklwd        m9, m4, m5
+    punpckhwd        m4, m5
+
+    punpckldq        m1, m3, m9;  0, 1
+    punpckhdq        m3, m9;  2, 3
+
+    punpckldq        m5, m7, m4;  4, 5
+    punpckhdq        m7, m4;  6, 7
+
+    pxor            m13, m13
+
+    punpcklbw        m0, m1, m13; 0 in 16 bit
+    punpckhbw        m1, m13; 1 in 16 bit
+
+    punpcklbw        m2, m3, m13; 2
+    punpckhbw        m3, m13; 3
+
+    punpcklbw        m4, m5, m13; 4
+    punpckhbw        m5, m13; 5
+
+    punpcklbw        m6, m7, m13; 6
+    punpckhbw        m7, m13; 7
+%endmacro
+
+
+; in: 8 rows of 8 words in m0..m8
+; out: 8 rows of 8 bytes in %1..%8
+%macro TRANSPOSE8x8B_STORE 8
+    packuswb         m0, m4
+    packuswb         m1, m5
+    packuswb         m2, m6
+    packuswb         m3, m7
+    TRANSPOSE2x4x4B   0, 1, 2, 3, 4
+
+    movq             %1, m0
+    movhps           %2, m0
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
+%endmacro
+
+; in: 8 rows of 8 words in %1..%8
+; out: 8 rows of 8 words in m0..m7
+%macro TRANSPOSE8x8W_LOAD 8
+    movdqu           m0, %1
+    movdqu           m1, %2
+    movdqu           m2, %3
+    movdqu           m3, %4
+    movdqu           m4, %5
+    movdqu           m5, %6
+    movdqu           m6, %7
+    movdqu           m7, %8
+    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
+%endmacro
+
+; in: 8 rows of 8 words in m0..m8
+; out: 8 rows of 8 words in %1..%8
+%macro TRANSPOSE8x8W_STORE 9
+    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
+
+    pxor             m8, m8
+    CLIPW            m0, m8, %9
+    CLIPW            m1, m8, %9
+    CLIPW            m2, m8, %9
+    CLIPW            m3, m8, %9
+    CLIPW            m4, m8, %9
+    CLIPW            m5, m8, %9
+    CLIPW            m6, m8, %9
+    CLIPW            m7, m8, %9
+
+    movdqu           %1, m0
+    movdqu           %2, m1
+    movdqu           %3, m2
+    movdqu           %4, m3
+    movdqu           %5, m4
+    movdqu           %6, m5
+    movdqu           %7, m6
+    movdqu           %8, m7
+%endmacro
+
+
+; in: %2 clobbered
+; out: %1
+; mask in m11
+; clobbers m10
+%macro MASKED_COPY 2
+    pand             %2, m11 ; and mask
+    pandn           m10, m11, %1; and -mask
+    por              %2, m10
+    mova             %1, %2
+%endmacro
+
+; in: %2 clobbered
+; out: %1
+; mask in %3, will be clobbered
+%macro MASKED_COPY2 3
+    pand             %2, %3 ; and mask
+    pandn            %3, %1; and -mask
+    por              %2, %3
+    mova             %1, %2
+%endmacro
+
+ALIGN 16
+; input in m0 ... m3 and tcs in r2. Output in m1 and m2
+%macro CHROMA_DEBLOCK_BODY 1
+    psubw            m4, m2, m1; q0 - p0
+    psubw            m5, m0, m3; p1 - q1
+    psllw            m4, 2; << 2
+    paddw            m5, m4;
+
+    ;tc calculations
+    movq             m6, [tcq]; tc0
+    punpcklwd        m6, m6
+    pshufd           m6, m6, 0xA0; tc0, tc1
+%if cpuflag(ssse3)
+    psignw           m4, m6, [pw_m1]; -tc0, -tc1
+%else
+    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
+%endif
+    ;end tc calculations
+
+    paddw            m5, [pw_4]; +4
+    psraw            m5, 3; >> 3
+
+%if %1 > 8
+    psllw            m4, %1-8; << (BIT_DEPTH - 8)
+    psllw            m6, %1-8; << (BIT_DEPTH - 8)
+%endif
+    pmaxsw           m5, m4
+    pminsw           m5, m6
+    paddw            m1, m5; p0 + delta0
+    psubw            m2, m5; q0 - delta0
+%endmacro
+
+; input in m0 ... m7, beta in r2 tcs in r3. Output in m1...m6
+%macro LUMA_DEBLOCK_BODY 2
+    psllw            m9, m2, 1; *2
+    psubw           m10, m1, m9
+    paddw           m10, m3
+    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3
+
+    psllw            m9, m5, 1; *2
+    psubw           m11, m6, m9
+    paddw           m11, m4
+    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3
+
+    ;beta calculations
+%if %1 > 8
+    shl             betaq, %1 - 8
+%endif
+    movd            m13, betad
+    SPLATW          m13, m13, 0
+    ;end beta calculations
+
+    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3
+
+    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
+    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low
+
+    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
+    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
+
+    paddw           m14, m9; 0d0+0d3, 1d0+1d3
+
+    ;compare
+    pcmpgtw         m15, m13, m14
+    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
+    test            r13, r13
+    je              .bypassluma
+
+    ;weak / strong decision compare to beta_2
+    psraw           m15, m13, 2;   beta >> 2
+    psllw            m8, m9, 1;
+    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
+    movmskps        r6, m15;
+    ;end weak / strong decision
+
+    ; weak filter nd_p/q calculation
+    pshufd           m8, m10, 0x31
+    psrld            m8, 16
+    paddw            m8, m10
+    movd            r7d, m8
+    pshufd           m8, m8, 0x4E
+    movd            r8d, m8
+
+    pshufd           m8, m11, 0x31
+    psrld            m8, 16
+    paddw            m8, m11
+    movd            r9d, m8
+    pshufd           m8, m8, 0x4E
+    movd           r10d, m8
+    ; end calc for weak filter
+
+    ; filtering mask
+    mov             r11, r13
+    shr             r11, 3
+    movd            m15, r11d
+    and             r13, 1
+    movd            m11, r13d
+    shufps          m11, m15, 0
+    shl             r11, 1
+    or              r13, r11
+
+    pcmpeqd         m11, [pd_1]; filtering mask
+
+    ;decide between strong and weak filtering
+    ;tc25 calculations
+    mov            r11d, [tcq];
+%if %1 > 8
+    shl             r11, %1 - 8
+%endif
+    movd             m8, r11d; tc0
+    mov             r3d, [tcq+4];
+%if %1 > 8
+    shl              r3, %1 - 8
+%endif
+    add            r11d, r3d; tc0 + tc1
+    jz             .bypassluma
+    movd             m9, r3d; tc1
+    punpcklwd        m8, m8
+    punpcklwd        m9, m9
+    shufps           m8, m9, 0; tc0, tc1
+    mova             m9, m8
+    psllw            m8, 2; tc << 2
+    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
+    ;end tc25 calculations
+
+    ;----beta_3 comparison-----
+    psubw           m12, m0, m3;      p3 - p0
+    ABS1            m12, m14; abs(p3 - p0)
+
+    psubw           m15, m7, m4;      q3 - q0
+    ABS1            m15, m14; abs(q3 - q0)
+
+    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)
+
+    pshufhw         m12, m12, 0xf0 ;0b11110000;
+    pshuflw         m12, m12, 0xf0 ;0b11110000;
+
+    psraw           m13, 3; beta >> 3
+    pcmpgtw         m13, m12;
+    movmskps        r11, m13;
+    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
+    ;----beta_3 comparison end-----
+    ;----tc25 comparison---
+    psubw           m12, m3, m4;      p0 - q0
+    ABS1            m12, m14; abs(p0 - q0)
+
+    pshufhw         m12, m12, 0xf0 ;0b11110000;
+    pshuflw         m12, m12, 0xf0 ;0b11110000;
+
+    pcmpgtw          m8, m12; tc25 comparisons
+    movmskps        r11, m8;
+    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+    ;----tc25 comparison end---
+    mov             r11, r6;
+    shr             r11, 1;
+    and             r6, r11; strong mask, bits 2 and 0
+
+    pmullw          m14, m9, [pw_m2]; -tc * 2
+    paddw            m9, m9
+
+    and             r6, 5; 0b101
+    mov             r11, r6; strong mask
+    shr             r6, 2;
+    movd            m12, r6d; store to xmm for mask generation
+    shl             r6, 1
+    and             r11, 1
+    movd            m10, r11d; store to xmm for mask generation
+    or              r6, r11; final strong mask, bits 1 and 0
+    jz      .weakfilter
+
+    shufps          m10, m12, 0
+    pcmpeqd         m10, [pd_1]; strong mask
+
+    mova            m13, [pw_4]; 4 in every cell
+    pand            m11, m10; combine filtering mask and strong mask
+    paddw           m12, m2, m3;          p1 +   p0
+    paddw           m12, m4;          p1 +   p0 +   q0
+    mova            m10, m12; copy
+    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
+    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
+    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
+    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
+    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
+    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
+    pmaxsw          m12, m14
+    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
+    paddw           m12, m3; p0'
+
+    paddw           m15, m1, m10; p2 + p1 + p0 + q0
+    psrlw           m13, 1; 2 in every cell
+    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
+    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
+    psubw           m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
+    pmaxsw          m15, m14
+    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
+    paddw           m15, m2; p1'
+
+    paddw            m8, m1, m0;     p3 +   p2
+    paddw            m8, m8;   2*p3 + 2*p2
+    paddw            m8, m1;   2*p3 + 3*p2
+    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
+    paddw           m13, m13
+    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
+    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
+    pmaxsw           m8, m14
+    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
+    paddw            m8, m1; p2'
+    MASKED_COPY      m1, m8
+
+    paddw            m8, m3, m4;         p0 +   q0
+    paddw            m8, m5;         p0 +   q0 +   q1
+    paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
+    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
+    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
+    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
+    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
+    psubw            m8, m4;
+    pmaxsw           m8, m14
+    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
+    paddw            m8, m4; q0'
+    MASKED_COPY      m2, m15
+
+    paddw           m15, m3, m4;   p0 + q0
+    paddw           m15, m5;   p0 + q0 + q1
+    mova            m10, m15;
+    paddw           m15, m6;   p0 + q0 + q1 + q2
+    psrlw           m13, 1; 2 in every cell
+    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
+    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
+    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
+    pmaxsw          m15, m14
+    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
+    paddw           m15, m5; q1'
+
+    paddw           m13, m7;      q3 + 2
+    paddw           m13, m6;      q3 +  q2 + 2
+    paddw           m13, m13;   2*q3 + 2*q2 + 4
+    paddw           m13, m6;    2*q3 + 3*q2 + 4
+    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
+    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
+    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
+    pmaxsw          m13, m14
+    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
+    paddw           m13, m6; q2'
+
+    MASKED_COPY      m6, m13
+    MASKED_COPY      m5, m15
+    MASKED_COPY      m4, m8
+    MASKED_COPY      m3, m12
+
+.weakfilter:
+    not             r6; strong mask -> weak mask
+    and             r6, r13; final weak filtering mask, bits 0 and 1
+    jz             .store
+
+    ; weak filtering mask
+    mov             r11, r6
+    shr             r11, 1
+    movd            m12, r11d
+    and             r6, 1
+    movd            m11, r6d
+    shufps          m11, m12, 0
+    pcmpeqd         m11, [pd_1]; filtering mask
+
+    mov             r13, betaq
+    shr             r13, 1;
+    add             betaq, r13
+    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))
+
+    mova            m13, [pw_8]
+    psubw           m12, m4, m3 ; q0 - p0
+    psllw           m10, m12, 3; 8 * (q0 - p0)
+    paddw           m12, m10 ; 9 * (q0 - p0)
+
+    psubw           m10, m5, m2 ; q1 - p1
+    psllw            m8, m10, 1; 2 * ( q1 - p1 )
+    paddw           m10, m8; 3 * ( q1 - p1 )
+    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
+    paddw           m12, m13; + 8
+    psraw           m12, 4; >> 4 , delta0
+    PABSW           m13, m12; abs(delta0)
+
+
+    psllw           m10, m9, 2; 8 * tc
+    paddw           m10, m9; 10 * tc
+    pcmpgtw         m10, m13
+    pand            m11, m10
+
+    psraw            m9, 1;   tc * 2 -> tc
+    psraw           m14, 1; -tc * 2 -> -tc
+
+    pmaxsw          m12, m14
+    pminsw          m12, m9;  av_clip(delta0, -tc, tc)
+
+    psraw            m9, 1;   tc -> tc / 2
+%if cpuflag(ssse3)
+    psignw          m14, m9, [pw_m1]; -tc / 2
+%else
+    pmullw          m14, m9, [pw_m1]; -tc / 2
+%endif
+
+    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
+    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
+    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
+    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
+    pmaxsw          m15, m14
+    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
+    paddw           m15, m2; p1'
+
+    ;beta calculations
+    movd            m10, betad
+    SPLATW          m10, m10, 0
+
+    movd            m13, r7d; 1dp0 + 1dp3
+    movd             m8, r8d; 0dp0 + 0dp3
+    punpcklwd        m8, m8
+    punpcklwd       m13, m13
+    shufps          m13, m8, 0;
+    pcmpgtw          m8, m10, m13
+    pand             m8, m11
+    ;end beta calculations
+    MASKED_COPY2     m2, m15, m8; write p1'
+
+    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
+    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
+    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
+    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
+    pmaxsw           m8, m14
+    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
+    paddw            m8, m5; q1'
+
+    movd            m13, r9d;
+    movd            m15, r10d;
+    punpcklwd       m15, m15
+    punpcklwd       m13, m13
+    shufps          m13, m15, 0; dq0 + dq3
+
+    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
+    pand            m10, m11
+    MASKED_COPY2     m5, m8, m10; write q1'
+
+    paddw           m15, m3, m12 ; p0 + delta0
+    MASKED_COPY      m3, m15
+
+    psubw            m8, m4, m12 ; q0 - delta0
+    MASKED_COPY      m4, m8
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
+;                                   uint8_t *_no_p, uint8_t *_no_q);
+;-----------------------------------------------------------------------------
+%macro LOOP_FILTER_CHROMA 0
+cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 2
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    CHROMA_DEBLOCK_BODY 8
+    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    RET
+
+cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    CHROMA_DEBLOCK_BODY 10
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
+    RET
+
+cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    CHROMA_DEBLOCK_BODY 12
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
+;                                   uint8_t *_no_p, uint8_t *_no_q);
+;-----------------------------------------------------------------------------
+cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
+    mov           pix0q, pixq
+    sub           pix0q, strideq
+    sub           pix0q, strideq
+    movq             m0, [pix0q];    p1
+    movq             m1, [pix0q+strideq]; p0
+    movq             m2, [pixq];    q0
+    movq             m3, [pixq+strideq]; q1
+    pxor             m5, m5; zeros reg
+    punpcklbw        m0, m5
+    punpcklbw        m1, m5
+    punpcklbw        m2, m5
+    punpcklbw        m3, m5
+    CHROMA_DEBLOCK_BODY  8
+    packuswb         m1, m2
+    movh[pix0q+strideq], m1
+    movhps       [pixq], m1
+    RET
+
+cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
+    CHROMA_DEBLOCK_BODY 10
+    pxor            m5, m5; zeros reg
+    CLIPW           m1, m5, [pw_pixel_max_10]
+    CLIPW           m2, m5, [pw_pixel_max_10]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
+    RET
+
+cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
+    CHROMA_DEBLOCK_BODY 12
+    pxor            m5, m5; zeros reg
+    CLIPW           m1, m5, [pw_pixel_max_12]
+    CLIPW           m2, m5, [pw_pixel_max_12]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
+    RET
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_CHROMA
+INIT_XMM avx
+LOOP_FILTER_CHROMA
+
+%if ARCH_X86_64
+%macro LOOP_FILTER_LUMA 0
+;-----------------------------------------------------------------------------
+; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
+;-----------------------------------------------------------------------------
+cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 4
+    lea           pix0q, [3 * r1]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
+    LUMA_DEBLOCK_BODY 8, v
+.store:
+    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
+.bypassluma:
+    RET
+
+cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 8
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+    LUMA_DEBLOCK_BODY 10, v
+.store:
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
+.bypassluma:
+    RET
+
+cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 8
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+    LUMA_DEBLOCK_BODY 12, v
+.store:
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
+.bypassluma:
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
+;-----------------------------------------------------------------------------
+cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    lea     src3strideq, [3 * strideq]
+    mov           pix0q, pixq
+    sub           pix0q, src3strideq
+    sub           pix0q, strideq
+    movq             m0, [pix0q];               p3
+    movq             m1, [pix0q +     strideq]; p2
+    movq             m2, [pix0q + 2 * strideq]; p1
+    movq             m3, [pix0q + src3strideq]; p0
+    movq             m4, [pixq];                q0
+    movq             m5, [pixq +     strideq];  q1
+    movq             m6, [pixq + 2 * strideq];  q2
+    movq             m7, [pixq + src3strideq];  q3
+    pxor             m8, m8
+    punpcklbw        m0, m8
+    punpcklbw        m1, m8
+    punpcklbw        m2, m8
+    punpcklbw        m3, m8
+    punpcklbw        m4, m8
+    punpcklbw        m5, m8
+    punpcklbw        m6, m8
+    punpcklbw        m7, m8
+    LUMA_DEBLOCK_BODY 8, h
+.store:
+    packuswb          m1, m2
+    packuswb          m3, m4
+    packuswb          m5, m6
+    movh   [pix0q +     strideq], m1
+    movhps [pix0q + 2 * strideq], m1
+    movh   [pix0q + src3strideq], m3
+    movhps [pixq               ], m3
+    movh   [pixq  +     strideq], m5
+    movhps [pixq  + 2 * strideq], m5
+.bypassluma:
+    RET
+
+cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    lea                  src3strideq, [3 * strideq]
+    mov                        pix0q, pixq
+    sub                        pix0q, src3strideq
+    sub                        pix0q, strideq
+    movdqu                        m0, [pix0q];               p3
+    movdqu                        m1, [pix0q +     strideq]; p2
+    movdqu                        m2, [pix0q + 2 * strideq]; p1
+    movdqu                        m3, [pix0q + src3strideq]; p0
+    movdqu                        m4, [pixq];                q0
+    movdqu                        m5, [pixq  +     strideq]; q1
+    movdqu                        m6, [pixq  + 2 * strideq]; q2
+    movdqu                        m7, [pixq  + src3strideq]; q3
+    LUMA_DEBLOCK_BODY             10, h
+.store:
+    pxor                          m8, m8; zeros reg
+    CLIPW                         m1, m8, [pw_pixel_max_10]
+    CLIPW                         m2, m8, [pw_pixel_max_10]
+    CLIPW                         m3, m8, [pw_pixel_max_10]
+    CLIPW                         m4, m8, [pw_pixel_max_10]
+    CLIPW                         m5, m8, [pw_pixel_max_10]
+    CLIPW                         m6, m8, [pw_pixel_max_10]
+    movdqu     [pix0q +     strideq], m1;  p2
+    movdqu     [pix0q + 2 * strideq], m2;  p1
+    movdqu     [pix0q + src3strideq], m3;  p0
+    movdqu     [pixq               ], m4;  q0
+    movdqu     [pixq  +     strideq], m5;  q1
+    movdqu     [pixq  + 2 * strideq], m6;  q2
+.bypassluma:
+    RET
+
+cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    lea                  src3strideq, [3 * strideq]
+    mov                        pix0q, pixq
+    sub                        pix0q, src3strideq
+    sub                        pix0q, strideq
+    movdqu                        m0, [pix0q];               p3
+    movdqu                        m1, [pix0q +     strideq]; p2
+    movdqu                        m2, [pix0q + 2 * strideq]; p1
+    movdqu                        m3, [pix0q + src3strideq]; p0
+    movdqu                        m4, [pixq];                q0
+    movdqu                        m5, [pixq  +     strideq]; q1
+    movdqu                        m6, [pixq  + 2 * strideq]; q2
+    movdqu                        m7, [pixq  + src3strideq]; q3
+    LUMA_DEBLOCK_BODY             12, h
+.store:
+    pxor                          m8, m8; zeros reg
+    CLIPW                         m1, m8, [pw_pixel_max_12]
+    CLIPW                         m2, m8, [pw_pixel_max_12]
+    CLIPW                         m3, m8, [pw_pixel_max_12]
+    CLIPW                         m4, m8, [pw_pixel_max_12]
+    CLIPW                         m5, m8, [pw_pixel_max_12]
+    CLIPW                         m6, m8, [pw_pixel_max_12]
+    movdqu     [pix0q +     strideq], m1;  p2
+    movdqu     [pix0q + 2 * strideq], m2;  p1
+    movdqu     [pix0q + src3strideq], m3;  p0
+    movdqu     [pixq               ], m4;  q0
+    movdqu     [pixq  +     strideq], m5;  q1
+    movdqu     [pixq  + 2 * strideq], m6;  q2
+.bypassluma:
+    RET
+
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_LUMA
+INIT_XMM ssse3
+LOOP_FILTER_LUMA
+INIT_XMM avx
+LOOP_FILTER_LUMA
+%endif
--- a/externals/ffmpeg/libavcodec/x86/hevc_idct.asm
+++ b/externals/ffmpeg/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,853 @@
+;*******************************************************************************
+;* SIMD-optimized IDCT functions for HEVC decoding
+;* Copyright (c) 2014 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;* Copyright (c) 2016 Alexandra Hájková
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_64: times 4 dd 64
+pd_2048: times 4 dd 2048
+pd_512: times 4 dd 512
+
+; 4x4 transform coeffs
+cextern pw_64
+pw_64_m64: times 4 dw 64, -64
+pw_83_36: times 4 dw 83, 36
+pw_36_m83: times 4 dw 36, -83
+
+; 8x8 transform coeffs
+pw_89_75: times 4 dw 89, 75
+pw_50_18: times 4 dw 50, 18
+
+pw_75_m18: times 4 dw 75, -18
+pw_m89_m50: times 4 dw -89, -50
+
+pw_50_m89: times 4 dw 50, -89
+pw_18_75: times 4 dw 18, 75
+
+pw_18_m50: times 4 dw 18, -50
+pw_75_m89: times 4 dw 75, -89
+
+; 16x16 transformation coeffs
+trans_coeffs16: times 4 dw 90, 87
+times 4 dw 80, 70
+times 4 dw 57, 43
+times 4 dw 25, 9
+
+times 4 dw 87, 57
+times 4 dw 9, -43
+times 4 dw -80, -90
+times 4 dw -70, -25
+
+times 4 dw 80, 9
+times 4 dw -70, -87
+times 4 dw -25, 57
+times 4 dw 90, 43
+
+times 4 dw 70, -43
+times 4 dw -87, 9
+times 4 dw 90, 25
+times 4 dw -80, -57
+
+times 4 dw 57, -80
+times 4 dw -25, 90
+times 4 dw -9, -87
+times 4 dw 43, 70
+
+times 4 dw 43, -90
+times 4 dw 57, 25
+times 4 dw -87, 70
+times 4 dw 9, -80
+
+times 4 dw 25, -70
+times 4 dw 90, -80
+times 4 dw 43, 9
+times 4 dw -57, 87
+
+times 4 dw 9, -25
+times 4 dw 43, -57
+times 4 dw 70, -80
+times 4 dw 87, -90
+
+; 32x32 transform coeffs
+trans_coeff32: times 8 dw 90
+times 4 dw 88, 85
+times 4 dw 82, 78
+times 4 dw 73, 67
+times 4 dw 61, 54
+times 4 dw 46, 38
+times 4 dw 31, 22
+times 4 dw 13, 4
+
+times 4 dw 90, 82
+times 4 dw 67, 46
+times 4 dw 22, -4
+times 4 dw -31, -54
+times 4 dw -73, -85
+times 4 dw -90, -88
+times 4 dw -78, -61
+times 4 dw -38, -13
+
+times 4 dw 88, 67
+times 4 dw 31, -13
+times 4 dw -54, -82
+times 4 dw -90, -78
+times 4 dw -46, -4
+times 4 dw 38, 73
+times 4 dw 90, 85
+times 4 dw 61, 22
+
+times 4 dw 85, 46
+times 4 dw -13, -67
+times 4 dw -90, -73
+times 4 dw -22, 38
+times 4 dw 82, 88
+times 4 dw 54, -4
+times 4 dw -61, -90
+times 4 dw -78, -31
+
+times 4 dw 82, 22
+times 4 dw -54, -90
+times 4 dw -61, 13
+times 4 dw 78, 85
+times 4 dw 31, -46
+times 4 dw -90, -67
+times 4 dw 4, 73
+times 4 dw 88, 38
+
+times 4 dw 78, -4
+times 4 dw -82, -73
+times 4 dw 13, 85
+times 4 dw 67, -22
+times 4 dw -88, -61
+times 4 dw 31, 90
+times 4 dw 54, -38
+times 4 dw -90, -46
+
+times 4 dw 73, -31
+times 4 dw -90, -22
+times 4 dw 78, 67
+times 4 dw -38, -90
+times 4 dw -13, 82
+times 4 dw 61, -46
+times 4 dw -88, -4
+times 4 dw 85, 54
+
+times 4 dw 67, -54
+times 4 dw -78, 38
+times 4 dw 85, -22
+times 4 dw -90, 4
+times 4 dw 90, 13
+times 4 dw -88, -31
+times 4 dw 82, 46
+times 4 dw -73, -61
+
+times 4 dw 61, -73
+times 4 dw -46, 82
+times 4 dw 31, -88
+times 4 dw -13, 90
+times 4 dw -4, -90
+times 4 dw 22, 85
+times 4 dw -38, -78
+times 4 dw 54, 67
+
+times 4 dw 54, -85
+times 4 dw -4, 88
+times 4 dw -46, -61
+times 4 dw 82, 13
+times 4 dw -90, 38
+times 4 dw 67, -78
+times 4 dw -22, 90
+times 4 dw -31, -73
+
+times 4 dw 46, -90
+times 4 dw 38, 54
+times 4 dw -90, 31
+times 4 dw 61, -88
+times 4 dw 22, 67
+times 4 dw -85, 13
+times 4 dw 73, -82
+times 4 dw 4, 78
+
+times 4 dw 38, -88
+times 4 dw 73, -4
+times 4 dw -67, 90
+times 4 dw -46, -31
+times 4 dw 85, -78
+times 4 dw 13, 61
+times 4 dw -90, 54
+times 4 dw 22, -82
+
+times 4 dw 31, -78
+times 4 dw 90, -61
+times 4 dw 4, 54
+times 4 dw -88, 82
+times 4 dw -38, -22
+times 4 dw 73, -90
+times 4 dw 67, -13
+times 4 dw -46, 85
+
+times 4 dw 22, -61
+times 4 dw 85, -90
+times 4 dw 73, -38
+times 4 dw -4, 46
+times 4 dw -78, 90
+times 4 dw -82, 54
+times 4 dw -13, -31
+times 4 dw 67, -88
+
+times 4 dw 13, -38
+times 4 dw 61, -78
+times 4 dw 88, -90
+times 4 dw 85, -73
+times 4 dw 54, -31
+times 4 dw 4, 22
+times 4 dw -46, 67
+times 4 dw -82, 90
+
+times 4 dw 4, -13
+times 4 dw 22, -31
+times 4 dw 38, -46
+times 4 dw 54, -61
+times 4 dw 67, -73
+times 4 dw 78, -82
+times 4 dw 85, -88
+times 4 dw 90, -90
+
+SECTION .text
+
+; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; %1 = HxW
+; %2 = number of loops
+; %3 = bitdepth
+%macro IDCT_DC 3
+cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]
+    add               tmpd, (1 << (14 - %3)) + 1
+    sar               tmpd, (15 - %3)
+    movd               xm0, tmpd
+    SPLATW              m0, xm0
+    DEFINE_ARGS coeff, cnt
+    mov               cntd, %2
+.loop:
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+    add  coeffq, mmsize*8
+    mova [coeffq+mmsize*-4], m0
+    mova [coeffq+mmsize*-3], m0
+    mova [coeffq+mmsize*-2], m0
+    mova [coeffq+mmsize*-1], m0
+    dec  cntd
+    jg  .loop
+    RET
+%endmacro
+
+; %1 = HxW
+; %2 = bitdepth
+%macro IDCT_DC_NL 2 ; No loop
+cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]
+    add               tmpd, (1 << (14 - %2)) + 1
+    sar               tmpd, (15 - %2)
+    movd                m0, tmpd
+    SPLATW              m0, xm0
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+%if mmsize == 16
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+%endif
+    RET
+%endmacro
+
+; IDCT 4x4, expects input in m0, m1
+; %1 - shift
+; %2 - 1/0 - SCALE and Transpose or not
+; %3 - 1/0 add constant or not
+%macro TR_4x4 3
+    ; interleaves src0 with src2 to m0
+    ;         and src1 with scr3 to m2
+    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
+    ; src1: 10 11 12 13 -->
+    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
+    ; src3: 30 31 32 33
+
+    SBUTTERFLY wd, 0, 1, 2
+
+    pmaddwd m2, m0, [pw_64]    ; e0
+    pmaddwd m3, m1, [pw_83_36] ; o0
+    pmaddwd m0, [pw_64_m64]    ; e1
+    pmaddwd m1, [pw_36_m83]    ; o1
+
+%if %3 == 1
+    %assign %%add 1 << (%1 - 1)
+    mova  m4, [pd_ %+ %%add]
+    paddd m2, m4
+    paddd m0, m4
+%endif
+
+    SUMSUB_BADC d, 3, 2, 1, 0, 4
+
+%if %2 == 1
+    psrad m3, %1 ; e0 + o0
+    psrad m1, %1 ; e1 + o1
+    psrad m2, %1 ; e0 - o0
+    psrad m0, %1 ; e1 - o1
+    ;clip16
+    packssdw m3, m1
+    packssdw m0, m2
+    ; Transpose
+    SBUTTERFLY wd, 3, 0, 1
+    SBUTTERFLY wd, 3, 0, 1
+    SWAP 3, 1, 0
+%else
+    SWAP 3, 2, 0
+%endif
+%endmacro
+
+%macro DEFINE_BIAS 1
+    %assign shift (20 - %1)
+    %assign c_add (1 << (shift - 1))
+    %define arr_add pd_ %+ c_add
+%endmacro
+
+; %1 - bit_depth
+; %2 - register add constant
+; is loaded to
+; shift = 20 - bit_depth
+%macro LOAD_BIAS 2
+    DEFINE_BIAS %1
+    mova %2, [arr_add]
+%endmacro
+
+; %1, %2 - registers to load packed 16 bit values to
+; %3, %4, %5, %6 - vertical offsets
+; %7 - horizontal offset
+%macro LOAD_BLOCK 7
+    movq   %1, [r0 + %3 + %7]
+    movhps %1, [r0 + %5 + %7]
+    movq   %2, [r0 + %4 + %7]
+    movhps %2, [r0 + %6 + %7]
+%endmacro
+
+; void ff_hevc_idct_4x4__{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_4x4 1
+cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
+    mova m0, [coeffsq]
+    mova m1, [coeffsq + 16]
+
+    TR_4x4 7, 1, 1
+    TR_4x4 20 - %1, 1, 1
+
+    mova [coeffsq],      m0
+    mova [coeffsq + 16], m1
+    RET
+%endmacro
+
+; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
+; 4 at one time (4 columns)                        1 e8[1] + o8[1]
+; from %5: e8/16 + o8/16, with %1 offset                  ...
+; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
+; %4 - shift                                       7 e8[0] - o8[0] --> + %2
+%macro STORE_8 7
+    psrad    %5, %4
+    psrad    %3, %4
+    packssdw %5, %3
+    movq     [coeffsq + %1], %5
+    movhps   [coeffsq + %2], %5
+%endmacro
+
+; %1 - horizontal offset
+; %2 - shift
+; %3, %4 - transform coeffs
+; %5 - vertical offset for e8 + o8
+; %6 - vertical offset for e8 - o8
+; %7 - register with e8 inside
+; %8 - block_size
+; %9 - register to store e8 +o8
+; %10 - register to store e8 - o8
+%macro E8_O8 10
+    pmaddwd m6, m4, %3
+    pmaddwd m7, m5, %4
+
+    paddd m6, m7
+    paddd m7, m6, %7 ; o8 + e8
+    psubd %7, m6     ; e8 - o8
+%if %8 == 8
+    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
+%else
+    SWAP m7, %9
+    SWAP %7, %10
+%endif
+%endmacro
+
+; 8x4 residuals are processed and stored
+; %1 - horizontal offset
+; %2 - shift
+; %3 - offset of the even row
+; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
+; %5 - offset of the odd row
+; %6 - block size
+; %7 - 1/0 add a constant in TR_4x4 or not
+; I want to add a constant for 8x8 transform but not for 16x16 and 32x32
+%macro TR_8x4 7
+    ; load 4 columns of even rows
+    LOAD_BLOCK  m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1
+
+    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only
+
+    ; load 4 columns of odd rows
+    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1
+
+    ; 00 01 02 03
+    ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33
+
+    ; ...        -- >
+    ;                  m5: 50 70 51 71 52 72 53 73
+    ; 70 71 72 73
+    SBUTTERFLY wd, 4, 5, 6
+
+    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8, m15
+    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9, m14
+    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
+    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
+%endmacro
+
+%macro STORE_PACKED 7
+    movq   [r0 + %3 + %7], %1
+    movhps [r0 + %4 + %7], %1
+    movq   [r0 + %5 + %7], %2
+    movhps [r0 + %6 + %7], %2
+%endmacro
+
+; transpose 4x4 block packed
+; in %1 and %2 registers
+; %3 - temporary register
+%macro TRANSPOSE_4x4 3
+    SBUTTERFLY wd, %1, %2, %3
+    SBUTTERFLY dq, %1, %2, %3
+%endmacro
+
+; %1 - horizontal offset of the block i
+; %2 - vertical offset of the block i
+; %3 - width in bytes
+; %4 - vertical offset for the block j
+; %5 - horizontal offset for the block j
+%macro SWAP_BLOCKS 5
+    ; M_j
+    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+    TRANSPOSE_4x4 4, 5, 6
+
+    ; M_i
+    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+
+    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+
+    ; transpose and store M_i
+    SWAP m6, m4
+    SWAP m7, m5
+    TRANSPOSE_4x4 4, 5, 6
+    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+%endmacro
+
+; %1 - horizontal offset
+; %2 - vertical offset of the block
+; %3 - width in bytes
+%macro TRANSPOSE_BLOCK 3
+    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+    TRANSPOSE_4x4 4, 5, 6
+    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+%endmacro
+
+%macro TRANSPOSE_8x8 0
+cglobal hevc_idct_transpose_8x8, 0, 0, 0
+    ; M1 M2 ^T = M1^t M3^t
+    ; M3 M4      M2^t M4^t
+
+    ; M1 4x4 block
+    TRANSPOSE_BLOCK 0, 0, 16
+
+    ; M2 and M3
+    SWAP_BLOCKS 0, 64, 16, 0, 8
+
+    ; M4
+    TRANSPOSE_BLOCK 8, 64, 16
+
+    ret
+%endmacro
+
+; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_8x8 1
+cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
+    TR_8x4 0, 7, 32, 1, 16, 8, 1
+    TR_8x4 8, 7, 32, 1, 16, 8, 1
+
+    call hevc_idct_transpose_8x8_ %+ cpuname
+
+    DEFINE_BIAS %1
+    TR_8x4 0, shift, 32, 1, 16, 8, 1
+    TR_8x4 8, shift, 32, 1, 16, 8, 1
+
+    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
+%endmacro
+
+; store intermedite e32 coeffs on stack
+; as 16x4 matrix
+; from m10: e8 + o8, with %6 offset
+; and  %3:  e8 - o8, with %7 offset
+; %4 - shift, unused here
+%macro STORE_16 7
+    mova [rsp + %6], %5
+    mova [rsp + %7], %3
+%endmacro
+
+; %1, %2 - transform constants
+; %3, %4 - regs with interleaved coeffs
+; %5 - 1/0 SWAP or add
+; %6, %7 - registers for intermidiate sums
+; %8 - accumulator register
+%macro ADD_ROWS 8
+    pmaddwd %6, %3, %1
+    pmaddwd %7, %4, %2
+    paddd   %6, %7
+%if %5 == 1
+    SWAP %6, %8
+%else
+    paddd %8, %6
+%endif
+%endmacro
+
+; %1 - transform coeffs
+; %2, %3 offsets for storing e+o/e-o back to coeffsq
+; %4 - shift
+; %5 - add
+; %6 - block_size
+; %7 - register with e16
+; %8, %9 - stack offsets for storing e+o/e-o
+%macro E16_O16 9
+    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m5, m6, m7
+    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7
+
+%if %6 == 8
+    paddd %7, %5
+%endif
+
+    paddd m4, m7, %7 ; o16 + e16
+    psubd %7, m7     ; e16 - o16
+    STORE_%6 %2, %3, %7, %4, m4, %8, %9
+%endmacro
+
+%macro TR_16x4 10
+    ; produce 8x4 matrix of e16 coeffs
+    ; for 4 first rows and store it on stack (128 bytes)
+    TR_8x4 %1, 7, %4, %5, %6, %8, 0
+
+    ; load 8 even rows
+    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
+    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1
+
+    SBUTTERFLY wd, 0, 1, 4
+    SBUTTERFLY wd, 2, 3, 4
+
+    E16_O16 trans_coeffs16,               0 + %1, 15 * %6 + %1, %2, %3, %7, m8,       0, 15 * 16
+    mova m8, %3
+    E16_O16 trans_coeffs16 +     64,     %6 + %1, 14 * %6 + %1, %2, m8, %7, m9,      16, 14 * 16
+    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
+    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
+    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
+    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
+    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1,  9 * %6 + %1, %2, m8, %7, m14, 6 * 16,  9 * 16
+    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1,  8 * %6 + %1, %2, m8, %7, m15, 7 * 16,  8 * 16
+%endmacro
+
+%macro TRANSPOSE_16x16 0
+cglobal hevc_idct_transpose_16x16, 0, 0, 0
+; M1  M2  M3  M4 ^T      m1 m5 m9  m13   M_i^T = m_i
+; M5  M6  M7  M8    -->  m2 m6 m10 m14
+; M9  M10 M11 M12        m3 m7 m11 m15
+; M13 M14 M15 M16        m4 m8 m12 m16
+
+    ; M1 4x4 block
+    TRANSPOSE_BLOCK 0, 0, 32
+
+    ; M5, M2
+    SWAP_BLOCKS 0, 128, 32, 0, 8
+    ; M9, M3
+    SWAP_BLOCKS 0, 256, 32, 0, 16
+    ; M13, M4
+    SWAP_BLOCKS 0, 384, 32, 0, 24
+
+    ;M6
+    TRANSPOSE_BLOCK 8, 128, 32
+
+    ; M10, M7
+    SWAP_BLOCKS 8, 256, 32, 128, 16
+    ; M14, M8
+    SWAP_BLOCKS 8, 384, 32, 128, 24
+
+    ;M11
+    TRANSPOSE_BLOCK 16, 256, 32
+
+    ; M15, M12
+    SWAP_BLOCKS 16, 384, 32, 256, 24
+
+    ;M16
+    TRANSPOSE_BLOCK 24, 384, 32
+
+    ret
+%endmacro
+
+; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_16x16 1
+cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
+    mov r1d, 3
+.loop16:
+    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
+    dec r1d
+    jge .loop16
+
+    call hevc_idct_transpose_16x16_ %+ cpuname
+
+    DEFINE_BIAS %1
+    mov r1d, 3
+.loop16_2:
+    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
+    dec r1d
+    jge .loop16_2
+
+    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
+%endmacro
+
+; scale, pack (clip16) and store the residuals     0 e32[0] + o32[0] --> %1
+; 4 at one time (4 columns)                        1 e32[1] + o32[1]
+; %1 - address to store e32 + o32
+; %2 - address to store e32 - e32
+; %5 - reg with e32 + o32                                  ...
+; %3 - reg with e32 - o32                          30 e32[1] - o32[1]
+; %4 - shift                                       31 e32[0] - o32[0] --> %2
+%macro STORE_32 5
+    psrad    %5, %4
+    psrad    %3, %4
+    packssdw %5, %3
+    movq     [%1], %5
+    movhps   [%2], %5
+%endmacro
+
+; %1 - transform coeffs
+; %2 - stack offset for e32
+; %2, %3 offsets for storing e+o/e-o back to coeffsq
+; %4 - shift
+; %5 - stack offset of e32
+%macro E32_O32 5
+    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m8, m9, m10
+    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
+    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
+    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10
+
+    paddd m11, m14, [rsp + %5]
+    paddd m12, m10, m11 ; o32 + e32
+    psubd m11, m10      ; e32 - o32
+    STORE_32 %2, %3, m11, %4, m12
+%endmacro
+
+; %1 - horizontal offset
+; %2 - bitdepth
+%macro TR_32x4 3
+    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0
+
+    LOAD_BLOCK m0, m1,      64,  3 * 64,  5 * 64,  7 * 64, %1
+    LOAD_BLOCK m2, m3,  9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
+    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
+    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1
+
+    SBUTTERFLY wd, 0, 1, 8
+    SBUTTERFLY wd, 2, 3, 8
+    SBUTTERFLY wd, 4, 5, 8
+    SBUTTERFLY wd, 6, 7, 8
+
+%if %3 == 1
+    %assign shift 7
+    mova m14, [pd_64]
+%else
+    LOAD_BIAS %2, m14
+%endif
+
+    lea r2, [trans_coeff32 + 15 * 128]
+    lea r3, [coeffsq + %1]
+    lea r4, [r3 + 16 * 64]
+    mov r5d, 15 * 16
+%%loop:
+    E32_O32 r2, r3 + r5 * 4, r4, shift, r5
+    sub r2, 128
+    add r4, 64
+    sub r5d, 16
+    jge %%loop
+%endmacro
+
+%macro TRANSPOSE_32x32 0
+cglobal hevc_idct_transpose_32x32, 0, 0, 0
+    ; M0  M1 ... M7
+    ; M8         M15
+    ;
+    ; ...
+    ;
+    ; M56        M63
+
+    TRANSPOSE_BLOCK 0, 0, 64 ; M1
+    mov r1d, 7
+    mov r2d, 7 * 256
+.loop_transpose:
+    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
+    sub r2d, 256
+    dec r1d
+    jg .loop_transpose
+
+    TRANSPOSE_BLOCK 8, 256, 64 ; M9
+    mov r1d, 6
+    mov r2d, 512
+    mov r3d, 16
+.loop_transpose2:
+    SWAP_BLOCKS 8, r2, 64, 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose2
+
+    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M9
+    mov r1d, 5
+    mov r2d, 768
+    mov r3d, 24
+.loop_transpose3:
+    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose3
+
+    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
+    mov r1d, 4
+    mov r2d, 1024
+    mov r3d, 32
+.loop_transpose4:
+    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose4
+
+    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
+    mov r1d, 3
+    mov r2d, 1280
+    mov r3d, 40
+.loop_transpose5:
+    SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose5
+
+    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
+    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
+    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
+    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
+
+    ret
+%endmacro
+
+; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_32x32 1
+cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
+    mov r1d, 7
+.loop32:
+    TR_32x4 8 * r1, %1, 1
+    dec r1d
+    jge .loop32
+
+    call hevc_idct_transpose_32x32_ %+ cpuname
+
+    mov r1d, 7
+.loop32_2:
+    TR_32x4 8 * r1, %1, 0
+    dec r1d
+    jge .loop32_2
+
+    TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
+%endmacro
+
+%macro INIT_IDCT_DC 1
+INIT_MMX mmxext
+IDCT_DC_NL  4,      %1
+IDCT_DC     8,  2,  %1
+
+INIT_XMM sse2
+IDCT_DC_NL  8,      %1
+IDCT_DC    16,  4,  %1
+IDCT_DC    32, 16,  %1
+
+%if HAVE_AVX2_EXTERNAL
+    INIT_YMM avx2
+    IDCT_DC    16,  2,  %1
+    IDCT_DC    32,  8,  %1
+%endif ;HAVE_AVX2_EXTERNAL
+%endmacro
+
+%macro INIT_IDCT 2
+INIT_XMM %2
+%if %1 == 8
+    TRANSPOSE_8x8
+    %if ARCH_X86_64
+        TRANSPOSE_16x16
+        TRANSPOSE_32x32
+    %endif
+%endif
+%if ARCH_X86_64
+    IDCT_32x32 %1
+    IDCT_16x16 %1
+%endif
+IDCT_8x8 %1
+IDCT_4x4 %1
+%endmacro
+
+INIT_IDCT_DC 8
+INIT_IDCT_DC 10
+INIT_IDCT_DC 12
+INIT_IDCT 8, sse2
+INIT_IDCT 8, avx
+INIT_IDCT 10, sse2
+INIT_IDCT 10, avx
+;INIT_IDCT 12, sse2
+;INIT_IDCT 12, avx
--- a/externals/ffmpeg/libavcodec/x86/hevc_mc.asm
+++ b/externals/ffmpeg/libavcodec/x86/hevc_mc.asm
--- a/externals/ffmpeg/libavcodec/x86/hevc_sao.asm
+++ b/externals/ffmpeg/libavcodec/x86/hevc_sao.asm
@@ -0,0 +1,340 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 8bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pb_1
+cextern pb_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro HEVC_SAO_BAND_FILTER_INIT 0
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    pxor              m0, m0
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
+    psraw             %1, %2, 3
+%if ARCH_X86_64
+    pcmpeqw          m10, %1, m0
+    pcmpeqw          m11, %1, m1
+    pcmpeqw          m12, %1, m2
+    pcmpeqw           %1, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand              %1, m7
+    por              m10, m11
+    por              m12, %1
+    por              m10, m12
+    paddw             %2, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, %1, [rsp+MMSIZE*0]
+    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
+    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
+    pcmpeqw           %1, [rsp+MMSIZE*3]
+    pand              m4, [rsp+MMSIZE*4]
+    pand              m5, [rsp+MMSIZE*5]
+    pand              m6, [rsp+MMSIZE*6]
+    pand              %1, m7
+    por               m4, m5
+    por               m6, %1
+    por               m4, m6
+    paddw             %2, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 2
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT
+
+align 16
+.loop:
+%if %1 == 8
+    movq              m8, [srcq]
+    punpcklbw         m8, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+    packuswb          m8, m14
+    movq          [dstq], m8
+%endif ; %1 == 8
+
+%assign i 0
+%rep %2
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %1 == 48
+
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    REP_RET
+%endmacro
+
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER  8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+HEVC_SAO_BAND_FILTER 32, 2
+HEVC_SAO_BAND_FILTER 48, 2
+HEVC_SAO_BAND_FILTER 64, 4
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER  8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 32, 1
+HEVC_SAO_BAND_FILTER 48, 1
+HEVC_SAO_BAND_FILTER 64, 2
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE
+    imul       b_strideq, EDGE_SRCSTRIDE
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
+    pminub            m4, m1, m2
+    pminub            m5, m1, m3
+    pcmpeqb           m2, m4
+    pcmpeqb           m3, m5
+    pcmpeqb           m4, m1
+    pcmpeqb           m5, m1
+    psubb             m4, m2
+    psubb             m5, m3
+    paddb             m4, m6
+    paddb             m4, m5
+
+    pshufb            m2, m0, m4
+%if %1 > 8
+    punpckhbw         m5, m7, m1
+    punpckhbw         m4, m2, m7
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m5, m4
+    pmaddubsw         m3, m2
+    packuswb          m3, m5
+%else
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m3, m2
+    packuswb          m3, m3
+%endif
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                             int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 2-3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+%endif ; ARCH
+
+%if mmsize > 16
+    vbroadcasti128    m0, [offsetq]
+%else
+    movu              m0, [offsetq]
+%endif
+    mova              m1, [pb_edge_shuffle]
+    packsswb          m0, m0
+    mova              m7, [pb_1]
+    pshufb            m0, m1
+    mova              m6, [pb_2]
+%if ARCH_X86_32
+    mov          heightd, r6m
+%endif
+
+align 16
+.loop:
+
+%if %1 == 8
+    movq              m1, [srcq]
+    movq              m2, [srcq + a_strideq]
+    movq              m3, [srcq + b_strideq]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    movq          [dstq], m3
+%endif
+
+%assign i 0
+%rep %2
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mov%3     [dstq + i], m3
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mova      [dstq + i], m3
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+HEVC_SAO_EDGE_FILTER  8, 0
+HEVC_SAO_EDGE_FILTER 16, 1, a
+HEVC_SAO_EDGE_FILTER 32, 2, a
+HEVC_SAO_EDGE_FILTER 48, 2, a
+HEVC_SAO_EDGE_FILTER 64, 4, a
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 32, 1, a
+HEVC_SAO_EDGE_FILTER 48, 1, u
+HEVC_SAO_EDGE_FILTER 64, 2, a
+%endif
--- a/externals/ffmpeg/libavcodec/x86/hevc_sao_10bit.asm
+++ b/externals/ffmpeg/libavcodec/x86/hevc_sao_10bit.asm
@@ -0,0 +1,370 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 10/12bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m2:     times 16 dw -2
+pw_mask10: times 16 dw 0x03FF
+pw_mask12: times 16 dw 0x0FFF
+pb_eo:              db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pw_m1
+cextern pw_1
+cextern pw_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro HEVC_SAO_BAND_FILTER_INIT 1
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    mova             m13, [pw_mask %+ %1]
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    mova              m1, [pw_mask %+ %1]
+    pxor              m0, m0
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 3
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT %1
+
+align 16
+.loop:
+
+%assign i 0
+%assign j 0
+%rep %3
+%assign k 8+(j&1)
+%assign l 9-(j&1)
+    mova          m %+ k, [srcq + i]
+    psraw         m %+ l, m %+ k, %1-5
+%if ARCH_X86_64
+    pcmpeqw          m10, m %+ l, m0
+    pcmpeqw          m11, m %+ l, m1
+    pcmpeqw          m12, m %+ l, m2
+    pcmpeqw       m %+ l, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand          m %+ l, m7
+    por              m10, m11
+    por              m12, m %+ l
+    por              m10, m12
+    paddw         m %+ k, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
+    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
+    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
+    pcmpeqw       m %+ l, [rsp+mmsize*3]
+    pand              m4, [rsp+mmsize*4]
+    pand              m5, [rsp+mmsize*5]
+    pand              m6, [rsp+mmsize*6]
+    pand          m %+ l, m7
+    por               m4, m5
+    por               m6, m %+ l
+    por               m4, m6
+    paddw         m %+ k, m4
+%endif ; ARCH
+    CLIPW             m %+ k, m14, m13
+    mova      [dstq + i], m %+ k
+%assign i i+mmsize
+%assign j j+1
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, srcstrideq
+    dec          heightd
+    jg .loop
+    REP_RET
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 10,  8, 1
+HEVC_SAO_BAND_FILTER 10, 16, 2
+HEVC_SAO_BAND_FILTER 10, 32, 4
+HEVC_SAO_BAND_FILTER 10, 48, 6
+HEVC_SAO_BAND_FILTER 10, 64, 8
+
+HEVC_SAO_BAND_FILTER 12,  8, 1
+HEVC_SAO_BAND_FILTER 12, 16, 2
+HEVC_SAO_BAND_FILTER 12, 32, 4
+HEVC_SAO_BAND_FILTER 12, 48, 6
+HEVC_SAO_BAND_FILTER 12, 64, 8
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 10,  8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 32, 2
+HEVC_SAO_BAND_FILTER 10, 48, 3
+HEVC_SAO_BAND_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 12,  8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 32, 2
+HEVC_SAO_BAND_FILTER 12, 48, 3
+HEVC_SAO_BAND_FILTER 12, 64, 4
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro PMINUW 4
+%if cpuflag(sse4)
+    pminuw            %1, %2, %3
+%else
+    psubusw           %4, %2, %3
+    psubw             %1, %2, %4
+%endif
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE >> 1
+    imul       b_strideq, EDGE_SRCSTRIDE >> 1
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                                   int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%endif ; ARCH
+
+%if mmsize > 16
+    SPLATW            m8, [offsetq+2]
+    SPLATW            m9, [offsetq+4]
+    SPLATW           m10, [offsetq+0]
+    SPLATW           m11, [offsetq+6]
+    SPLATW           m12, [offsetq+8]
+%else
+    movq             m10, [offsetq+0]
+    movd             m12, [offsetq+6]
+    SPLATW            m8, xm10, 1
+    SPLATW            m9, xm10, 2
+    SPLATW           m10, xm10, 0
+    SPLATW           m11, xm12, 0
+    SPLATW           m12, xm12, 1
+%endif
+    pxor              m0, m0
+%if ARCH_X86_64
+    mova             m13, [pw_m1]
+    mova             m14, [pw_1]
+    mova             m15, [pw_2]
+%else
+    mov          heightd, r6m
+    mova  [rsp+mmsize*0], m8
+    mova  [rsp+mmsize*1], m9
+    mova  [rsp+mmsize*2], m10
+    mova  [rsp+mmsize*3], m11
+    mova  [rsp+mmsize*4], m12
+%endif
+
+align 16
+.loop:
+
+%assign i 0
+%rep %3
+    mova              m1, [srcq + i]
+    movu              m2, [srcq+a_strideq + i]
+    movu              m3, [srcq+b_strideq + i]
+    PMINUW            m4, m1, m2, m6
+    PMINUW            m5, m1, m3, m7
+    pcmpeqw           m2, m4
+    pcmpeqw           m3, m5
+    pcmpeqw           m4, m1
+    pcmpeqw           m5, m1
+    psubw             m4, m2
+    psubw             m5, m3
+
+    paddw             m4, m5
+    pcmpeqw           m2, m4, [pw_m2]
+%if ARCH_X86_64
+    pcmpeqw           m3, m4, m13
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, m14
+    pcmpeqw           m7, m4, m15
+    pand              m2, m8
+    pand              m3, m9
+    pand              m5, m10
+    pand              m6, m11
+    pand              m7, m12
+%else
+    pcmpeqw           m3, m4, [pw_m1]
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, [pw_1]
+    pcmpeqw           m7, m4, [pw_2]
+    pand              m2, [rsp+mmsize*0]
+    pand              m3, [rsp+mmsize*1]
+    pand              m5, [rsp+mmsize*2]
+    pand              m6, [rsp+mmsize*3]
+    pand              m7, [rsp+mmsize*4]
+%endif
+    paddw             m2, m3
+    paddw             m5, m6
+    paddw             m2, m7
+    paddw             m2, m1
+    paddw             m2, m5
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    mova      [dstq + i], m2
+%assign i i+mmsize
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_EDGE_FILTER 10,  8, 1
+HEVC_SAO_EDGE_FILTER 10, 16, 2
+HEVC_SAO_EDGE_FILTER 10, 32, 4
+HEVC_SAO_EDGE_FILTER 10, 48, 6
+HEVC_SAO_EDGE_FILTER 10, 64, 8
+
+HEVC_SAO_EDGE_FILTER 12,  8, 1
+HEVC_SAO_EDGE_FILTER 12, 16, 2
+HEVC_SAO_EDGE_FILTER 12, 32, 4
+HEVC_SAO_EDGE_FILTER 12, 48, 6
+HEVC_SAO_EDGE_FILTER 12, 64, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 10,  8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 10, 16, 1
+HEVC_SAO_EDGE_FILTER 10, 32, 2
+HEVC_SAO_EDGE_FILTER 10, 48, 3
+HEVC_SAO_EDGE_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 12,  8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 12, 16, 1
+HEVC_SAO_EDGE_FILTER 12, 32, 2
+HEVC_SAO_EDGE_FILTER 12, 48, 3
+HEVC_SAO_EDGE_FILTER 12, 64, 4
+%endif
--- a/externals/ffmpeg/libavcodec/x86/hevcdsp.h
+++ b/externals/ffmpeg/libavcodec/x86/hevcdsp.h
@@ -0,0 +1,259 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##6,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+        WEIGHTING_PROTOTYPE(2, bitd, opt); \
+        WEIGHTING_PROTOTYPE(4, bitd, opt); \
+        WEIGHTING_PROTOTYPE(6, bitd, opt); \
+        WEIGHTING_PROTOTYPE(8, bitd, opt); \
+        WEIGHTING_PROTOTYPE(12, bitd, opt); \
+        WEIGHTING_PROTOTYPE(16, bitd, opt); \
+        WEIGHTING_PROTOTYPE(24, bitd, opt); \
+        WEIGHTING_PROTOTYPE(32, bitd, opt); \
+        WEIGHTING_PROTOTYPE(48, bitd, opt); \
+        WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h ,  8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v ,  8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv ,  8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h ,  8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v,  8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv,  8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
+
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
--- a/externals/ffmpeg/libavcodec/x86/hevcdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/hevcdsp_init.c
--- a/externals/ffmpeg/libavcodec/x86/hpeldsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/hpeldsp.asm
@@ -0,0 +1,591 @@
+;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* SIMD-optimized halfpel functions
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pb_1
+cextern pw_2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
+
+cextern pw_8192
+
+SECTION .text
+
+; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
+cglobal put_pixels8_x2, 4,5
+%endif
+    lea          r4, [r2*2]
+.loop:
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r1, r4
+    add          r0, r4
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_PIXELS8_X2
+
+
+; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_PIXELS_16 0
+cglobal put_pixels16_x2, 4,5
+    lea          r4, [r2*2]
+.loop:
+    mova         m0, [r1]
+    mova         m1, [r1+r2]
+    mova         m2, [r1+8]
+    mova         m3, [r1+r2+8]
+    PAVGB        m0, [r1+1]
+    PAVGB        m1, [r1+r2+1]
+    PAVGB        m2, [r1+9]
+    PAVGB        m3, [r1+r2+9]
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova     [r0+8], m2
+    mova  [r0+r2+8], m3
+    add          r1, r4
+    add          r0, r4
+    mova         m0, [r1]
+    mova         m1, [r1+r2]
+    mova         m2, [r1+8]
+    mova         m3, [r1+r2+8]
+    PAVGB        m0, [r1+1]
+    PAVGB        m1, [r1+r2+1]
+    PAVGB        m2, [r1+9]
+    PAVGB        m3, [r1+r2+9]
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova     [r0+8], m2
+    mova  [r0+r2+8], m3
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS_16
+INIT_MMX 3dnow
+PUT_PIXELS_16
+; The 8_X2 macro can easily be used here
+INIT_XMM sse2
+PUT_PIXELS8_X2
+
+
+; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_X2 0
+cglobal put_no_rnd_pixels8_x2, 4,5
+    mova         m6, [pb_1]
+    lea          r4, [r2*2]
+.loop:
+    mova         m0, [r1]
+    mova         m2, [r1+r2]
+    mova         m1, [r1+1]
+    mova         m3, [r1+r2+1]
+    add          r1, r4
+    psubusb      m0, m6
+    psubusb      m2, m6
+    PAVGB        m0, m1
+    PAVGB        m2, m3
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    mova         m0, [r1]
+    mova         m1, [r1+1]
+    mova         m2, [r1+r2]
+    mova         m3, [r1+r2+1]
+    add          r0, r4
+    add          r1, r4
+    psubusb      m0, m6
+    psubusb      m2, m6
+    PAVGB        m0, m1
+    PAVGB        m2, m3
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2
+
+
+; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
+cglobal put_pixels8_y2, 4,5
+%endif
+    lea          r4, [r2*2]
+    movu         m0, [r1]
+    sub          r0, r2
+.loop:
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
+    add          r1, r4
+    PAVGB        m0, m1
+    PAVGB        m1, m2
+    mova    [r0+r2], m0
+    mova    [r0+r4], m1
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
+    add          r0, r4
+    add          r1, r4
+    PAVGB        m2, m1
+    PAVGB        m1, m0
+    mova    [r0+r2], m2
+    mova    [r0+r4], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
+
+
+; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_Y2 0
+cglobal put_no_rnd_pixels8_y2, 4,5
+    mova         m6, [pb_1]
+    lea          r4, [r2+r2]
+    mova         m0, [r1]
+    sub          r0, r2
+.loop:
+    mova         m1, [r1+r2]
+    mova         m2, [r1+r4]
+    add          r1, r4
+    psubusb      m1, m6
+    PAVGB        m0, m1
+    PAVGB        m1, m2
+    mova    [r0+r2], m0
+    mova    [r0+r4], m1
+    mova         m1, [r1+r2]
+    mova         m0, [r1+r4]
+    add          r0, r4
+    add          r1, r4
+    psubusb      m1, m6
+    PAVGB        m2, m1
+    PAVGB        m1, m0
+    mova    [r0+r2], m2
+    mova    [r0+r4], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2
+
+
+; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS8 0
+cglobal avg_pixels8, 4,5
+    lea          r4, [r2*2]
+.loop:
+    mova         m0, [r0]
+    mova         m1, [r0+r2]
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r1, r4
+    add          r0, r4
+    mova         m0, [r0]
+    mova         m1, [r0+r2]
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX 3dnow
+AVG_PIXELS8
+
+
+; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
+cglobal avg_pixels8_x2, 4,5
+%endif
+    lea          r4, [r2*2]
+%if notcpuflag(mmxext)
+    pcmpeqd      m5, m5
+    paddb        m5, m5
+%endif
+.loop:
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
+    add          r0, r4
+    add          r1, r4
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmx
+AVG_PIXELS8_X2
+INIT_MMX mmxext
+AVG_PIXELS8_X2
+INIT_MMX 3dnow
+AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
+
+
+; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
+cglobal avg_pixels8_y2, 4,5
+%endif
+    lea          r4, [r2*2]
+    movu         m0, [r1]
+    sub          r0, r2
+.loop:
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
+    add          r1, r4
+    PAVGB        m0, m1
+    PAVGB        m1, m2
+    PAVGB        m0, [r0+r2]
+    PAVGB        m1, [r0+r4]
+    mova    [r0+r2], m0
+    mova    [r0+r4], m1
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
+    PAVGB        m2, m1
+    PAVGB        m1, m0
+    add          r0, r4
+    add          r1, r4
+    PAVGB        m2, [r0+r2]
+    PAVGB        m1, [r0+r4]
+    mova    [r0+r2], m2
+    mova    [r0+r4], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_Y2
+INIT_MMX 3dnow
+AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
+
+
+; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+; Note this is not correctly rounded, and is therefore used for
+; not-bitexact output
+%macro AVG_APPROX_PIXELS8_XY2 0
+cglobal avg_approx_pixels8_xy2, 4,5
+    mova         m6, [pb_1]
+    lea          r4, [r2*2]
+    mova         m0, [r1]
+    PAVGB        m0, [r1+1]
+.loop:
+    mova         m2, [r1+r4]
+    mova         m1, [r1+r2]
+    psubusb      m2, m6
+    PAVGB        m1, [r1+r2+1]
+    PAVGB        m2, [r1+r4+1]
+    add          r1, r4
+    PAVGB        m0, m1
+    PAVGB        m1, m2
+    PAVGB        m0, [r0]
+    PAVGB        m1, [r0+r2]
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova         m1, [r1+r2]
+    mova         m0, [r1+r4]
+    PAVGB        m1, [r1+r2+1]
+    PAVGB        m0, [r1+r4+1]
+    add          r0, r4
+    add          r1, r4
+    PAVGB        m2, m1
+    PAVGB        m1, m0
+    PAVGB        m2, [r0]
+    PAVGB        m1, [r0+r2]
+    mova       [r0], m2
+    mova    [r0+r2], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_APPROX_PIXELS8_XY2
+INIT_MMX 3dnow
+AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS_XY2 1
+%if cpuflag(sse2)
+cglobal %1_pixels16_xy2, 4,5,8
+%else
+cglobal %1_pixels8_xy2, 4,5
+%endif
+    pxor        m7, m7
+    mova        m6, [pw_2]
+    movu        m0, [r1]
+    movu        m4, [r1+1]
+    mova        m1, m0
+    mova        m5, m4
+    punpcklbw   m0, m7
+    punpcklbw   m4, m7
+    punpckhbw   m1, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m0
+    paddusw     m5, m1
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m0, [r1+r4]
+    movu        m2, [r1+r4+1]
+    mova        m1, m0
+    mova        m3, m2
+    punpcklbw   m0, m7
+    punpcklbw   m2, m7
+    punpckhbw   m1, m7
+    punpckhbw   m3, m7
+    paddusw     m0, m2
+    paddusw     m1, m3
+    paddusw     m4, m6
+    paddusw     m5, m6
+    paddusw     m4, m0
+    paddusw     m5, m1
+    psrlw       m4, 2
+    psrlw       m5, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m4, m5
+    PAVGB       m4, m3
+%else
+    packuswb    m4, m5
+%endif
+    mova   [r0+r4], m4
+    add         r4, r2
+
+    movu        m2, [r1+r4]
+    movu        m4, [r1+r4+1]
+    mova        m3, m2
+    mova        m5, m4
+    punpcklbw   m2, m7
+    punpcklbw   m4, m7
+    punpckhbw   m3, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m2
+    paddusw     m5, m3
+    paddusw     m0, m6
+    paddusw     m1, m6
+    paddusw     m0, m4
+    paddusw     m1, m5
+    psrlw       m0, 2
+    psrlw       m1, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m0, m1
+    PAVGB       m0, m3
+%else
+    packuswb    m0, m1
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+SET_PIXELS_XY2 avg
+INIT_MMX 3dnow
+SET_PIXELS_XY2 avg
+INIT_XMM sse2
+SET_PIXELS_XY2 put
+SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; sse2
+cglobal %1_pixels16_xy2, 4,5,%2
+    mova        m4, [pb_interleave16]
+%else
+cglobal %1_pixels8_xy2, 4,5
+    mova        m4, [pb_interleave8]
+%endif
+    mova        m5, [pb_1]
+    movu        m0, [r1]
+    movu        m1, [r1+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m2, [r1+r4]
+    movu        m3, [r1+r4+1]
+    pmaddubsw   m2, m5
+    pmaddubsw   m3, m5
+    paddusw     m0, m2
+    paddusw     m1, m3
+    pmulhrsw    m0, [pw_8192]
+    pmulhrsw    m1, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m0, m1
+    pshufb      m0, m4
+    pavgb       m0, m6
+%else
+    packuswb    m0, m1
+    pshufb      m0, m4
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+
+    movu        m0, [r1+r4]
+    movu        m1, [r1+r4+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    paddusw     m2, m0
+    paddusw     m3, m1
+    pmulhrsw    m2, [pw_8192]
+    pmulhrsw    m3, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m2, m3
+    pshufb      m2, m4
+    pavgb       m2, m6
+%else
+    packuswb    m2, m3
+    pshufb      m2, m4
+%endif
+    mova   [r0+r4], m2
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
--- a/externals/ffmpeg/libavcodec/x86/hpeldsp.h
+++ b/externals/ffmpeg/libavcodec/x86/hpeldsp.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HPELDSP_H
+#define AVCODEC_X86_HPELDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavcodec/hpeldsp.h"
+
+void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);
+
+void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
+void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
+void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
+void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
+
+#endif /* AVCODEC_X86_HPELDSP_H */
--- a/externals/ffmpeg/libavcodec/x86/hpeldsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/hpeldsp_init.c
@@ -0,0 +1,313 @@
+/*
+ * SIMD-optimized halfpel functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hpeldsp.h"
+#include "libavcodec/pixels.h"
+#include "fpel.h"
+#include "hpeldsp.h"
+
+void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
+void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
+                          ptrdiff_t line_size, int h);
+void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+
+#define avg_pixels8_mmx         ff_avg_pixels8_mmx
+#define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
+#define avg_pixels16_mmx        ff_avg_pixels16_mmx
+#define avg_pixels8_xy2_mmx     ff_avg_pixels8_xy2_mmx
+#define avg_pixels16_xy2_mmx    ff_avg_pixels16_xy2_mmx
+#define put_pixels8_mmx         ff_put_pixels8_mmx
+#define put_pixels16_mmx        ff_put_pixels16_mmx
+#define put_pixels8_xy2_mmx     ff_put_pixels8_xy2_mmx
+#define put_pixels16_xy2_mmx    ff_put_pixels16_xy2_mmx
+#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
+#define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
+#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
+
+#if HAVE_INLINE_ASM
+
+/***********************************/
+/* MMX no rounding */
+#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
+#define SET_RND  MOVQ_WONE
+#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
+#define STATIC static
+
+#include "rnd_template.c"
+#include "hpeldsp_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+#undef STATIC
+
+#if HAVE_MMX
+CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
+
+CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
+#endif
+
+/***********************************/
+/* MMX rounding */
+
+#define DEF(x, y) x ## _ ## y ## _mmx
+#define SET_RND  MOVQ_WTWO
+#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
+
+#include "hpeldsp_rnd_template.c"
+
+#undef DEF
+#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
+#define STATIC
+
+#include "rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+
+#if HAVE_MMX
+CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
+CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
+
+CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
+CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
+#endif
+
+#endif /* HAVE_INLINE_ASM */
+
+
+#if HAVE_X86ASM
+
+#define HPELDSP_AVG_PIXELS16(CPUEXT)                      \
+    CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
+    CALL_2X_PIXELS(put_pixels16_y2        ## CPUEXT, ff_put_pixels8_y2        ## CPUEXT, 8) \
+    CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_pixels16           ## CPUEXT, ff_avg_pixels8           ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
+
+HPELDSP_AVG_PIXELS16(_3dnow)
+HPELDSP_AVG_PIXELS16(_mmxext)
+
+#endif /* HAVE_X86ASM */
+
+#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
+    if (HAVE_MMX_EXTERNAL)                                                  \
+    c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU;
+
+#if HAVE_MMX_INLINE
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
+        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
+        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
+        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
+    } while (0)
+#else
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
+    } while (0)
+#endif
+
+static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
+{
+    SET_HPEL_FUNCS(put,        [0], 16, mmx);
+    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
+    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
+    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
+    SET_HPEL_FUNCS(put,        [1],  8, mmx);
+    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
+    if (HAVE_MMX_EXTERNAL) {
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
+    }
+#if HAVE_MMX_INLINE
+    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
+#endif
+}
+
+static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
+{
+#if HAVE_MMXEXT_EXTERNAL
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
+
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+
+    if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
+        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
+        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
+
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
+    }
+#endif /* HAVE_MMXEXT_EXTERNAL */
+}
+
+static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
+{
+#if HAVE_AMD3DNOW_EXTERNAL
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
+
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+
+    if (!(flags & AV_CODEC_FLAG_BITEXACT)){
+        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
+        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
+
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
+    }
+#endif /* HAVE_AMD3DNOW_EXTERNAL */
+}
+
+static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
+{
+#if HAVE_SSE2_EXTERNAL
+    c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
+    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
+    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
+    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+    c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
+    c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
+    c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
+#endif /* HAVE_SSE2_EXTERNAL */
+}
+
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+    c->put_pixels_tab[0][3]            = ff_put_pixels16_xy2_ssse3;
+    c->avg_pixels_tab[0][3]            = ff_avg_pixels16_xy2_ssse3;
+    c->put_pixels_tab[1][3]            = ff_put_pixels8_xy2_ssse3;
+    c->avg_pixels_tab[1][3]            = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
+av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (INLINE_MMX(cpu_flags))
+        hpeldsp_init_mmx(c, flags);
+
+    if (EXTERNAL_AMD3DNOW(cpu_flags))
+        hpeldsp_init_3dnow(c, flags);
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        hpeldsp_init_mmxext(c, flags);
+
+    if (EXTERNAL_SSE2_FAST(cpu_flags))
+        hpeldsp_init_sse2_fast(c, flags);
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        hpeldsp_init_ssse3(c, flags);
+
+    if (CONFIG_VP3_DECODER)
+        ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
+}
--- a/externals/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/externals/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
@@ -0,0 +1,202 @@
+/*
+ * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+// put_pixels
+av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea    (%3, %3), %%"FF_REG_a"  \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :FF_REG_a, "memory");
+}
+
+av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea    (%3, %3), %%"FF_REG_a"  \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "movq   8(%1), %%mm0            \n\t"
+        "movq   9(%1), %%mm1            \n\t"
+        "movq   8(%1, %3), %%mm2        \n\t"
+        "movq   9(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, 8(%2)            \n\t"
+        "movq   %%mm5, 8(%2, %3)        \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "movq   8(%1), %%mm0            \n\t"
+        "movq   9(%1), %%mm1            \n\t"
+        "movq   8(%1, %3), %%mm2        \n\t"
+        "movq   9(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, 8(%2)            \n\t"
+        "movq   %%mm5, 8(%2, %3)        \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :FF_REG_a, "memory");
+}
+
+av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea (%3, %3), %%"FF_REG_a"     \n\t"
+        "movq (%1), %%mm0               \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"FF_REG_a"),%%mm2\n\t"
+        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"FF_REG_a"),%%mm0\n\t"
+        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :FF_REG_a, "memory");
+}
+
+av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+    MOVQ_BFE(mm6);
+        __asm__ volatile(
+            ".p2align 3                 \n\t"
+            "1:                         \n\t"
+            "movq  (%1), %%mm0          \n\t"
+            "movq  1(%1), %%mm1         \n\t"
+            "movq  (%2), %%mm3          \n\t"
+            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
+            "movq  %%mm0, (%2)          \n\t"
+            "movq  8(%1), %%mm0         \n\t"
+            "movq  9(%1), %%mm1         \n\t"
+            "movq  8(%2), %%mm3         \n\t"
+            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
+            "movq  %%mm0, 8(%2)         \n\t"
+            "add    %3, %1              \n\t"
+            "add    %3, %2              \n\t"
+            "subl   $1, %0              \n\t"
+            "jnz    1b                  \n\t"
+            :"+g"(h), "+S"(pixels), "+D"(block)
+            :"r"((x86_reg)line_size)
+            :"memory");
+}
+
+av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea    (%3, %3), %%"FF_REG_a"  \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm2 \n\t"
+        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
+        "movq   (%2), %%mm3             \n\t"
+        PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
+        "movq   (%2, %3), %%mm3         \n\t"
+        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
+        "movq   %%mm0, (%2)             \n\t"
+        "movq   %%mm1, (%2, %3)         \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm0 \n\t"
+        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
+        "movq   (%2), %%mm3             \n\t"
+        PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
+        "movq   (%2, %3), %%mm3         \n\t"
+        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
+        "movq   %%mm2, (%2)             \n\t"
+        "movq   %%mm1, (%2, %3)         \n\t"
+        "add    %%"FF_REG_a", %1        \n\t"
+        "add    %%"FF_REG_a", %2        \n\t"
+
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :FF_REG_a, "memory");
+}
--- a/externals/ffmpeg/libavcodec/x86/hpeldsp_vp3.asm
+++ b/externals/ffmpeg/libavcodec/x86/hpeldsp_vp3.asm
@@ -0,0 +1,111 @@
+;******************************************************************************
+;* SIMD-optimized halfpel functions for VP3
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
+cglobal put_no_rnd_pixels8_x2_exact, 4,5
+    lea          r4, [r2*3]
+    pcmpeqb      m6, m6
+.loop:
+    mova         m0, [r1]
+    mova         m2, [r1+r2]
+    mova         m1, [r1+1]
+    mova         m3, [r1+r2+1]
+    pxor         m0, m6
+    pxor         m2, m6
+    pxor         m1, m6
+    pxor         m3, m6
+    PAVGB        m0, m1
+    PAVGB        m2, m3
+    pxor         m0, m6
+    pxor         m2, m6
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    mova         m0, [r1+r2*2]
+    mova         m1, [r1+r2*2+1]
+    mova         m2, [r1+r4]
+    mova         m3, [r1+r4+1]
+    pxor         m0, m6
+    pxor         m1, m6
+    pxor         m2, m6
+    pxor         m3, m6
+    PAVGB        m0, m1
+    PAVGB        m2, m3
+    pxor         m0, m6
+    pxor         m2, m6
+    mova  [r0+r2*2], m0
+    mova    [r0+r4], m2
+    lea          r1, [r1+r2*4]
+    lea          r0, [r0+r2*4]
+    sub         r3d, 4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2_EXACT
+
+
+; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
+cglobal put_no_rnd_pixels8_y2_exact, 4,5
+    lea          r4, [r2*3]
+    mova         m0, [r1]
+    pcmpeqb      m6, m6
+    add          r1, r2
+    pxor         m0, m6
+.loop:
+    mova         m1, [r1]
+    mova         m2, [r1+r2]
+    pxor         m1, m6
+    pxor         m2, m6
+    PAVGB        m0, m1
+    PAVGB        m1, m2
+    pxor         m0, m6
+    pxor         m1, m6
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova         m1, [r1+r2*2]
+    mova         m0, [r1+r4]
+    pxor         m1, m6
+    pxor         m0, m6
+    PAVGB        m2, m1
+    PAVGB        m1, m0
+    pxor         m2, m6
+    pxor         m1, m6
+    mova  [r0+r2*2], m2
+    mova    [r0+r4], m1
+    lea          r1, [r1+r2*4]
+    lea          r0, [r0+r2*4]
+    sub         r3d, 4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2_EXACT
--- a/externals/ffmpeg/libavcodec/x86/hpeldsp_vp3_init.c
+++ b/externals/ffmpeg/libavcodec/x86/hpeldsp_vp3_init.c
@@ -0,0 +1,56 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hpeldsp.h"
+
+#include "hpeldsp.h"
+
+void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
+                                          const uint8_t *pixels,
+                                          ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
+                                          const uint8_t *pixels,
+                                          ptrdiff_t line_size, int h);
+
+av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
+{
+    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+        if (flags & AV_CODEC_FLAG_BITEXACT) {
+            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+        }
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        if (flags & AV_CODEC_FLAG_BITEXACT) {
+            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+        }
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/huffyuvdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/huffyuvdsp.asm
@@ -0,0 +1,164 @@
+;******************************************************************************
+;* SIMD-optimized HuffYUV functions
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%include "libavcodec/x86/huffyuvdsp_template.asm"
+
+;------------------------------------------------------------------------------
+; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+;------------------------------------------------------------------------------
+
+%macro ADD_INT16 0
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+%if mmsize > 8
+    test srcq, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+%endif
+    INT16_LOOP a, add
+%if mmsize > 8
+.unaligned:
+    INT16_LOOP u, add
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_INT16
+%endif
+
+INIT_XMM sse2
+ADD_INT16
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_INT16
+%endif
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+;                               intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+    shl           wq, 2
+    movd          m0, [leftq]
+    lea         dstq, [dstq + wq]
+    lea         srcq, [srcq + wq]
+    LSHIFT        m0, mmsize-4
+    neg           wq
+.loop:
+    movu          m1, [srcq+wq]
+    mova          m2, m1
+%if mmsize == 8
+    punpckhdq     m0, m0
+%endif
+    LSHIFT        m1, 4
+    paddb         m1, m2
+%if mmsize == 16
+    pshufd        m0, m0, q3333
+    mova          m2, m1
+    LSHIFT        m1, 8
+    paddb         m1, m2
+%endif
+    paddb         m0, m1
+    movu   [dstq+wq], m0
+    add           wq, mmsize
+    jl         .loop
+    movd          m0, [dstq-4]
+    movd     [leftq], m0
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
+
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
+INIT_MMX mmxext
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+    add      wd, wd
+    movd    mm6, maskd
+    SPLATW  mm6, mm6
+    movq    mm0, [topq]
+    movq    mm2, mm0
+    movd    mm4, [left_topq]
+    psllq   mm2, 16
+    movq    mm1, mm0
+    por     mm4, mm2
+    movd    mm3, [leftq]
+    psubw   mm0, mm4 ; t-tl
+    add    dstq, wq
+    add    topq, wq
+    add   diffq, wq
+    neg      wq
+    jmp .skip
+.loop:
+    movq    mm4, [topq+wq]
+    movq    mm0, mm4
+    psllq   mm4, 16
+    por     mm4, mm1
+    movq    mm1, mm0 ; t
+    psubw   mm0, mm4 ; t-tl
+.skip:
+    movq    mm2, [diffq+wq]
+%assign i 0
+%rep 4
+    movq    mm4, mm0
+    paddw   mm4, mm3 ; t-tl+l
+    pand    mm4, mm6
+    movq    mm5, mm3
+    pmaxsw  mm3, mm1
+    pminsw  mm5, mm1
+    pminsw  mm3, mm4
+    pmaxsw  mm3, mm5 ; median
+    paddw   mm3, mm2 ; +residual
+    pand    mm3, mm6
+%if i==0
+    movq    mm7, mm3
+    psllq   mm7, 48
+%else
+    movq    mm4, mm3
+    psrlq   mm7, 16
+    psllq   mm4, 48
+    por     mm7, mm4
+%endif
+%if i<3
+    psrlq   mm0, 16
+    psrlq   mm1, 16
+    psrlq   mm2, 16
+%endif
+%assign i i+1
+%endrep
+    movq [dstq+wq], mm7
+    add      wq, 8
+    jl .loop
+    movzx   r2d, word [dstq-2]
+    mov [leftq], r2d
+    movzx   r2d, word [topq-2]
+    mov [left_topq], r2d
+    RET
--- a/externals/ffmpeg/libavcodec/x86/huffyuvdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/huffyuvdsp_init.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/huffyuvdsp.h"
+
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_avx2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+                                     intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
+                                      intptr_t w, uint8_t *left);
+void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
+
+av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
+{
+    int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
+
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+        c->add_int16 = ff_add_int16_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_int16 = ff_add_int16_sse2;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->add_int16 = ff_add_int16_avx2;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/huffyuvdsp_template.asm
+++ b/externals/ffmpeg/libavcodec/x86/huffyuvdsp_template.asm
@@ -0,0 +1,76 @@
+;******************************************************************************
+;* SIMD-optimized HuffYUV functions
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+    movd    xm4, maskd
+    SPLATW  m4, xm4
+    add     wd, wd
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop
+    push  tmpq
+%%.wordloop:
+    sub     wq, 2
+%ifidn %2, add
+    mov   tmpw, [srcq+wq]
+    add   tmpw, [dstq+wq]
+%else
+    mov   tmpw, [src1q+wq]
+    sub   tmpw, [src2q+wq]
+%endif
+    and   tmpw, maskw
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+    pop   tmpq
+%%.tomainloop:
+%ifidn %2, add
+    add     srcq, wq
+%else
+    add     src1q, wq
+    add     src2q, wq
+%endif
+    add     dstq, wq
+    neg     wq
+    jz      %%.end
+%%.loop:
+%ifidn %2, add
+    mov%1   m0, [srcq+wq]
+    mov%1   m1, [dstq+wq]
+    mov%1   m2, [srcq+wq+mmsize]
+    mov%1   m3, [dstq+wq+mmsize]
+%else
+    mov%1   m0, [src1q+wq]
+    mov%1   m1, [src2q+wq]
+    mov%1   m2, [src1q+wq+mmsize]
+    mov%1   m3, [src2q+wq+mmsize]
+%endif
+    p%2w    m0, m1
+    p%2w    m2, m3
+    pand    m0, m4
+    pand    m2, m4
+    mov%1   [dstq+wq]       , m0
+    mov%1   [dstq+wq+mmsize], m2
+    add     wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET
+%endmacro
--- a/externals/ffmpeg/libavcodec/x86/huffyuvencdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/huffyuvencdsp.asm
@@ -0,0 +1,105 @@
+;************************************************************************
+;* SIMD-optimized HuffYUV encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%include "libavcodec/x86/huffyuvdsp_template.asm"
+
+;------------------------------------------------------------------------------
+; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                    unsigned mask, int w);
+;------------------------------------------------------------------------------
+
+%macro DIFF_INT16 0
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+%if mmsize > 8
+    test src1q, mmsize-1
+    jnz .unaligned
+    test src2q, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+%endif
+    INT16_LOOP a, sub
+%if mmsize > 8
+.unaligned:
+    INT16_LOOP u, sub
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_INT16
+%endif
+
+INIT_XMM sse2
+DIFF_INT16
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_INT16
+%endif
+
+INIT_MMX mmxext
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+    add      wd, wd
+    movd    mm7, maskd
+    SPLATW  mm7, mm7
+    movq    mm0, [src1q]
+    movq    mm2, [src2q]
+    psllq   mm0, 16
+    psllq   mm2, 16
+    movd    mm6, [left_topq]
+    por     mm0, mm6
+    movd    mm6, [leftq]
+    por     mm2, mm6
+    xor     maskq, maskq
+.loop:
+    movq    mm1, [src1q + maskq]
+    movq    mm3, [src2q + maskq]
+    movq    mm4, mm2
+    psubw   mm2, mm0
+    paddw   mm2, mm1
+    pand    mm2, mm7
+    movq    mm5, mm4
+    pmaxsw  mm4, mm1
+    pminsw  mm1, mm5
+    pminsw  mm4, mm2
+    pmaxsw  mm4, mm1
+    psubw   mm3, mm4
+    pand    mm3, mm7
+    movq    [dstq + maskq], mm3
+    add     maskq, 8
+    movq    mm0, [src1q + maskq - 2]
+    movq    mm2, [src2q + maskq - 2]
+    cmp     maskq, wq
+        jb .loop
+    movzx maskd, word [src1q + wq - 2]
+    mov [left_topq], maskd
+    movzx maskd, word [src2q + wq - 2]
+    mov [leftq], maskd
+    RET
--- a/externals/ffmpeg/libavcodec/x86/huffyuvencdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/huffyuvencdsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * SIMD-optimized HuffYUV encoding functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/huffyuvencdsp.h"
+
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                                          unsigned mask, int w, int *left, int *left_top);
+
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
+{
+    av_unused int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->diff_int16 = ff_diff_int16_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_int16 = ff_diff_int16_sse2;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->diff_int16 = ff_diff_int16_avx2;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/idctdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/idctdsp.asm
@@ -0,0 +1,183 @@
+;******************************************************************************
+;* SIMD-optimized IDCT-related routines
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+SECTION .text
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                                  ptrdiff_t line_size)
+;--------------------------------------------------------------------------
+
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
+    mova     m1, [blockq+mmsize*0+%1]
+    mova     m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m3, [blockq+mmsize*4+%1]
+    mova     m4, [blockq+mmsize*6+%1]
+%endif
+    packsswb m1, [blockq+mmsize*1+%1]
+    packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packsswb m3, [blockq+mmsize*5+%1]
+    packsswb m4, [blockq+mmsize*7+%1]
+%endif
+    paddb    m1, m0
+    paddb    m2, m0
+%if mmsize == 8
+    paddb    m3, m0
+    paddb    m4, m0
+    movq     [pixelsq+lsizeq*0], m1
+    movq     [pixelsq+lsizeq*1], m2
+    movq     [pixelsq+lsizeq*2], m3
+    movq     [pixelsq+lsize3q ], m4
+%else
+    movq     [pixelsq+lsizeq*0], m1
+    movhps   [pixelsq+lsizeq*1], m1
+    movq     [pixelsq+lsizeq*2], m2
+    movhps   [pixelsq+lsize3q ], m2
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+    mova     m0, [pb_80]
+    lea      lsize3q, [lsizeq*3]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
+    lea      pixelsq, [pixelsq+lsizeq*4]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
+
+;--------------------------------------------------------------------------
+; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro PUT_PIXELS_CLAMPED_HALF 1
+    mova     m0, [blockq+mmsize*0+%1]
+    mova     m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m2, [blockq+mmsize*4+%1]
+    mova     m3, [blockq+mmsize*6+%1]
+%endif
+    packuswb m0, [blockq+mmsize*1+%1]
+    packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packuswb m2, [blockq+mmsize*5+%1]
+    packuswb m3, [blockq+mmsize*7+%1]
+    movq           [pixelsq], m0
+    movq    [lsizeq+pixelsq], m1
+    movq  [2*lsizeq+pixelsq], m2
+    movq   [lsize3q+pixelsq], m3
+%else
+    movq           [pixelsq], m0
+    movhps  [lsizeq+pixelsq], m0
+    movq  [2*lsizeq+pixelsq], m1
+    movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED 0
+cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
+    lea lsize3q, [lsizeq*3]
+    PUT_PIXELS_CLAMPED_HALF 0
+    lea pixelsq, [pixelsq+lsizeq*4]
+    PUT_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED
+
+;--------------------------------------------------------------------------
+; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro ADD_PIXELS_CLAMPED 1
+    mova       m0, [blockq+mmsize*0+%1]
+    mova       m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+    mova       m5, [blockq+mmsize*2+%1]
+    mova       m6, [blockq+mmsize*3+%1]
+%endif
+    movq       m2, [pixelsq]
+    movq       m3, [pixelsq+lsizeq]
+%if mmsize == 8
+    mova       m7, m2
+    punpcklbw  m2, m4
+    punpckhbw  m7, m4
+    paddsw     m0, m2
+    paddsw     m1, m7
+    mova       m7, m3
+    punpcklbw  m3, m4
+    punpckhbw  m7, m4
+    paddsw     m5, m3
+    paddsw     m6, m7
+%else
+    punpcklbw  m2, m4
+    punpcklbw  m3, m4
+    paddsw     m0, m2
+    paddsw     m1, m3
+%endif
+    packuswb   m0, m1
+%if mmsize == 8
+    packuswb   m5, m6
+    movq       [pixelsq], m0
+    movq       [pixelsq+lsizeq], m5
+%else
+    movq       [pixelsq], m0
+    movhps     [pixelsq+lsizeq], m0
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 0
+cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
+    pxor       m4, m4
+    ADD_PIXELS_CLAMPED 0
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 32
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 64
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 96
+    RET
+%endmacro
+
+INIT_MMX mmx
+ADD_PIXELS_CLAMPED
+INIT_XMM sse2
+ADD_PIXELS_CLAMPED
--- a/externals/ffmpeg/libavcodec/x86/idctdsp.h
+++ b/externals/ffmpeg/libavcodec/x86/idctdsp.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_IDCTDSP_H
+#define AVCODEC_X86_IDCTDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               ptrdiff_t line_size);
+void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               ptrdiff_t line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                                      ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                       ptrdiff_t line_size);
+
+
+#endif /* AVCODEC_X86_IDCTDSP_H */
--- a/externals/ffmpeg/libavcodec/x86/idctdsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/idctdsp_init.c
@@ -0,0 +1,162 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp.h"
+#include "simple_idct.h"
+
+/* Input permutation for the simple_idct_mmx */
+static const uint8_t simple_mmx_permutation[64] = {
+    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
+    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
+    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
+    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
+    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
+    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
+    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
+    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
+};
+
+static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
+                                              enum idct_permutation_type perm_type)
+{
+    int i;
+
+    switch (perm_type) {
+    case FF_IDCT_PERM_SIMPLE:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = simple_mmx_permutation[i];
+        return 1;
+    case FF_IDCT_PERM_SSE2:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
+        return 1;
+    }
+
+    return 0;
+}
+
+av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
+                                 unsigned high_bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
+
+        if (!high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct_put  = ff_simple_idct_put_mmx;
+                c->idct_add  = ff_simple_idct_add_mmx;
+                c->idct      = ff_simple_idct_mmx;
+                c->perm_type = FF_IDCT_PERM_SIMPLE;
+        }
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;
+
+        if (!high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct_put  = ff_simple_idct_put_sse2;
+                c->idct_add  = ff_simple_idct_add_sse2;
+                c->perm_type = FF_IDCT_PERM_SIMPLE;
+        }
+
+        if (ARCH_X86_64 &&
+            !high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
+                avctx->idct_algo == FF_IDCT_SIMPLE)) {
+                c->idct      = ff_simple_idct8_sse2;
+                c->idct_put  = ff_simple_idct8_put_sse2;
+                c->idct_add  = ff_simple_idct8_add_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
+    }
+
+    if (ARCH_X86_64 && avctx->lowres == 0) {
+        if (EXTERNAL_AVX(cpu_flags) &&
+            !high_bit_depth &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
+                avctx->idct_algo == FF_IDCT_SIMPLE)) {
+                c->idct      = ff_simple_idct8_avx;
+                c->idct_put  = ff_simple_idct8_put_avx;
+                c->idct_add  = ff_simple_idct8_add_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
+
+        if (avctx->bits_per_raw_sample == 10 &&
+            avctx->codec_id != AV_CODEC_ID_MPEG4 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLE)) {
+            if (EXTERNAL_SSE2(cpu_flags)) {
+                c->idct_put  = ff_simple_idct10_put_sse2;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct10_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+
+            }
+            if (EXTERNAL_AVX(cpu_flags)) {
+                c->idct_put  = ff_simple_idct10_put_avx;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct10_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+        }
+
+        if (avctx->bits_per_raw_sample == 12 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+            if (EXTERNAL_SSE2(cpu_flags)) {
+                c->idct_put  = ff_simple_idct12_put_sse2;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct12_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+            if (EXTERNAL_AVX(cpu_flags)) {
+                c->idct_put  = ff_simple_idct12_put_avx;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct12_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+        }
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/imdct36.asm
+++ b/externals/ffmpeg/libavcodec/x86/imdct36.asm
@@ -0,0 +1,741 @@
+;******************************************************************************
+;* 36 point SSE-optimized IMDCT transform
+;* Copyright (c) 2011 Vitor Sessak
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ps_mask:  dd 0, ~0, ~0, ~0
+ps_mask2: dd 0, ~0,  0, ~0
+ps_mask3: dd 0,  0,  0, ~0
+ps_mask4: dd 0, ~0,  0,  0
+
+ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
+ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
+ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
+ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
+ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
+ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
+ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097
+
+ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
+ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000
+
+ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
+               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
+               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
+               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
+               dd 1.0, 0.70710678118654752439,  0.0,  0.0
+
+ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
+               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
+               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
+               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
+               dd 1.0, -0.70710678118654752439,  0.0,  0.0
+
+costabs:  times 4 dd  0.98480773
+          times 4 dd  0.93969262
+          times 4 dd  0.86602539
+          times 4 dd -0.76604444
+          times 4 dd -0.64278764
+          times 4 dd  0.50000000
+          times 4 dd -0.50000000
+          times 4 dd -0.34202015
+          times 4 dd -0.17364818
+          times 4 dd  0.50190992
+          times 4 dd  0.51763808
+          times 4 dd  0.55168896
+          times 4 dd  0.61038726
+          times 4 dd  0.70710677
+          times 4 dd  0.87172341
+          times 4 dd  1.18310082
+          times 4 dd  1.93185163
+          times 4 dd  5.73685646
+
+%define SBLIMIT 32
+SECTION .text
+
+%macro PSHUFD 3
+%if cpuflag(sse2) && notcpuflag(avx)
+    pshufd %1, %2, %3
+%else
+    shufps %1, %2, %2, %3
+%endif
+%endmacro
+
+; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
+; output %1={x3,x4,y1,y2}
+%macro BUILDINVHIGHLOW 3
+%if cpuflag(avx)
+    shufps %1, %2, %3, 0x4e
+%else
+    movlhps %1, %3
+    movhlps %1, %2
+%endif
+%endmacro
+
+; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
+; output %1={x4,y1,y2,y3}
+%macro ROTLEFT 3
+%if cpuflag(ssse3)
+    palignr  %1, %3, %2, 12
+%else
+    BUILDINVHIGHLOW %1, %2, %3
+    shufps  %1, %1, %3, 0x99
+%endif
+%endmacro
+
+%macro INVERTHL 2
+%if cpuflag(sse2)
+    PSHUFD  %1, %2, 0x4e
+%else
+    movhlps %1, %2
+    movlhps %1, %2
+%endif
+%endmacro
+
+%macro BUTTERF 3
+    INVERTHL %2, %1
+    xorps    %1, [ps_p1p1m1m1]
+    addps    %1, %2
+%if cpuflag(sse3)
+    mulps    %1, %1, [ps_cosh_sse3 + %3]
+    PSHUFD   %2, %1, 0xb1
+    addsubps %1, %1, %2
+%else
+    mulps    %1, [ps_cosh + %3]
+    PSHUFD   %2, %1, 0xb1
+    xorps    %1, [ps_p1m1p1m1]
+    addps    %1, %2
+%endif
+%endmacro
+
+%macro BUTTERF2 3
+%if cpuflag(sse3)
+    mulps    %1, %1, [ps_cosh_sse3 + %3]
+    PSHUFD   %2, %1, 0xe1
+    addsubps %1, %1, %2
+%else
+    mulps    %1, [ps_cosh + %3]
+    PSHUFD   %2, %1, 0xe1
+    xorps    %1, [ps_p1m1p1m1]
+    addps    %1, %2
+%endif
+%endmacro
+
+%macro STORE 4
+%if cpuflag(sse4)
+    movss     [%3       ], %1
+    extractps dword [%3 +   %4], %1, 1
+    extractps dword [%3 + 2*%4], %1, 2
+    extractps dword [%3 + 3*%4], %1, 3
+%else
+    movhlps %2, %1
+    movss   [%3       ], %1
+    movss   [%3 + 2*%4], %2
+    shufps  %1, %1, 0xb1
+    movss   [%3 +   %4], %1
+    movhlps %2, %1
+    movss   [%3 + 3*%4], %2
+%endif
+%endmacro
+
+%macro LOAD 4
+    movlps  %1, [%3       ]
+    movhps  %1, [%3 +   %4]
+    movlps  %2, [%3 + 2*%4]
+    movhps  %2, [%3 + 3*%4]
+    shufps  %1, %2, 0x88
+%endmacro
+
+%macro LOADA64 2
+%if cpuflag(avx)
+   movu     %1, [%2]
+%else
+   movlps   %1, [%2]
+   movhps   %1, [%2 + 8]
+%endif
+%endmacro
+
+%macro DEFINE_IMDCT 0
+cglobal imdct36_float, 4,4,9, out, buf, in, win
+
+    ; for(i=17;i>=1;i--) in[i] += in[i-1];
+    LOADA64 m0, inq
+    LOADA64 m1, inq + 16
+
+    ROTLEFT m5, m0, m1
+
+    PSHUFD  m6, m0, 0x93
+    andps   m6, m6, [ps_mask]
+    addps   m0, m0, m6
+
+    LOADA64 m2, inq + 32
+
+    ROTLEFT m7, m1, m2
+
+    addps   m1, m1, m5
+    LOADA64 m3, inq + 48
+
+    ROTLEFT m5, m2, m3
+
+    xorps   m4, m4, m4
+    movlps  m4, [inq+64]
+    BUILDINVHIGHLOW m6, m3, m4
+    shufps  m6, m6, m4, 0xa9
+
+    addps   m4, m4, m6
+    addps   m2, m2, m7
+    addps   m3, m3, m5
+
+    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
+    movlhps m5, m5, m0
+    andps   m5, m5, [ps_mask3]
+
+    BUILDINVHIGHLOW m7, m0, m1
+    andps   m7, m7, [ps_mask2]
+
+    addps   m0, m0, m5
+
+    BUILDINVHIGHLOW m6, m1, m2
+    andps   m6, m6, [ps_mask2]
+
+    addps  m1, m1, m7
+
+    BUILDINVHIGHLOW m7, m2, m3
+    andps   m7, m7, [ps_mask2]
+
+    addps   m2, m2, m6
+
+    movhlps m6, m6, m3
+    andps   m6, m6, [ps_mask4]
+
+    addps  m3, m3, m7
+    addps  m4, m4, m6
+
+    ; Populate tmp[]
+    movlhps m6, m1, m5    ; zero out high values
+    subps   m6, m6, m4
+
+    subps  m5, m0, m3
+
+%if ARCH_X86_64
+    SWAP   m5, m8
+%endif
+
+    mulps  m7, m2, [ps_val1]
+
+%if ARCH_X86_64
+    mulps  m5, m8, [ps_val2]
+%else
+    mulps  m5, m5, [ps_val2]
+%endif
+    addps  m7, m7, m5
+
+    mulps  m5, m6, [ps_val1]
+    subps  m7, m7, m5
+
+%if ARCH_X86_64
+    SWAP   m5, m8
+%else
+    subps  m5, m0, m3
+%endif
+
+    subps  m5, m5, m6
+    addps  m5, m5, m2
+
+    shufps m6, m4, m3, 0xe4
+    subps  m6, m6, m2
+    mulps  m6, m6, [ps_val3]
+
+    addps  m4, m4, m1
+    mulps  m4, m4, [ps_val4]
+
+    shufps m1, m1, m0, 0xe4
+    addps  m1, m1, m2
+    mulps  m1, m1, [ps_val5]
+
+    mulps  m3, m3, [ps_val6]
+    mulps  m0, m0, [ps_val7]
+    addps  m0, m0, m3
+
+    xorps  m2, m1, [ps_p1p1m1m1]
+    subps  m2, m2, m4
+    addps  m2, m2, m0
+
+    addps  m3, m4, m0
+    subps  m3, m3, m6
+    xorps  m3, m3, [ps_p1p1m1m1]
+
+    shufps m0, m0, m4, 0xe4
+    subps  m0, m0, m1
+    addps  m0, m0, m6
+
+    BUILDINVHIGHLOW m4, m2, m3
+    shufps  m3, m3, m2, 0x4e
+
+    ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
+
+    BUTTERF  m0, m1, 0
+    BUTTERF  m7, m2, 16
+    BUTTERF  m3, m6, 32
+    BUTTERF  m4, m1, 48
+    BUTTERF2 m5, m1, 64
+
+    ; permutates:
+    ; m0    0  1  2  3     =>     2  6 10 14   m1
+    ; m7    4  5  6  7     =>     3  7 11 15   m2
+    ; m3    8  9 10 11     =>    17 13  9  5   m3
+    ; m4   12 13 14 15     =>    16 12  8  4   m5
+    ; m5   16 17 xx xx     =>     0  1 xx xx   m0
+
+    unpckhps m1, m0, m7
+    unpckhps m6, m3, m4
+    movhlps  m2, m6, m1
+    movlhps  m1, m1, m6
+
+    unpcklps m5, m5, m4
+    unpcklps m3, m3, m7
+    movhlps  m4, m3, m5
+    movlhps  m5, m5, m3
+    SWAP m4, m3
+    ; permutation done
+
+    PSHUFD  m6, m2, 0xb1
+    movss   m4, [bufq + 4*68]
+    movss   m7, [bufq + 4*64]
+    unpcklps  m7, m7, m4
+    mulps   m6, m6, [winq + 16*4]
+    addps   m6, m6, m7
+    movss   [outq + 64*SBLIMIT], m6
+    shufps  m6, m6, m6, 0xb1
+    movss   [outq + 68*SBLIMIT], m6
+
+    mulps   m6, m3, [winq + 4*4]
+    LOAD    m4, m7, bufq + 4*16, 16
+    addps   m6, m6, m4
+    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
+
+    shufps  m4, m0, m3, 0xb5
+    mulps   m4, m4, [winq + 8*4]
+    LOAD    m7, m6, bufq + 4*32, 16
+    addps   m4, m4, m7
+    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
+
+    shufps  m3, m3, m2, 0xb1
+    mulps   m3, m3, [winq + 12*4]
+    LOAD    m7, m6, bufq + 4*48, 16
+    addps   m3, m3, m7
+    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
+
+    mulps   m2, m2, [winq]
+    LOAD    m6, m7, bufq, 16
+    addps   m2, m2, m6
+    STORE   m2, m7, outq, 4*SBLIMIT
+
+    mulps    m4, m1, [winq + 20*4]
+    STORE    m4, m7, bufq, 16
+
+    mulps    m3, m5, [winq + 24*4]
+    STORE    m3, m7, bufq + 4*16, 16
+
+    shufps   m0, m0, m5, 0xb0
+    mulps    m0, m0, [winq + 28*4]
+    STORE    m0, m7, bufq + 4*32, 16
+
+    shufps   m5, m5, m1, 0xb1
+    mulps    m5, m5, [winq + 32*4]
+    STORE    m5, m7, bufq + 4*48, 16
+
+    shufps   m1, m1, m1, 0xb1
+    mulps    m1, m1, [winq + 36*4]
+    movss    [bufq + 4*64], m1
+    shufps   m1, m1, 0xb1
+    movss    [bufq + 4*68], m1
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+DEFINE_IMDCT
+%endif
+
+INIT_XMM sse2
+DEFINE_IMDCT
+
+INIT_XMM sse3
+DEFINE_IMDCT
+
+INIT_XMM ssse3
+DEFINE_IMDCT
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEFINE_IMDCT
+%endif
+
+INIT_XMM sse
+
+%if ARCH_X86_64
+%define SPILL SWAP
+%define UNSPILL SWAP
+%define SPILLED(x) m %+ x
+%else
+%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
+%macro SPILL 2 ; xmm#, mempos
+    movaps SPILLED(%2), m%1
+%endmacro
+%macro UNSPILL 2
+    movaps m%1, SPILLED(%2)
+%endmacro
+%endif
+
+%macro DEFINE_FOUR_IMDCT 0
+cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
+    movlps  m0, [inq+64]
+    movhps  m0, [inq+64 +   72]
+    movlps  m3, [inq+64 + 2*72]
+    movhps  m3, [inq+64 + 3*72]
+
+    shufps  m5, m0, m3, 0xdd
+    shufps  m0, m0, m3, 0x88
+
+    mova     m1, [inq+48]
+    movu     m6, [inq+48 +   72]
+    mova     m7, [inq+48 + 2*72]
+    movu     m3, [inq+48 + 3*72]
+
+    TRANSPOSE4x4PS 1, 6, 7, 3, 4
+
+    addps   m4, m6, m7
+    mova    [tmpq+4*28], m4
+
+    addps    m7, m3
+    addps    m6, m1
+    addps    m3, m0
+    addps    m0, m5
+    addps    m0, m7
+    addps    m7, m6
+    mova    [tmpq+4*12], m7
+    SPILL   3, 12
+
+    mova     m4, [inq+32]
+    movu     m5, [inq+32 +   72]
+    mova     m2, [inq+32 + 2*72]
+    movu     m7, [inq+32 + 3*72]
+
+    TRANSPOSE4x4PS 4, 5, 2, 7, 3
+
+    addps   m1, m7
+    SPILL   1, 11
+
+    addps   m3, m5, m2
+    SPILL   3, 13
+
+    addps    m7, m2
+    addps    m5, m4
+    addps    m6, m7
+    mova    [tmpq], m6
+    addps   m7, m5
+    mova    [tmpq+4*16], m7
+
+    mova    m2, [inq+16]
+    movu    m7, [inq+16 +   72]
+    mova    m1, [inq+16 + 2*72]
+    movu    m6, [inq+16 + 3*72]
+
+    TRANSPOSE4x4PS 2, 7, 1, 6, 3
+
+    addps   m4, m6
+    addps   m6, m1
+    addps   m1, m7
+    addps   m7, m2
+    addps   m5, m6
+    SPILL   5, 15
+    addps   m6, m7
+    mulps   m6, [costabs + 16*2]
+    mova    [tmpq+4*8], m6
+    SPILL   1, 10
+    SPILL   0, 14
+
+    mova    m1, [inq]
+    movu    m6, [inq +   72]
+    mova    m3, [inq + 2*72]
+    movu    m5, [inq + 3*72]
+
+    TRANSPOSE4x4PS 1, 6, 3, 5, 0
+
+    addps    m2, m5
+    addps    m5, m3
+    addps    m7, m5
+    addps    m3, m6
+    addps    m6, m1
+    SPILL    7, 8
+    addps    m5, m6
+    SPILL    6, 9
+    addps    m6, m4, SPILLED(12)
+    subps    m6, m2
+    UNSPILL  7, 11
+    SPILL    5, 11
+    subps    m5, m1, m7
+    mulps    m7, [costabs + 16*5]
+    addps    m7, m1
+    mulps    m0, m6, [costabs + 16*6]
+    addps    m0, m5
+    mova     [tmpq+4*24], m0
+    addps    m6, m5
+    mova     [tmpq+4*4], m6
+    addps    m6, m4, m2
+    mulps    m6, [costabs + 16*1]
+    subps    m4, SPILLED(12)
+    mulps    m4, [costabs + 16*8]
+    addps    m2, SPILLED(12)
+    mulps    m2, [costabs + 16*3]
+    subps    m5, m7, m6
+    subps    m5, m2
+    addps    m6, m7
+    addps    m6, m4
+    addps    m7, m2
+    subps    m7, m4
+    mova     [tmpq+4*20], m7
+    mova     m2, [tmpq+4*28]
+    mova     [tmpq+4*28], m5
+    UNSPILL  7, 13
+    subps    m5, m7, m2
+    mulps    m5, [costabs + 16*7]
+    UNSPILL  1, 10
+    mulps    m1, [costabs + 16*2]
+    addps    m4, m3, m2
+    mulps    m4, [costabs + 16*4]
+    addps    m2, m7
+    addps    m7, m3
+    mulps    m7, [costabs]
+    subps    m3, m2
+    mulps    m3, [costabs + 16*2]
+    addps    m2, m7, m5
+    addps    m2, m1
+    SPILL    2, 10
+    addps    m7, m4
+    subps    m7, m1
+    SPILL    7, 12
+    subps    m5, m4
+    subps    m5, m1
+    UNSPILL  0, 14
+    SPILL    5, 13
+    addps    m1, m0, SPILLED(15)
+    subps    m1, SPILLED(8)
+    mova     m4, [costabs + 16*5]
+    mulps    m4, [tmpq]
+    UNSPILL  2, 9
+    addps    m4, m2
+    subps    m2, [tmpq]
+    mulps    m5, m1, [costabs + 16*6]
+    addps    m5, m2
+    SPILL    5, 9
+    addps    m2, m1
+    SPILL    2, 14
+    UNSPILL  5, 15
+    subps    m7, m5, m0
+    addps    m5, SPILLED(8)
+    mulps    m5, [costabs + 16*1]
+    mulps    m7, [costabs + 16*8]
+    addps    m0, SPILLED(8)
+    mulps    m0, [costabs + 16*3]
+    subps    m2, m4, m5
+    subps    m2, m0
+    SPILL    2, 15
+    addps    m5, m4
+    addps    m5, m7
+    addps    m4, m0
+    subps    m4, m7
+    SPILL    4, 8
+    mova     m7, [tmpq+4*16]
+    mova     m2, [tmpq+4*12]
+    addps    m0, m7, m2
+    subps    m0, SPILLED(11)
+    mulps    m0, [costabs + 16*2]
+    addps    m4, m7, SPILLED(11)
+    mulps    m4, [costabs]
+    subps    m7, m2
+    mulps    m7, [costabs + 16*7]
+    addps    m2, SPILLED(11)
+    mulps    m2, [costabs + 16*4]
+    addps    m1, m7, [tmpq+4*8]
+    addps    m1, m4
+    addps    m4, m2
+    subps    m4, [tmpq+4*8]
+    SPILL    4, 11
+    subps    m7, m2
+    subps    m7, [tmpq+4*8]
+    addps    m4, m6, SPILLED(10)
+    subps    m6, SPILLED(10)
+    addps    m2, m5, m1
+    mulps    m2, [costabs + 16*9]
+    subps    m5, m1
+    mulps    m5, [costabs + 16*17]
+    subps    m1, m4, m2
+    addps    m4, m2
+    mulps    m2, m1, [winq+4*36]
+    addps    m2, [bufq+4*36]
+    mova     [outq+1152], m2
+    mulps    m1, [winq+4*32]
+    addps    m1, [bufq+4*32]
+    mova     [outq+1024], m1
+    mulps    m1, m4, [winq+4*116]
+    mova     [bufq+4*36], m1
+    mulps    m4, [winq+4*112]
+    mova     [bufq+4*32], m4
+    addps    m2, m6, m5
+    subps    m6, m5
+    mulps    m1, m6, [winq+4*68]
+    addps    m1, [bufq+4*68]
+    mova     [outq+2176], m1
+    mulps    m6, [winq]
+    addps    m6, [bufq]
+    mova     [outq], m6
+    mulps    m1, m2, [winq+4*148]
+    mova     [bufq+4*68], m1
+    mulps    m2, [winq+4*80]
+    mova     [bufq], m2
+    addps    m5, m3, [tmpq+4*24]
+    mova     m2, [tmpq+4*24]
+    subps    m2, m3
+    mova     m1, SPILLED(9)
+    subps    m1, m0
+    mulps    m1, [costabs + 16*10]
+    addps    m0, SPILLED(9)
+    mulps    m0, [costabs + 16*16]
+    addps    m6, m5, m1
+    subps    m5, m1
+    mulps    m3, m5, [winq+4*40]
+    addps    m3, [bufq+4*40]
+    mova     [outq+1280], m3
+    mulps    m5, [winq+4*28]
+    addps    m5, [bufq+4*28]
+    mova     [outq+896], m5
+    mulps    m1, m6, [winq+4*120]
+    mova     [bufq+4*40], m1
+    mulps    m6, [winq+4*108]
+    mova     [bufq+4*28], m6
+    addps    m1, m2, m0
+    subps    m2, m0
+    mulps    m5, m2, [winq+4*64]
+    addps    m5, [bufq+4*64]
+    mova     [outq+2048], m5
+    mulps    m2, [winq+4*4]
+    addps    m2, [bufq+4*4]
+    mova     [outq+128], m2
+    mulps    m0, m1, [winq+4*144]
+    mova     [bufq+4*64], m0
+    mulps    m1, [winq+4*84]
+    mova     [bufq+4*4], m1
+    mova     m1, [tmpq+4*28]
+    mova     m5, m1
+    addps    m1, SPILLED(13)
+    subps    m5, SPILLED(13)
+    UNSPILL  3, 15
+    addps    m2, m7, m3
+    mulps    m2, [costabs + 16*11]
+    subps    m3, m7
+    mulps    m3, [costabs + 16*15]
+    addps    m0, m2, m1
+    subps    m1, m2
+    SWAP     m0, m2
+    mulps    m6, m1, [winq+4*44]
+    addps    m6, [bufq+4*44]
+    mova     [outq+1408], m6
+    mulps    m1, [winq+4*24]
+    addps    m1, [bufq+4*24]
+    mova     [outq+768], m1
+    mulps    m0, m2, [winq+4*124]
+    mova     [bufq+4*44], m0
+    mulps    m2, [winq+4*104]
+    mova     [bufq+4*24], m2
+    addps    m0, m5, m3
+    subps    m5, m3
+    mulps    m1, m5, [winq+4*60]
+    addps    m1, [bufq+4*60]
+    mova     [outq+1920], m1
+    mulps    m5, [winq+4*8]
+    addps    m5, [bufq+4*8]
+    mova     [outq+256], m5
+    mulps    m1, m0, [winq+4*140]
+    mova     [bufq+4*60], m1
+    mulps    m0, [winq+4*88]
+    mova     [bufq+4*8], m0
+    mova     m1, [tmpq+4*20]
+    addps    m1, SPILLED(12)
+    mova     m2, [tmpq+4*20]
+    subps    m2, SPILLED(12)
+    UNSPILL  7, 8
+    subps    m0, m7, SPILLED(11)
+    addps    m7, SPILLED(11)
+    mulps    m4, m7, [costabs + 16*12]
+    mulps    m0, [costabs + 16*14]
+    addps    m5, m1, m4
+    subps    m1, m4
+    mulps    m7, m1, [winq+4*48]
+    addps    m7, [bufq+4*48]
+    mova     [outq+1536], m7
+    mulps    m1, [winq+4*20]
+    addps    m1, [bufq+4*20]
+    mova     [outq+640], m1
+    mulps    m1, m5, [winq+4*128]
+    mova     [bufq+4*48], m1
+    mulps    m5, [winq+4*100]
+    mova     [bufq+4*20], m5
+    addps    m6, m2, m0
+    subps    m2, m0
+    mulps    m1, m2, [winq+4*56]
+    addps    m1, [bufq+4*56]
+    mova     [outq+1792], m1
+    mulps    m2, [winq+4*12]
+    addps    m2, [bufq+4*12]
+    mova     [outq+384], m2
+    mulps    m0, m6, [winq+4*136]
+    mova    [bufq+4*56], m0
+    mulps    m6, [winq+4*92]
+    mova     [bufq+4*12], m6
+    UNSPILL  0, 14
+    mulps    m0, [costabs + 16*13]
+    mova     m3, [tmpq+4*4]
+    addps    m2, m0, m3
+    subps    m3, m0
+    mulps    m0, m3, [winq+4*52]
+    addps    m0, [bufq+4*52]
+    mova     [outq+1664], m0
+    mulps    m3, [winq+4*16]
+    addps    m3, [bufq+4*16]
+    mova     [outq+512], m3
+    mulps    m0, m2, [winq+4*132]
+    mova     [bufq+4*52], m0
+    mulps    m2, [winq+4*96]
+    mova     [bufq+4*16], m2
+    RET
+%endmacro
+
+INIT_XMM sse
+DEFINE_FOUR_IMDCT
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEFINE_FOUR_IMDCT
+%endif
--- a/externals/ffmpeg/libavcodec/x86/inline_asm.h
+++ b/externals/ffmpeg/libavcodec/x86/inline_asm.h
@@ -0,0 +1,100 @@
+/*
+ * inline assembly helper macros
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_INLINE_ASM_H
+#define AVCODEC_X86_INLINE_ASM_H
+
+#include "constants.h"
+
+#define MOVQ_WONE(regd) \
+    __asm__ volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+    "psrlw $15, %%" #regd ::)
+
+#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
+
+#define MOVQ_BFE(regd)                                  \
+    __asm__ volatile (                                  \
+        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
+        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
+
+#ifndef PIC
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
+#else
+// for shared library it's better to use this way for accessing constants
+// pcmpeqd -> -1
+#define MOVQ_WTWO(regd)                                 \
+    __asm__ volatile (                                  \
+        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
+        "psrlw         $15, %%"#regd"   \n\t"           \
+        "psllw          $1, %%"#regd"   \n\t"::)
+
+#endif
+
+// using regr as temporary and for the output result
+// first argument is unmodified and second is trashed
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
+    "movq   "#rega", "#regr"            \n\t"                    \
+    "pand   "#regb", "#regr"            \n\t"                    \
+    "pxor   "#rega", "#regb"            \n\t"                    \
+    "pand  "#regfe", "#regb"            \n\t"                    \
+    "psrlq       $1, "#regb"            \n\t"                    \
+    "paddb  "#regb", "#regr"            \n\t"
+
+#define PAVGB_MMX(rega, regb, regr, regfe)                       \
+    "movq   "#rega", "#regr"            \n\t"                    \
+    "por    "#regb", "#regr"            \n\t"                    \
+    "pxor   "#rega", "#regb"            \n\t"                    \
+    "pand  "#regfe", "#regb"            \n\t"                    \
+    "psrlq       $1, "#regb"            \n\t"                    \
+    "psubb  "#regb", "#regr"            \n\t"
+
+// mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
+    "movq  "#rega", "#regr"             \n\t"                    \
+    "movq  "#regc", "#regp"             \n\t"                    \
+    "pand  "#regb", "#regr"             \n\t"                    \
+    "pand  "#regd", "#regp"             \n\t"                    \
+    "pxor  "#rega", "#regb"             \n\t"                    \
+    "pxor  "#regc", "#regd"             \n\t"                    \
+    "pand    %%mm6, "#regb"             \n\t"                    \
+    "pand    %%mm6, "#regd"             \n\t"                    \
+    "psrlq      $1, "#regb"             \n\t"                    \
+    "psrlq      $1, "#regd"             \n\t"                    \
+    "paddb "#regb", "#regr"             \n\t"                    \
+    "paddb "#regd", "#regp"             \n\t"
+
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
+    "movq  "#rega", "#regr"             \n\t"                    \
+    "movq  "#regc", "#regp"             \n\t"                    \
+    "por   "#regb", "#regr"             \n\t"                    \
+    "por   "#regd", "#regp"             \n\t"                    \
+    "pxor  "#rega", "#regb"             \n\t"                    \
+    "pxor  "#regc", "#regd"             \n\t"                    \
+    "pand    %%mm6, "#regb"             \n\t"                    \
+    "pand    %%mm6, "#regd"             \n\t"                    \
+    "psrlq      $1, "#regd"             \n\t"                    \
+    "psrlq      $1, "#regb"             \n\t"                    \
+    "psubb "#regb", "#regr"             \n\t"                    \
+    "psubb "#regd", "#regp"             \n\t"
+
+#endif /* AVCODEC_X86_INLINE_ASM_H */
--- a/externals/ffmpeg/libavcodec/x86/jpeg2000dsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/jpeg2000dsp.asm
@@ -0,0 +1,164 @@
+;******************************************************************************
+;* SIMD-optimized JPEG2000 DSP functions
+;* Copyright (c) 2014 Nicolas Bertrand
+;* Copyright (c) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pf_ict0: times 8 dd 1.402
+pf_ict1: times 8 dd 0.34413
+pf_ict2: times 8 dd 0.71414
+pf_ict3: times 8 dd 1.772
+
+SECTION .text
+
+;***********************************************************************
+; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
+;***********************************************************************
+%macro ICT_FLOAT 1
+cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
+    shl  csized, 2
+    add   src0q, csizeq
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq
+    movaps   m6, [pf_ict0]
+    movaps   m7, [pf_ict1]
+    %define ICT0 m6
+    %define ICT1 m7
+
+%if ARCH_X86_64
+    movaps   m8, [pf_ict2]
+    %define ICT2 m8
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]
+    %define ICT3 m3
+%else
+    movaps   m9, [pf_ict3]
+    %define ICT3 m9
+%endif
+
+%else ; ARCH_X86_32
+    %define ICT2 [pf_ict2]
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]
+    %define ICT3 m3
+%else
+    %define ICT3 [pf_ict3]
+%endif
+
+%endif ; ARCH
+
+align 16
+.loop:
+    movaps   m0, [src0q+csizeq]
+    movaps   m1, [src1q+csizeq]
+    movaps   m2, [src2q+csizeq]
+
+%if cpuflag(fma4) || cpuflag(fma3)
+%if cpuflag(fma4)
+    fnmaddps  m5, m1, ICT1, m0
+    fmaddps   m4, m2, ICT0, m0
+%else ; fma3
+    movaps    m5, m1
+    movaps    m4, m2
+    fnmaddps  m5, m5, ICT1, m0
+    fmaddps   m4, m4, ICT0, m0
+%endif
+    fmaddps   m0, m1, ICT3, m0
+    fnmaddps  m5, m2, ICT2, m5
+%else ; non FMA
+%if cpuflag(avx)
+    mulps    m5, m1, ICT1
+    mulps    m4, m2, ICT0
+    mulps    m1, m1, ICT3
+    mulps    m2, m2, ICT2
+    subps    m5, m0, m5
+%else ; sse
+    movaps   m3, m1
+    movaps   m4, m2
+    movaps   m5, m0
+    mulps    m3, ICT1
+    mulps    m4, ICT0
+    mulps    m1, ICT3
+    mulps    m2, ICT2
+    subps    m5, m3
+%endif
+    addps    m4, m4, m0
+    addps    m0, m0, m1
+    subps    m5, m5, m2
+%endif
+
+    movaps   [src0q+csizeq], m4
+    movaps   [src2q+csizeq], m0
+    movaps   [src1q+csizeq], m5
+    add  csizeq, mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+ICT_FLOAT 10
+INIT_YMM avx
+ICT_FLOAT 9
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+ICT_FLOAT 9
+%endif
+INIT_YMM fma3
+ICT_FLOAT 9
+
+;***************************************************************************
+; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
+;***************************************************************************
+%macro RCT_INT 0
+cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
+    shl  csized, 2
+    add   src0q, csizeq
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq
+
+align 16
+.loop:
+    mova   m1, [src1q+csizeq]
+    mova   m2, [src2q+csizeq]
+    mova   m0, [src0q+csizeq]
+    paddd  m3, m1, m2
+    psrad  m3, 2
+    psubd  m0, m3
+    paddd  m1, m0
+    paddd  m2, m0
+    mova   [src1q+csizeq], m0
+    mova   [src2q+csizeq], m1
+    mova   [src0q+csizeq], m2
+    add  csizeq, mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+RCT_INT
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RCT_INT
+%endif
--- a/externals/ffmpeg/libavcodec/x86/jpeg2000dsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/jpeg2000dsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * SIMD optimized JPEG 2000 DSP functions
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/jpeg2000dsp.h"
+
+void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_fma3(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_fma4(void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+
+av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_sse;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
+    }
+
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_avx;
+    }
+
+    if (EXTERNAL_FMA4(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_fma4;
+    }
+
+    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_fma3;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/lossless_audiodsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/lossless_audiodsp.asm
@@ -0,0 +1,190 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+    HADDD   m6, m0
+    movd   eax, m6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+INIT_XMM sse4
+; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+    SPLATW  m7, m7
+    pxor    m6, m6
+    add v1q, orderq
+    lea v2q, [v2q + 2*orderq]
+    add v3q, orderq
+    neg orderq
+.loop:
+    mova    m3, [v1q + orderq]
+    movu    m0, [v2q + 2*orderq]
+    pmovsxwd m4, m3
+    movu    m1, [v2q + 2*orderq + mmsize]
+    movhlps m5, m3
+    movu    m2, [v3q + orderq]
+    pmovsxwd m5, m5
+    pmullw  m2, m7
+    pmulld  m0, m4
+    pmulld  m1, m5
+    paddw   m2, m3
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    add     orderq, 16
+    jl .loop
+    HADDD   m6, m0
+    movd   eax, m6
+    RET
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    %define t0  [v1q + orderq]
+    %define t1  [v1q + orderq + mmsize]
+%if ARCH_X86_64
+    mova    m8, t0
+    mova    m9, t1
+    %define t0  m8
+    %define t1  m9
+%endif
+    pmaddwd m0, t0
+    pmaddwd m1, t1
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, t0
+    paddw   m3, t1
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov    r4d, v2d
+    and    r4d, 15
+    and    v2q, ~15
+    and    v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp    r4d, 0
+    je .loop0
+    cmp    r4d, 2
+    je .loop2
+    cmp    r4d, 4
+    je .loop4
+    cmp    r4d, 6
+    je .loop6
+    cmp    r4d, 8
+    je .loop8
+    cmp    r4d, 10
+    je .loop10
+    cmp    r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    HADDD   m6, m0
+    movd   eax, m6
+    RET
--- a/externals/ffmpeg/libavcodec/x86/lossless_audiodsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/lossless_audiodsp_init.c
@@ -0,0 +1,56 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/lossless_audiodsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul);
+
+int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+
+av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+
+    if (EXTERNAL_SSSE3(cpu_flags) &&
+        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
+#endif
+}
--- a/externals/ffmpeg/libavcodec/x86/lossless_videodsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,406 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;* Copyright (c) 2017 Jokyo Images
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_ef: times 8 db 14,15
+pb_67: times 8 db  6, 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+;                                const uint8_t *diff, int w,
+;                                int *left, int *left_top)
+;------------------------------------------------------------------------------
+%macro MEDIAN_PRED 0
+cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+    movu    m0, [topq]
+    mova    m2, m0
+    movd    m4, [left_topq]
+    LSHIFT  m2, 1
+    mova    m1, m0
+    por     m4, m2
+    movd    m3, [leftq]
+    psubb   m0, m4 ; t-tl
+    add    dstq, wq
+    add    topq, wq
+    add   diffq, wq
+    neg      wq
+    jmp .skip
+.loop:
+    movu    m4, [topq+wq]
+    mova    m0, m4
+    LSHIFT  m4, 1
+    por     m4, m1
+    mova    m1, m0 ; t
+    psubb   m0, m4 ; t-tl
+.skip:
+    movu    m2, [diffq+wq]
+%assign i 0
+%rep mmsize
+    mova    m4, m0
+    paddb   m4, m3 ; t-tl+l
+    mova    m5, m3
+    pmaxub  m3, m1
+    pminub  m5, m1
+    pminub  m3, m4
+    pmaxub  m3, m5 ; median
+    paddb   m3, m2 ; +residual
+%if i==0
+    mova    m7, m3
+    LSHIFT  m7, mmsize-1
+%else
+    mova    m6, m3
+    RSHIFT  m7, 1
+    LSHIFT  m6, mmsize-1
+    por     m7, m6
+%endif
+%if i<mmsize-1
+    RSHIFT  m0, 1
+    RSHIFT  m1, 1
+    RSHIFT  m2, 1
+%endif
+%assign i i+1
+%endrep
+    movu [dstq+wq], m7
+    add      wq, mmsize
+    jl .loop
+    movzx   r2d, byte [dstq-1]
+    mov [leftq], r2d
+    movzx   r2d, byte [topq-1]
+    mov [left_topq], r2d
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+MEDIAN_PRED
+%endif
+INIT_XMM sse2
+MEDIAN_PRED
+
+
+%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+    pshufb  xm0, xm5
+%if %2
+    mova    m1, [srcq+wq]
+%else
+    movu    m1, [srcq+wq]
+%endif
+    psllw   m2, m1, 8
+    paddb   m1, m2
+    pshufb  m2, m1, m3
+    paddb   m1, m2
+    pshufb  m2, m1, m4
+    paddb   m1, m2
+%if mmsize >= 16
+    pshufb  m2, m1, m6
+    paddb   m1, m2
+%endif
+    paddb   xm0, xm1
+%if %1
+    mova    [dstq+wq], xm0
+%else
+    movq    [dstq+wq], xm0
+    movhps  [dstq+wq+8], xm0
+%endif
+
+%if mmsize == 32
+    vextracti128    xm2, m1, 1 ; get second lane of the ymm
+    pshufb          xm0, xm5   ; set alls val to last val of the first lane
+    paddb           xm0, xm2
+;store val
+%if %1
+    mova    [dstq+wq+16], xm0
+%else;
+    movq    [dstq+wq+16], xm0
+    movhps  [dstq+wq+16+8], xm0
+%endif
+%endif
+    add     wq, mmsize
+    jl %%.loop
+%if mmsize == 32
+    movzx   eax, byte [dstq - 1]
+%else;
+    mov     eax, mmsize-1
+    sub     eax, wd
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+%endif
+    RET
+%endmacro
+
+;------------------------------------------------------------------------------
+; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
+;------------------------------------------------------------------------------
+INIT_MMX ssse3
+cglobal add_left_pred, 3,3,7, dst, src, w, left
+.skip_prologue:
+    mova    m5, [pb_7]
+    mova    m4, [pb_zzzz3333zzzzbbbb]
+    mova    m3, [pb_zz11zz55zz99zzdd]
+    movd    m0, leftm
+    psllq   m0, 56
+    ADD_LEFT_LOOP 1, 1
+
+%macro ADD_LEFT_PRED_UNALIGNED 0
+cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
+    mova    xm5, [pb_15]
+    VBROADCASTI128    m6, [pb_zzzzzzzz77777777]
+    VBROADCASTI128    m4, [pb_zzzz3333zzzzbbbb]
+    VBROADCASTI128    m3, [pb_zz11zz55zz99zzdd]
+    movd    xm0, leftm
+    pslldq  xm0, 15
+    test    srcq, mmsize - 1
+    jnz .src_unaligned
+    test    dstq, mmsize - 1
+    jnz .dst_unaligned
+    ADD_LEFT_LOOP 1, 1
+.dst_unaligned:
+    ADD_LEFT_LOOP 0, 1
+.src_unaligned:
+    ADD_LEFT_LOOP 0, 0
+%endmacro
+
+INIT_XMM ssse3
+ADD_LEFT_PRED_UNALIGNED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_LEFT_PRED_UNALIGNED
+%endif
+
+;------------------------------------------------------------------------------
+; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+;------------------------------------------------------------------------------
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize
+    jz  .2
+    add   dstq, sizeq
+    add   srcq, sizeq
+    neg  sizeq
+.1:
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1
+.2:
+    and     wq, 2*mmsize-1
+    jz    .end
+    add   dstq, wq
+    add   srcq, wq
+    neg     wq
+.3:
+    mov  sizeb, [srcq + wq]
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES
+%endif
+INIT_XMM sse2
+ADD_BYTES
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_BYTES
+%endif
+
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+    add     wd, wd
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+    mov%2   m1, [srcq+wq]
+    mova    m2, m1
+    pslld   m1, 16
+    paddw   m1, m2
+    mova    m2, m1
+
+    pshufb  m1, m3
+    paddw   m1, m2
+    pshufb  m0, m5
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m4
+    paddw   m1, m2
+%endif
+    paddw   m0, m1
+    pand    m0, m7
+%ifidn %1, a
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    mov     eax, mmsize-1
+    sub     eax, wd
+    mov     wd, eax
+    shl     wd, 8
+    lea     eax, [wd+eax-1]
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+    RET
+%endmacro
+
+;---------------------------------------------------------------------------------------------
+; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
+;---------------------------------------------------------------------------------------------
+INIT_MMX ssse3
+cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+    mova    m5, [pb_67]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    psllq   m0, 48
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM ssse3
+cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
+    mova    m5, [pb_ef]
+    mova    m4, [pb_zzzzzzzz67676767]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    pslldq  m0, 14
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    test    srcq, 15
+    jnz .src_unaligned
+    test    dstq, 15
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+
+;---------------------------------------------------------------------------------------------
+; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
+;---------------------------------------------------------------------------------------------
+%macro ADD_GRADIENT_PRED 0
+cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
+    mova         xm0, [pb_15]
+
+;load src - 1 in xm1
+    movd         xm1, [srcq-1]
+%if cpuflag(avx2)
+    vpbroadcastb xm1, xm1
+%else
+    pxor         xm2, xm2
+    pshufb       xm1, xm2
+%endif
+
+    add    srcq, widthq
+    neg  widthq
+    neg strideq
+
+.loop:
+    lea    tmpq, [srcq + strideq]
+    mova     m2, [tmpq + widthq] ; A = src[x-stride]
+    movu     m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
+    mova     m4, [srcq + widthq] ; current val (src[x])
+
+    psubb    m2, m3; A - B
+
+; prefix sum A-B
+    pslldq   m3, m2, 1
+    paddb    m2, m3
+    pslldq   m3, m2, 2
+    paddb    m2, m3
+    pslldq   m3, m2, 4
+    paddb    m2, m3
+    pslldq   m3, m2, 8
+    paddb    m2, m3
+
+; prefix sum current val
+    pslldq   m3, m4, 1
+    paddb    m4, m3
+    pslldq   m3, m4, 2
+    paddb    m4, m3
+    pslldq   m3, m4, 4
+    paddb    m4, m3
+    pslldq   m3, m4, 8
+    paddb    m4, m3
+
+; last sum
+    paddb                    m2, m4 ; current + (A - B)
+
+    paddb                   xm1, xm2 ; += C
+    mova        [srcq + widthq], xm1 ; store
+
+    pshufb                  xm1, xm0 ; put last val in all val of xm1
+
+%if mmsize == 32
+    vextracti128            xm2, m2, 1 ; get second lane of the ymm
+    paddb                   xm1, xm2; += C
+
+    mova   [srcq + widthq + 16], xm1 ; store
+    pshufb                  xm1, xm0 ; put last val in all val of m1
+%endif
+
+    add         widthq, mmsize
+    jl .loop
+    RET
+
+%endmacro
+
+INIT_XMM ssse3
+ADD_GRADIENT_PRED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_GRADIENT_PRED
+%endif
--- a/externals/ffmpeg/libavcodec/x86/lossless_videodsp_init.c
+++ b/externals/ffmpeg/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,128 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/x86/asm.h"
+#include "../lossless_videodsp.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+
+void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+                               const uint8_t *diff, ptrdiff_t w,
+                               int *left, int *left_top);
+void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+                             const uint8_t *diff, ptrdiff_t w,
+                             int *left, int *left_top);
+
+int  ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
+                            ptrdiff_t w, int left);
+int  ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t w, int left);
+int  ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t w, int left);
+
+int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+
+void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                 const uint8_t *diff, ptrdiff_t w,
+                                 int *left, int *left_top)
+{
+    x86_reg w2 = -w;
+    x86_reg x;
+    int l  = *left     & 0xff;
+    int tl = *left_top & 0xff;
+    int t;
+    __asm__ volatile (
+        "mov          %7, %3            \n"
+        "1:                             \n"
+        "movzbl (%3, %4), %2            \n"
+        "mov          %2, %k3           \n"
+        "sub         %b1, %b3           \n"
+        "add         %b0, %b3           \n"
+        "mov          %2, %1            \n"
+        "cmp          %0, %2            \n"
+        "cmovg        %0, %2            \n"
+        "cmovg        %1, %0            \n"
+        "cmp         %k3, %0            \n"
+        "cmovg       %k3, %0            \n"
+        "mov          %7, %3            \n"
+        "cmp          %2, %0            \n"
+        "cmovl        %2, %0            \n"
+        "add    (%6, %4), %b0           \n"
+        "mov         %b0, (%5, %4)      \n"
+        "inc          %4                \n"
+        "jl           1b                \n"
+        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
+        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
+    );
+    *left     = l;
+    *left_top = tl;
+}
+#endif
+
+void ff_llviddsp_init_x86(LLVidDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+    if (cpu_flags & AV_CPU_FLAG_CMOV)
+        c->add_median_pred = add_median_pred_cmov;
+#endif
+
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->add_bytes = ff_add_bytes_mmx;
+    }
+
+    if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
+        /* slower than cmov version on AMD */
+        if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
+            c->add_median_pred = ff_add_median_pred_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes       = ff_add_bytes_sse2;
+        c->add_median_pred = ff_add_median_pred_sse2;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->add_left_pred = ff_add_left_pred_ssse3;
+        c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
+    }
+
+    if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
+        c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
+        c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->add_bytes       = ff_add_bytes_avx2;
+        c->add_left_pred   = ff_add_left_pred_unaligned_avx2;
+        c->add_gradient_pred = ff_add_gradient_pred_avx2;
+    }
+}
--- a/externals/ffmpeg/libavcodec/x86/lossless_videoencdsp.asm
+++ b/externals/ffmpeg/libavcodec/x86/lossless_videoencdsp.asm
@@ -0,0 +1,194 @@
+;************************************************************************
+;* SIMD-optimized lossless video encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pb_80
+
+SECTION .text
+
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                    intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
+%if ARCH_X86_32
+cglobal diff_bytes, 3,5,2, dst, src1, src2
+%define wq r4q
+    DECLARE_REG_TMP 3
+    mov               wq, r3mp
+%else
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
+    DECLARE_REG_TMP 4
+%endif ; ARCH_X86_32
+%define i t0q
+%endmacro
+
+; labels to jump to if w < regsize and w < 0
+%macro DIFF_BYTES_LOOP_PREP 2
+    mov                i, wq
+    and                i, -2 * regsize
+        js            %2
+        jz            %1
+    add             dstq, i
+    add            src1q, i
+    add            src2q, i
+    neg                i
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if mmsize != 16
+    mov%1             %3, [src1q + i]
+    mov%1             %4, [src1q + i + regsize]
+    psubb             %3, [src2q + i]
+    psubb             %4, [src2q + i + regsize]
+    mov%2           [dstq + i], %3
+    mov%2 [regsize + dstq + i], %4
+%else
+    ; SSE enforces alignment of psubb operand
+    mov%1             %3, [src1q + i]
+    movu              %4, [src2q + i]
+    psubb             %3, %4
+    mov%2     [dstq + i], %3
+    mov%1             %3, [src1q + i + regsize]
+    movu              %4, [src2q + i + regsize]
+    psubb             %3, %4
+    mov%2 [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+    %define regsize mmsize
+.loop_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+    add                i, 2 * regsize
+        jl    .loop_%1%2
+.skip_main_%1%2:
+    and               wq, 2 * regsize - 1
+        jz     .end_%1%2
+%if mmsize > 16
+    ; fall back to narrower xmm
+    %define regsize (mmsize / 2)
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa, .end_aa
+.loop2_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+    add                i, 2 * regsize
+        jl   .loop2_%1%2
+.setup_loop_gpr_%1%2:
+    and               wq, 2 * regsize - 1
+        jz     .end_%1%2
+%endif
+    add             dstq, wq
+    add            src1q, wq
+    add            src2q, wq
+    neg               wq
+.loop_gpr_%1%2:
+    mov              t0b, [src1q + wq]
+    sub              t0b, [src2q + wq]
+    mov      [dstq + wq], t0b
+    inc               wq
+        jl .loop_gpr_%1%2
+.end_%1%2:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
+    DIFF_BYTES_BODY    a, a
+%undef i
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
+    test            dstq, regsize - 1
+        jnz     .loop_uu
+    test           src1q, regsize - 1
+        jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+%undef i
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    ; Directly using unaligned SSE2 version is marginally faster than
+    ; branching based on arguments.
+    DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
+    test            dstq, regsize - 1
+        jnz     .loop_uu
+    test           src1q, regsize - 1
+        jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+%undef i
+%endif
+
+
+;--------------------------------------------------------------------------------------------------
+;void sub_left_predict(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height)
+;--------------------------------------------------------------------------------------------------
+
+INIT_XMM avx
+cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
+    mova             m1, [pb_80] ; prev initial
+    add            dstq, widthq
+    add            srcq, widthq
+    lea              xd, [widthq-1]
+    neg          widthq
+    and              xd, 15
+    pinsrb           m4, m1, xd, 15
+    mov              xq, widthq
+
+    .loop:
+        movu                     m0, [srcq + widthq]
+        palignr                  m2, m0, m1, 15
+        movu                     m1, [srcq + widthq + 16]
+        palignr                  m3, m1, m0, 15
+        psubb                    m2, m0, m2
+        psubb                    m3, m1, m3
+        movu        [dstq + widthq], m2
+        movu   [dstq + widthq + 16], m3
+        add                  widthq, 2 * 16
+        jl .loop
+
+    add   srcq, strideq
+    sub   dstq, xq ; dst + width
+    test    xd, 16
+    jz .mod32
+    mova    m1, m0
+
+.mod32:
+    pshufb    m1, m4
+    mov   widthq, xq
+    dec  heightd
+    jg .loop
+    RET
--- a/Show More
+++ b/Show More