early-access version 1432

2021-02-09 04:25:58 +01:00
parent de64eab4b4
commit 3d5a9d908a
7336 changed files with 1773492 additions and 111 deletions
--- a/externals/ffmpeg/libswscale/x86/Makefile
+++ b/externals/ffmpeg/libswscale/x86/Makefile
@@ -0,0 +1,15 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
+OBJS                            += x86/rgb2rgb.o                        \
+                                   x86/swscale.o                        \
+                                   x86/yuv2rgb.o                        \
+
+MMX-OBJS                        += x86/hscale_fast_bilinear_simd.o      \
+
+OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
+
+X86ASM-OBJS                     += x86/input.o                          \
+                                   x86/output.o                         \
+                                   x86/scale.o                          \
+                                   x86/rgb_2_rgb.o                      \
+                                   x86/yuv_2_rgb.o                      \
--- a/externals/ffmpeg/libswscale/x86/hscale_fast_bilinear_simd.c
+++ b/externals/ffmpeg/libswscale/x86/hscale_fast_bilinear_simd.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../swscale_internal.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+
+#define RET 0xC3 // near return opcode for x86
+#define PREFETCH "prefetchnta"
+
+#if HAVE_INLINE_ASM
+av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
+                                       int16_t *filter, int32_t *filterPos,
+                                       int numSplits)
+{
+    uint8_t *fragmentA;
+    x86_reg imm8OfPShufW1A;
+    x86_reg imm8OfPShufW2A;
+    x86_reg fragmentLengthA;
+    uint8_t *fragmentB;
+    x86_reg imm8OfPShufW1B;
+    x86_reg imm8OfPShufW2B;
+    x86_reg fragmentLengthB;
+    int fragmentPos;
+
+    int xpos, i;
+
+    // create an optimized horizontal scaling routine
+    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
+     * pshufw instructions. For every four output pixels, if four input pixels
+     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
+     * used. If five input pixels are needed, then a chunk of fragmentA is used.
+     */
+
+    // code fragment
+
+    __asm__ volatile (
+        "jmp                         9f                 \n\t"
+        // Begin
+        "0:                                             \n\t"
+        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
+        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
+        "movd   1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1    \n\t"
+        "punpcklbw                %%mm7, %%mm1          \n\t"
+        "punpcklbw                %%mm7, %%mm0          \n\t"
+        "pshufw                   $0xFF, %%mm1, %%mm1   \n\t"
+        "1:                                             \n\t"
+        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
+        "2:                                             \n\t"
+        "psubw                    %%mm1, %%mm0          \n\t"
+        "movl   8(%%"FF_REG_b", %%"FF_REG_a"), %%esi    \n\t"
+        "pmullw                   %%mm3, %%mm0          \n\t"
+        "psllw                       $7, %%mm1          \n\t"
+        "paddw                    %%mm1, %%mm0          \n\t"
+
+        "movq                     %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"
+
+        "add                         $8, %%"FF_REG_a"   \n\t"
+        // End
+        "9:                                             \n\t"
+        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
+        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
+        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
+        "dec                         %1                 \n\t"
+        "dec                         %2                 \n\t"
+        "sub                         %0, %1             \n\t"
+        "sub                         %0, %2             \n\t"
+        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
+        "sub                         %0, %3             \n\t"
+
+
+        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
+          "=r" (fragmentLengthA)
+        );
+
+    __asm__ volatile (
+        "jmp                         9f                 \n\t"
+        // Begin
+        "0:                                             \n\t"
+        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
+        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
+        "punpcklbw                %%mm7, %%mm0          \n\t"
+        "pshufw                   $0xFF, %%mm0, %%mm1   \n\t"
+        "1:                                             \n\t"
+        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
+        "2:                                             \n\t"
+        "psubw                    %%mm1, %%mm0          \n\t"
+        "movl   8(%%"FF_REG_b", %%"FF_REG_a"), %%esi    \n\t"
+        "pmullw                   %%mm3, %%mm0          \n\t"
+        "psllw                       $7, %%mm1          \n\t"
+        "paddw                    %%mm1, %%mm0          \n\t"
+
+        "movq                     %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"
+
+        "add                         $8, %%"FF_REG_a"   \n\t"
+        // End
+        "9:                                             \n\t"
+        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
+        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
+        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
+        "dec                         %1                 \n\t"
+        "dec                         %2                 \n\t"
+        "sub                         %0, %1             \n\t"
+        "sub                         %0, %2             \n\t"
+        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
+        "sub                         %0, %3             \n\t"
+
+
+        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
+          "=r" (fragmentLengthB)
+        );
+
+    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
+    fragmentPos = 0;
+
+    for (i = 0; i < dstW / numSplits; i++) {
+        int xx = xpos >> 16;
+
+        if ((i & 3) == 0) {
+            int a                  = 0;
+            int b                  = ((xpos + xInc) >> 16) - xx;
+            int c                  = ((xpos + xInc * 2) >> 16) - xx;
+            int d                  = ((xpos + xInc * 3) >> 16) - xx;
+            int inc                = (d + 1 < 4);
+            uint8_t *fragment      = inc ? fragmentB : fragmentA;
+            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
+            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
+            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
+            int maxShift           = 3 - (d + inc);
+            int shift              = 0;
+
+            if (filterCode) {
+                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
+                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
+                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
+                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
+                filterPos[i / 2] = xx;
+
+                memcpy(filterCode + fragmentPos, fragment, fragmentLength);
+
+                filterCode[fragmentPos + imm8OfPShufW1] =  (a + inc)       |
+                                                          ((b + inc) << 2) |
+                                                          ((c + inc) << 4) |
+                                                          ((d + inc) << 6);
+                filterCode[fragmentPos + imm8OfPShufW2] =  a | (b << 2) |
+                                                               (c << 4) |
+                                                               (d << 6);
+
+                if (i + 4 - inc >= dstW)
+                    shift = maxShift;               // avoid overread
+                else if ((filterPos[i / 2] & 3) <= maxShift)
+                    shift = filterPos[i / 2] & 3;   // align
+
+                if (shift && i >= shift) {
+                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
+                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
+                    filterPos[i / 2]                        -= shift;
+                }
+            }
+
+            fragmentPos += fragmentLength;
+
+            if (filterCode)
+                filterCode[fragmentPos] = RET;
+        }
+        xpos += xInc;
+    }
+    if (filterCode)
+        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16;  // needed to jump to the next part
+
+    return fragmentPos + 1;
+}
+
+void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
+                                 int dstWidth, const uint8_t *src,
+                                 int srcW, int xInc)
+{
+    int32_t *filterPos = c->hLumFilterPos;
+    int16_t *filter    = c->hLumFilter;
+    void    *mmxextFilterCode = c->lumMmxextFilterCode;
+    int i;
+#if ARCH_X86_64
+    uint64_t retsave;
+#else
+#if !HAVE_EBX_AVAILABLE
+    uint64_t ebxsave;
+#endif
+#endif
+
+    __asm__ volatile(
+#if ARCH_X86_64
+        "mov               -8(%%rsp), %%"FF_REG_a"    \n\t"
+        "mov            %%"FF_REG_a", %5              \n\t"  // retsave
+#else
+#if !HAVE_EBX_AVAILABLE
+        "mov            %%"FF_REG_b", %5              \n\t"  // ebxsave
+#endif
+#endif
+        "pxor                  %%mm7, %%mm7           \n\t"
+        "mov                      %0, %%"FF_REG_c"    \n\t"
+        "mov                      %1, %%"FF_REG_D"    \n\t"
+        "mov                      %2, %%"FF_REG_d"    \n\t"
+        "mov                      %3, %%"FF_REG_b"    \n\t"
+        "xor            %%"FF_REG_a", %%"FF_REG_a"    \n\t" // i
+        PREFETCH"      (%%"FF_REG_c")                 \n\t"
+        PREFETCH"    32(%%"FF_REG_c")                 \n\t"
+        PREFETCH"    64(%%"FF_REG_c")                 \n\t"
+
+#if ARCH_X86_64
+#define CALL_MMXEXT_FILTER_CODE \
+        "movl               (%%"FF_REG_b"), %%esi        \n\t"\
+        "call                          *%4               \n\t"\
+        "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi        \n\t"\
+        "add                  %%"FF_REG_S", %%"FF_REG_c" \n\t"\
+        "add                  %%"FF_REG_a", %%"FF_REG_D" \n\t"\
+        "xor                  %%"FF_REG_a", %%"FF_REG_a" \n\t"\
+
+#else
+#define CALL_MMXEXT_FILTER_CODE \
+        "movl               (%%"FF_REG_b"), %%esi        \n\t"\
+        "call                          *%4               \n\t"\
+        "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
+        "add                  %%"FF_REG_a", %%"FF_REG_D" \n\t"\
+        "xor                  %%"FF_REG_a", %%"FF_REG_a" \n\t"\
+
+#endif /* ARCH_X86_64 */
+
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+
+#if ARCH_X86_64
+        "mov                      %5, %%"FF_REG_a" \n\t"
+        "mov            %%"FF_REG_a", -8(%%rsp)    \n\t"
+#else
+#if !HAVE_EBX_AVAILABLE
+        "mov                      %5, %%"FF_REG_b" \n\t"
+#endif
+#endif
+        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
+           "m" (mmxextFilterCode)
+#if ARCH_X86_64
+          ,"m"(retsave)
+#else
+#if !HAVE_EBX_AVAILABLE
+          ,"m" (ebxsave)
+#endif
+#endif
+        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
+#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
+         ,"%"FF_REG_b
+#endif
+    );
+
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+        dst[i] = src[srcW-1]*128;
+}
+
+void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
+                                 int dstWidth, const uint8_t *src1,
+                                 const uint8_t *src2, int srcW, int xInc)
+{
+    int32_t *filterPos = c->hChrFilterPos;
+    int16_t *filter    = c->hChrFilter;
+    void    *mmxextFilterCode = c->chrMmxextFilterCode;
+    int i;
+#if ARCH_X86_64
+    DECLARE_ALIGNED(8, uint64_t, retsave);
+#else
+#if !HAVE_EBX_AVAILABLE
+    DECLARE_ALIGNED(8, uint64_t, ebxsave);
+#endif
+#endif
+    __asm__ volatile(
+#if ARCH_X86_64
+        "mov          -8(%%rsp), %%"FF_REG_a"    \n\t"
+        "mov       %%"FF_REG_a", %7              \n\t"  // retsave
+#else
+#if !HAVE_EBX_AVAILABLE
+        "mov       %%"FF_REG_b", %7              \n\t"  // ebxsave
+#endif
+#endif
+        "pxor             %%mm7, %%mm7           \n\t"
+        "mov                 %0, %%"FF_REG_c"    \n\t"
+        "mov                 %1, %%"FF_REG_D"    \n\t"
+        "mov                 %2, %%"FF_REG_d"    \n\t"
+        "mov                 %3, %%"FF_REG_b"    \n\t"
+        "xor          %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
+        PREFETCH"   (%%"FF_REG_c")               \n\t"
+        PREFETCH" 32(%%"FF_REG_c")               \n\t"
+        PREFETCH" 64(%%"FF_REG_c")               \n\t"
+
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        "xor          %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
+        "mov                    %5, %%"FF_REG_c" \n\t" // src2
+        "mov                    %6, %%"FF_REG_D" \n\t" // dst2
+        PREFETCH"   (%%"FF_REG_c")               \n\t"
+        PREFETCH" 32(%%"FF_REG_c")               \n\t"
+        PREFETCH" 64(%%"FF_REG_c")               \n\t"
+
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+
+#if ARCH_X86_64
+        "mov                    %7, %%"FF_REG_a" \n\t"
+        "mov          %%"FF_REG_a", -8(%%rsp)    \n\t"
+#else
+#if !HAVE_EBX_AVAILABLE
+        "mov %7, %%"FF_REG_b"    \n\t"
+#endif
+#endif
+        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
+           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
+#if ARCH_X86_64
+          ,"m"(retsave)
+#else
+#if !HAVE_EBX_AVAILABLE
+          ,"m" (ebxsave)
+#endif
+#endif
+        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
+#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
+         ,"%"FF_REG_b
+#endif
+    );
+
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+        dst1[i] = src1[srcW-1]*128;
+        dst2[i] = src2[srcW-1]*128;
+    }
+}
+#endif //HAVE_INLINE_ASM
--- a/externals/ffmpeg/libswscale/x86/input.asm
+++ b/externals/ffmpeg/libswscale/x86/input.asm
@@ -0,0 +1,740 @@
+;******************************************************************************
+;* x86-optimized input routines; does shuffling of packed
+;* YUV formats into individual planes, and converts RGB
+;* into YUV planes also.
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+%define RY 0x20DE
+%define GY 0x4087
+%define BY 0x0C88
+%define RU 0xECFF
+%define GU 0xDAC8
+%define BU 0x3838
+%define RV 0x3838
+%define GV 0xD0E3
+%define BV 0xF6E4
+
+rgb_Yrnd:        times 4 dd 0x80100        ;  16.5 << 15
+rgb_UVrnd:       times 4 dd 0x400100       ; 128.5 << 15
+%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
+%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
+%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
+%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
+%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
+%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
+%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
+%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
+%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
+%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
+%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
+%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
+
+%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
+%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
+%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
+%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
+%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
+%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
+%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
+%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
+%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
+%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
+%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
+%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
+
+; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
+; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
+; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
+; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
+; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
+; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
+; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
+; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
+; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
+; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
+; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
+; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
+
+; rgba_Ycoeff_rb:  times 4 dw RY, BY
+; rgba_Ycoeff_br:  times 4 dw BY, RY
+; rgba_Ycoeff_ga:  times 4 dw GY, 0
+; rgba_Ycoeff_ag:  times 4 dw 0,  GY
+; rgba_Ucoeff_rb:  times 4 dw RU, BU
+; rgba_Ucoeff_br:  times 4 dw BU, RU
+; rgba_Ucoeff_ga:  times 4 dw GU, 0
+; rgba_Ucoeff_ag:  times 4 dw 0,  GU
+; rgba_Vcoeff_rb:  times 4 dw RV, BV
+; rgba_Vcoeff_br:  times 4 dw BV, RV
+; rgba_Vcoeff_ga:  times 4 dw GV, 0
+; rgba_Vcoeff_ag:  times 4 dw 0,  GV
+
+shuf_rgb_12x4:   db 0, 0x80, 1, 0x80,  2, 0x80,  3, 0x80, \
+                    6, 0x80, 7, 0x80,  8, 0x80,  9, 0x80
+shuf_rgb_3x56:   db 2, 0x80, 3, 0x80,  4, 0x80,  5, 0x80, \
+                    8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; RGB to Y/UV.
+;
+; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
+; and
+; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
+;                      const uint8_t *unused, int w);
+;-----------------------------------------------------------------------------
+
+; %1 = nr. of XMM registers
+; %2 = rgb or bgr
+%macro RGB24_TO_Y_FN 2-3
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
+%if mmsize == 8
+    mova           m5, [%2_Ycoeff_12x4]
+    mova           m6, [%2_Ycoeff_3x56]
+%define coeff1 m5
+%define coeff2 m6
+%elif ARCH_X86_64
+    mova           m8, [%2_Ycoeff_12x4]
+    mova           m9, [%2_Ycoeff_3x56]
+%define coeff1 m8
+%define coeff2 m9
+%else ; x86-32 && mmsize == 16
+%define coeff1 [%2_Ycoeff_12x4]
+%define coeff2 [%2_Ycoeff_3x56]
+%endif ; x86-32/64 && mmsize == 8/16
+%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
+    jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
+%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+.body:
+%if cpuflag(ssse3)
+    mova           m7, [shuf_rgb_12x4]
+%define shuf_rgb1 m7
+%if ARCH_X86_64
+    mova          m10, [shuf_rgb_3x56]
+%define shuf_rgb2 m10
+%else ; x86-32
+%define shuf_rgb2 [shuf_rgb_3x56]
+%endif ; x86-32/64
+%endif ; cpuflag(ssse3)
+%if ARCH_X86_64
+    movsxd         wq, wd
+%endif
+    add            wq, wq
+    add          dstq, wq
+    neg            wq
+%if notcpuflag(ssse3)
+    pxor           m7, m7
+%endif ; !cpuflag(ssse3)
+    mova           m4, [rgb_Yrnd]
+.loop:
+%if cpuflag(ssse3)
+    movu           m0, [srcq+0]           ; (byte) { Bx, Gx, Rx }[0-3]
+    movu           m2, [srcq+12]          ; (byte) { Bx, Gx, Rx }[4-7]
+    pshufb         m1, m0, shuf_rgb2      ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    pshufb         m0, shuf_rgb1          ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    pshufb         m3, m2, shuf_rgb2      ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+    pshufb         m2, shuf_rgb1          ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+%else ; !cpuflag(ssse3)
+    movd           m0, [srcq+0]           ; (byte) { B0, G0, R0, B1 }
+    movd           m1, [srcq+2]           ; (byte) { R0, B1, G1, R1 }
+    movd           m2, [srcq+6]           ; (byte) { B2, G2, R2, B3 }
+    movd           m3, [srcq+8]           ; (byte) { R2, B3, G3, R3 }
+%if mmsize == 16 ; i.e. sse2
+    punpckldq      m0, m2                 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpckldq      m1, m3                 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    movd           m2, [srcq+12]          ; (byte) { B4, G4, R4, B5 }
+    movd           m3, [srcq+14]          ; (byte) { R4, B5, G5, R5 }
+    movd           m5, [srcq+18]          ; (byte) { B6, G6, R6, B7 }
+    movd           m6, [srcq+20]          ; (byte) { R6, B7, G7, R7 }
+    punpckldq      m2, m5                 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpckldq      m3, m6                 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; mmsize == 16
+    punpcklbw      m0, m7                 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpcklbw      m1, m7                 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    punpcklbw      m2, m7                 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpcklbw      m3, m7                 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; cpuflag(ssse3)
+    add          srcq, 3 * mmsize / 2
+    pmaddwd        m0, coeff1             ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
+    pmaddwd        m1, coeff2             ; (dword) { R0*RY, G1+GY + R1*RY, R2*RY, G3+GY + R3*RY }
+    pmaddwd        m2, coeff1             ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
+    pmaddwd        m3, coeff2             ; (dword) { R4*RY, G5+GY + R5*RY, R6*RY, G7+GY + R7*RY }
+    paddd          m0, m1                 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
+    paddd          m2, m3                 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
+    paddd          m0, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
+    paddd          m2, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
+    psrad          m0, 9
+    psrad          m2, 9
+    packssdw       m0, m2                 ; (word) { Y[0-7] }
+    mova    [dstq+wq], m0
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = rgb or bgr
+%macro RGB24_TO_UV_FN 2-3
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
+%if ARCH_X86_64
+    mova           m8, [%2_Ucoeff_12x4]
+    mova           m9, [%2_Ucoeff_3x56]
+    mova          m10, [%2_Vcoeff_12x4]
+    mova          m11, [%2_Vcoeff_3x56]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [%2_Ucoeff_12x4]
+%define coeffU2 [%2_Ucoeff_3x56]
+%define coeffV1 [%2_Vcoeff_12x4]
+%define coeffV2 [%2_Vcoeff_3x56]
+%endif ; x86-32/64
+%if ARCH_X86_64 && %0 == 3
+    jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
+%else ; ARCH_X86_64 && %0 == 3
+.body:
+%if cpuflag(ssse3)
+    mova           m7, [shuf_rgb_12x4]
+%define shuf_rgb1 m7
+%if ARCH_X86_64
+    mova          m12, [shuf_rgb_3x56]
+%define shuf_rgb2 m12
+%else ; x86-32
+%define shuf_rgb2 [shuf_rgb_3x56]
+%endif ; x86-32/64
+%endif ; cpuflag(ssse3)
+%if ARCH_X86_64
+    movsxd         wq, dword r5m
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add            wq, wq
+    add         dstUq, wq
+    add         dstVq, wq
+    neg            wq
+    mova           m6, [rgb_UVrnd]
+%if notcpuflag(ssse3)
+    pxor           m7, m7
+%endif
+.loop:
+%if cpuflag(ssse3)
+    movu           m0, [srcq+0]           ; (byte) { Bx, Gx, Rx }[0-3]
+    movu           m4, [srcq+12]          ; (byte) { Bx, Gx, Rx }[4-7]
+    pshufb         m1, m0, shuf_rgb2      ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    pshufb         m0, shuf_rgb1          ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+%else ; !cpuflag(ssse3)
+    movd           m0, [srcq+0]           ; (byte) { B0, G0, R0, B1 }
+    movd           m1, [srcq+2]           ; (byte) { R0, B1, G1, R1 }
+    movd           m4, [srcq+6]           ; (byte) { B2, G2, R2, B3 }
+    movd           m5, [srcq+8]           ; (byte) { R2, B3, G3, R3 }
+%if mmsize == 16
+    punpckldq      m0, m4                 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpckldq      m1, m5                 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    movd           m4, [srcq+12]          ; (byte) { B4, G4, R4, B5 }
+    movd           m5, [srcq+14]          ; (byte) { R4, B5, G5, R5 }
+%endif ; mmsize == 16
+    punpcklbw      m0, m7                 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpcklbw      m1, m7                 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+%endif ; cpuflag(ssse3)
+    pmaddwd        m2, m0, coeffV1        ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
+    pmaddwd        m3, m1, coeffV2        ; (dword) { R0*BV, G1*GV + R1*BV, R2*BV, G3*GV + R3*BV }
+    pmaddwd        m0, coeffU1            ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
+    pmaddwd        m1, coeffU2            ; (dword) { R0*BU, G1*GU + R1*BU, R2*BU, G3*GU + R3*BU }
+    paddd          m0, m1                 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
+    paddd          m2, m3                 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
+%if cpuflag(ssse3)
+    pshufb         m5, m4, shuf_rgb2      ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+    pshufb         m4, shuf_rgb1          ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+%else ; !cpuflag(ssse3)
+%if mmsize == 16
+    movd           m1, [srcq+18]          ; (byte) { B6, G6, R6, B7 }
+    movd           m3, [srcq+20]          ; (byte) { R6, B7, G7, R7 }
+    punpckldq      m4, m1                 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpckldq      m5, m3                 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; mmsize == 16 && !cpuflag(ssse3)
+    punpcklbw      m4, m7                 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpcklbw      m5, m7                 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; cpuflag(ssse3)
+    add          srcq, 3 * mmsize / 2
+    pmaddwd        m1, m4, coeffU1        ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
+    pmaddwd        m3, m5, coeffU2        ; (dword) { R4*BU, G5*GU + R5*BU, R6*BU, G7*GU + R7*BU }
+    pmaddwd        m4, coeffV1            ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
+    pmaddwd        m5, coeffV2            ; (dword) { R4*BV, G5*GV + R5*BV, R6*BV, G7*GV + R7*BV }
+    paddd          m1, m3                 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
+    paddd          m4, m5                 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
+    paddd          m0, m6                 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
+    paddd          m2, m6                 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
+    paddd          m1, m6                 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
+    paddd          m4, m6                 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
+    psrad          m0, 9
+    psrad          m2, 9
+    psrad          m1, 9
+    psrad          m4, 9
+    packssdw       m0, m1                 ; (word) { U[0-7] }
+    packssdw       m2, m4                 ; (word) { V[0-7] }
+%if mmsize == 8
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%else ; mmsize == 16
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%endif ; mmsize == 8/16
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; ARCH_X86_64 && %0 == 3
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB24_FUNCS 2
+RGB24_TO_Y_FN %1, rgb
+RGB24_TO_Y_FN %1, bgr, rgb
+RGB24_TO_UV_FN %2, rgb
+RGB24_TO_UV_FN %2, bgr, rgb
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB24_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB24_FUNCS 10, 12
+
+INIT_XMM ssse3
+RGB24_FUNCS 11, 13
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+RGB24_FUNCS 11, 13
+%endif
+
+; %1 = nr. of XMM registers
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_Y_FN 5-6
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
+    mova           m5, [rgba_Ycoeff_%2%4]
+    mova           m6, [rgba_Ycoeff_%3%5]
+%if %0 == 6
+    jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
+%else ; %0 == 6
+.body:
+%if ARCH_X86_64
+    movsxd         wq, wd
+%endif
+    add            wq, wq
+    sub            wq, mmsize - 1
+    lea          srcq, [srcq+wq*2]
+    add          dstq, wq
+    neg            wq
+    mova           m4, [rgb_Yrnd]
+    pcmpeqb        m7, m7
+    psrlw          m7, 8                  ; (word) { 0x00ff } x4
+.loop:
+    ; FIXME check alignment and use mova
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
+    pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
+    pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
+    pmaddwd        m3, m5                 ; (dword) { Bx*BY + Rx*RY }[4-7]
+    pmaddwd        m2, m6                 ; (dword) { Gx*GY }[4-7]
+    paddd          m0, m4                 ; += rgb_Yrnd
+    paddd          m2, m4                 ; += rgb_Yrnd
+    paddd          m0, m1                 ; (dword) { Y[0-3] }
+    paddd          m2, m3                 ; (dword) { Y[4-7] }
+    psrad          m0, 9
+    psrad          m2, 9
+    packssdw       m0, m2                 ; (word) { Y[0-7] }
+    mova    [dstq+wq], m0
+    add            wq, mmsize
+    jl .loop
+    sub            wq, mmsize - 1
+    jz .end
+    add            srcq, 2*mmsize - 2
+    add            dstq, mmsize - 1
+.loop2:
+    movd           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
+    pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
+    pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
+    paddd          m0, m4                 ; += rgb_Yrnd
+    paddd          m0, m1                 ; (dword) { Y[0-3] }
+    psrad          m0, 9
+    packssdw       m0, m0                 ; (word) { Y[0-7] }
+    movd    [dstq+wq], m0
+    add            wq, 2
+    jl .loop2
+.end:
+    REP_RET
+%endif ; %0 == 3
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_UV_FN 5-6
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
+%if ARCH_X86_64
+    mova           m8, [rgba_Ucoeff_%2%4]
+    mova           m9, [rgba_Ucoeff_%3%5]
+    mova          m10, [rgba_Vcoeff_%2%4]
+    mova          m11, [rgba_Vcoeff_%3%5]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [rgba_Ucoeff_%2%4]
+%define coeffU2 [rgba_Ucoeff_%3%5]
+%define coeffV1 [rgba_Vcoeff_%2%4]
+%define coeffV2 [rgba_Vcoeff_%3%5]
+%endif ; x86-64/32
+%if ARCH_X86_64 && %0 == 6
+    jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
+%else ; ARCH_X86_64 && %0 == 6
+.body:
+%if ARCH_X86_64
+    movsxd         wq, dword r5m
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add            wq, wq
+    sub            wq, mmsize - 1
+    add         dstUq, wq
+    add         dstVq, wq
+    lea          srcq, [srcq+wq*2]
+    neg            wq
+    pcmpeqb        m7, m7
+    psrlw          m7, 8                  ; (word) { 0x00ff } x4
+    mova           m6, [rgb_UVrnd]
+.loop:
+    ; FIXME check alignment and use mova
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
+    pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
+    pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
+    pmaddwd        m1, coeffU1            ; (dword) { Bx*BU + Rx*RU }[0-3]
+    pmaddwd        m0, coeffU2            ; (dword) { Gx*GU }[0-3]
+    paddd          m3, m6                 ; += rgb_UVrnd
+    paddd          m1, m6                 ; += rgb_UVrnd
+    paddd          m2, m3                 ; (dword) { V[0-3] }
+    paddd          m0, m1                 ; (dword) { U[0-3] }
+    pmaddwd        m3, m5, coeffV1        ; (dword) { Bx*BV + Rx*RV }[4-7]
+    pmaddwd        m1, m4, coeffV2        ; (dword) { Gx*GV }[4-7]
+    pmaddwd        m5, coeffU1            ; (dword) { Bx*BU + Rx*RU }[4-7]
+    pmaddwd        m4, coeffU2            ; (dword) { Gx*GU }[4-7]
+    paddd          m3, m6                 ; += rgb_UVrnd
+    paddd          m5, m6                 ; += rgb_UVrnd
+    psrad          m0, 9
+    paddd          m1, m3                 ; (dword) { V[4-7] }
+    paddd          m4, m5                 ; (dword) { U[4-7] }
+    psrad          m2, 9
+    psrad          m4, 9
+    psrad          m1, 9
+    packssdw       m0, m4                 ; (word) { U[0-7] }
+    packssdw       m2, m1                 ; (word) { V[0-7] }
+%if mmsize == 8
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%else ; mmsize == 16
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%endif ; mmsize == 8/16
+    add            wq, mmsize
+    jl .loop
+    sub            wq, mmsize - 1
+    jz .end
+    add            srcq , 2*mmsize - 2
+    add            dstUq, mmsize - 1
+    add            dstVq, mmsize - 1
+.loop2:
+    movd           m0, [srcq+wq*2]        ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
+    pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
+    pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
+    pmaddwd        m1, coeffU1            ; (dword) { Bx*BU + Rx*RU }[0-3]
+    pmaddwd        m0, coeffU2            ; (dword) { Gx*GU }[0-3]
+    paddd          m3, m6                 ; += rgb_UVrnd
+    paddd          m1, m6                 ; += rgb_UVrnd
+    paddd          m2, m3                 ; (dword) { V[0-3] }
+    paddd          m0, m1                 ; (dword) { U[0-3] }
+    psrad          m0, 9
+    psrad          m2, 9
+    packssdw       m0, m0                 ; (word) { U[0-7] }
+    packssdw       m2, m2                 ; (word) { V[0-7] }
+    movd   [dstUq+wq], m0
+    movd   [dstVq+wq], m2
+    add            wq, 2
+    jl .loop2
+.end:
+    REP_RET
+%endif ; ARCH_X86_64 && %0 == 3
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB32_FUNCS 2
+RGB32_TO_Y_FN %1, r, g, b, a
+RGB32_TO_Y_FN %1, b, g, r, a, rgba
+RGB32_TO_Y_FN %1, a, r, g, b, rgba
+RGB32_TO_Y_FN %1, a, b, g, r, rgba
+
+RGB32_TO_UV_FN %2, r, g, b, a
+RGB32_TO_UV_FN %2, b, g, r, a, rgba
+RGB32_TO_UV_FN %2, a, r, g, b, rgba
+RGB32_TO_UV_FN %2, a, b, g, r, rgba
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB32_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB32_FUNCS 8, 12
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+RGB32_FUNCS 8, 12
+%endif
+
+;-----------------------------------------------------------------------------
+; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
+;
+; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
+; and
+; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
+;                      const uint8_t *unused, int w);
+;-----------------------------------------------------------------------------
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_Y 2
+.loop_%1:
+    mov%1          m0, [srcq+wq*2]        ; (byte) { Y0, U0, Y1, V0, ... }
+    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+%ifidn %2, yuyv
+    pand           m0, m2                 ; (word) { Y0, Y1, ..., Y7 }
+    pand           m1, m2                 ; (word) { Y8, Y9, ..., Y15 }
+%else ; uyvy
+    psrlw          m0, 8                  ; (word) { Y0, Y1, ..., Y7 }
+    psrlw          m1, 8                  ; (word) { Y8, Y9, ..., Y15 }
+%endif ; yuyv/uyvy
+    packuswb       m0, m1                 ; (byte) { Y0, ..., Y15 }
+    mova    [dstq+wq], m0
+    add            wq, mmsize
+    jl .loop_%1
+    REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = if specified, it means that unaligned and aligned code in loop
+;      will be the same (i.e. YUYV+AVX), and thus we don't need to
+;      split the loop in an aligned and unaligned case
+%macro YUYV_TO_Y_FN 2-3
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
+%if ARCH_X86_64
+    movsxd         wq, wd
+%endif
+    add          dstq, wq
+%if mmsize == 16
+    test         srcq, 15
+%endif
+    lea          srcq, [srcq+wq*2]
+%ifidn %2, yuyv
+    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
+    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
+%endif ; yuyv
+%if mmsize == 16
+    jnz .loop_u_start
+    neg            wq
+    LOOP_YUYV_TO_Y  a, %2
+.loop_u_start:
+    neg            wq
+    LOOP_YUYV_TO_Y  u, %2
+%else ; mmsize == 8
+    neg            wq
+    LOOP_YUYV_TO_Y  a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_UV 2
+.loop_%1:
+%ifidn %2, yuyv
+    mov%1          m0, [srcq+wq*4]        ; (byte) { Y0, U0, Y1, V0, ... }
+    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+    psrlw          m0, 8                  ; (word) { U0, V0, ..., U3, V3 }
+    psrlw          m1, 8                  ; (word) { U4, V4, ..., U7, V7 }
+%else ; uyvy
+%if cpuflag(avx)
+    vpand          m0, m2, [srcq+wq*4]        ; (word) { U0, V0, ..., U3, V3 }
+    vpand          m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
+%else
+    mov%1          m0, [srcq+wq*4]        ; (byte) { Y0, U0, Y1, V0, ... }
+    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+    pand           m0, m2                 ; (word) { U0, V0, ..., U3, V3 }
+    pand           m1, m2                 ; (word) { U4, V4, ..., U7, V7 }
+%endif
+%endif ; yuyv/uyvy
+    packuswb       m0, m1                 ; (byte) { U0, V0, ..., U7, V7 }
+    pand           m1, m0, m2             ; (word) { U0, U1, ..., U7 }
+    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
+%if mmsize == 16
+    packuswb       m1, m0                 ; (byte) { U0, ... U7, V1, ... V7 }
+    movh   [dstUq+wq], m1
+    movhps [dstVq+wq], m1
+%else ; mmsize == 8
+    packuswb       m1, m1                 ; (byte) { U0, ... U3 }
+    packuswb       m0, m0                 ; (byte) { V0, ... V3 }
+    movh   [dstUq+wq], m1
+    movh   [dstVq+wq], m0
+%endif ; mmsize == 8/16
+    add            wq, mmsize / 2
+    jl .loop_%1
+    REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = if specified, it means that unaligned and aligned code in loop
+;      will be the same (i.e. UYVY+AVX), and thus we don't need to
+;      split the loop in an aligned and unaligned case
+%macro YUYV_TO_UV_FN 2-3
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%if ARCH_X86_64
+    movsxd         wq, dword r5m
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add         dstUq, wq
+    add         dstVq, wq
+%if mmsize == 16 && %0 == 2
+    test         srcq, 15
+%endif
+    lea          srcq, [srcq+wq*4]
+    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
+    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
+    ; NOTE: if uyvy+avx, u/a are identical
+%if mmsize == 16 && %0 == 2
+    jnz .loop_u_start
+    neg            wq
+    LOOP_YUYV_TO_UV a, %2
+.loop_u_start:
+    neg            wq
+    LOOP_YUYV_TO_UV u, %2
+%else ; mmsize == 8
+    neg            wq
+    LOOP_YUYV_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = nv12 or nv21
+%macro LOOP_NVXX_TO_UV 2
+.loop_%1:
+    mov%1          m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... }
+    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
+    pand           m2, m0, m5             ; (word) { U0, U1, ..., U7 }
+    pand           m3, m1, m5             ; (word) { U8, U9, ..., U15 }
+    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
+    psrlw          m1, 8                  ; (word) { V8, V9, ..., V15 }
+    packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
+    packuswb       m0, m1                 ; (byte) { V0, ..., V15 }
+%ifidn %2, nv12
+    mova   [dstUq+wq], m2
+    mova   [dstVq+wq], m0
+%else ; nv21
+    mova   [dstVq+wq], m2
+    mova   [dstUq+wq], m0
+%endif ; nv12/21
+    add            wq, mmsize
+    jl .loop_%1
+    REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = nv12 or nv21
+%macro NVXX_TO_UV_FN 2
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%if ARCH_X86_64
+    movsxd         wq, dword r5m
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add         dstUq, wq
+    add         dstVq, wq
+%if mmsize == 16
+    test         srcq, 15
+%endif
+    lea          srcq, [srcq+wq*2]
+    pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
+    psrlw          m5, 8                  ; (word) { 0x00ff } x 8
+%if mmsize == 16
+    jnz .loop_u_start
+    neg            wq
+    LOOP_NVXX_TO_UV a, %2
+.loop_u_start:
+    neg            wq
+    LOOP_NVXX_TO_UV u, %2
+%else ; mmsize == 8
+    neg            wq
+    LOOP_NVXX_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+YUYV_TO_Y_FN  0, yuyv
+YUYV_TO_Y_FN  0, uyvy
+YUYV_TO_UV_FN 0, yuyv
+YUYV_TO_UV_FN 0, uyvy
+NVXX_TO_UV_FN 0, nv12
+NVXX_TO_UV_FN 0, nv21
+%endif
+
+INIT_XMM sse2
+YUYV_TO_Y_FN  3, yuyv
+YUYV_TO_Y_FN  2, uyvy
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
+; that's not faster in practice
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy, 1
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
+%endif
--- a/externals/ffmpeg/libswscale/x86/output.asm
+++ b/externals/ffmpeg/libswscale/x86/output.asm
@@ -0,0 +1,425 @@
+;******************************************************************************
+;* x86-optimized vertical line scaling functions
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*                    Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+minshort:      times 8 dw 0x8000
+yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
+yuv2yuvX_10_start:  times 4 dd 0x10000
+yuv2yuvX_9_start:   times 4 dd 0x20000
+yuv2yuvX_10_upper:  times 8 dw 0x3ff
+yuv2yuvX_9_upper:   times 8 dw 0x1ff
+pd_4:          times 4 dd 4
+pd_4min0x40000:times 4 dd 4 - (0x40000)
+pw_16:         times 8 dw 16
+pw_32:         times 8 dw 32
+pw_512:        times 8 dw 512
+pw_1024:       times 8 dw 1024
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; vertical line scaling
+;
+; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+; and
+; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
+;                                     const int16_t **src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+;
+; Scale one or $filterSize lines of source data to generate one line of output
+; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
+; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
+; of 2. $offset is either 0 or 3. $dither holds 8 values.
+;-----------------------------------------------------------------------------
+%macro yuv2planeX_mainloop 2
+.pixelloop_%2:
+%assign %%i 0
+    ; the rep here is for the 8-bit output MMX case, where dither covers
+    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
+    ; pixels per iteration. In order to not have to keep track of where
+    ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
+%if %1 == 8
+%assign %%repcnt 16/mmsize
+%else
+%assign %%repcnt 1
+%endif
+
+%rep %%repcnt
+
+%if %1 == 8
+%if ARCH_X86_32
+    mova            m2, [rsp+mmsize*(0+%%i)]
+    mova            m1, [rsp+mmsize*(1+%%i)]
+%else ; x86-64
+    mova            m2,  m8
+    mova            m1,  m_dith
+%endif ; x86-32/64
+%else ; %1 == 9/10/16
+    mova            m1, [yuv2yuvX_%1_start]
+    mova            m2,  m1
+%endif ; %1 == 8/9/10/16
+    movsx     cntr_reg,  fltsizem
+.filterloop_%2_ %+ %%i:
+    ; input pixels
+    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
+%if %1 == 16
+    mova            m3, [r6+r5*4]
+    mova            m5, [r6+r5*4+mmsize]
+%else ; %1 == 8/9/10
+    mova            m3, [r6+r5*2]
+%endif ; %1 == 8/9/10/16
+    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
+%if %1 == 16
+    mova            m4, [r6+r5*4]
+    mova            m6, [r6+r5*4+mmsize]
+%else ; %1 == 8/9/10
+    mova            m4, [r6+r5*2]
+%endif ; %1 == 8/9/10/16
+
+    ; coefficients
+    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
+%if %1 == 16
+    pshuflw         m7,  m0,  0          ; coeff[0]
+    pshuflw         m0,  m0,  0x55       ; coeff[1]
+    pmovsxwd        m7,  m7              ; word -> dword
+    pmovsxwd        m0,  m0              ; word -> dword
+
+    pmulld          m3,  m7
+    pmulld          m5,  m7
+    pmulld          m4,  m0
+    pmulld          m6,  m0
+
+    paddd           m2,  m3
+    paddd           m1,  m5
+    paddd           m2,  m4
+    paddd           m1,  m6
+%else ; %1 == 10/9/8
+    punpcklwd       m5,  m3,  m4
+    punpckhwd       m3,  m4
+    SPLATD          m0
+
+    pmaddwd         m5,  m0
+    pmaddwd         m3,  m0
+
+    paddd           m2,  m5
+    paddd           m1,  m3
+%endif ; %1 == 8/9/10/16
+
+    sub       cntr_reg,  2
+    jg .filterloop_%2_ %+ %%i
+
+%if %1 == 16
+    psrad           m2,  31 - %1
+    psrad           m1,  31 - %1
+%else ; %1 == 10/9/8
+    psrad           m2,  27 - %1
+    psrad           m1,  27 - %1
+%endif ; %1 == 8/9/10/16
+
+%if %1 == 8
+    packssdw        m2,  m1
+    packuswb        m2,  m2
+    movh   [dstq+r5*1],  m2
+%else ; %1 == 9/10/16
+%if %1 == 16
+    packssdw        m2,  m1
+    paddw           m2, [minshort]
+%else ; %1 == 9/10
+%if cpuflag(sse4)
+    packusdw        m2,  m1
+%else ; mmxext/sse2
+    packssdw        m2,  m1
+    pmaxsw          m2,  m6
+%endif ; mmxext/sse2/sse4/avx
+    pminsw          m2, [yuv2yuvX_%1_upper]
+%endif ; %1 == 9/10/16
+    mov%2   [dstq+r5*2],  m2
+%endif ; %1 == 8/9/10/16
+
+    add             r5,  mmsize/2
+    sub             wd,  mmsize/2
+
+%assign %%i %%i+2
+%endrep
+    jg .pixelloop_%2
+%endmacro
+
+%macro yuv2planeX_fn 3
+
+%if ARCH_X86_32
+%define cntr_reg fltsizeq
+%define movsx mov
+%else
+%define cntr_reg r7
+%define movsx movsxd
+%endif
+
+cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
+%if %1 == 8 || %1 == 9 || %1 == 10
+    pxor            m6,  m6
+%endif ; %1 == 8/9/10
+
+%if %1 == 8
+%if ARCH_X86_32
+%assign pad 0x2c - (stack_offset & 15)
+    SUB             rsp, pad
+%define m_dith m7
+%else ; x86-64
+%define m_dith m9
+%endif ; x86-32
+
+    ; create registers holding dither
+    movq        m_dith, [ditherq]        ; dither
+    test        offsetd, offsetd
+    jz              .no_rot
+%if mmsize == 16
+    punpcklqdq  m_dith,  m_dith
+%endif ; mmsize == 16
+    PALIGNR     m_dith,  m_dith,  3,  m0
+.no_rot:
+%if mmsize == 16
+    punpcklbw   m_dith,  m6
+%if ARCH_X86_64
+    punpcklwd       m8,  m_dith,  m6
+    pslld           m8,  12
+%else ; x86-32
+    punpcklwd       m5,  m_dith,  m6
+    pslld           m5,  12
+%endif ; x86-32/64
+    punpckhwd   m_dith,  m6
+    pslld       m_dith,  12
+%if ARCH_X86_32
+    mova      [rsp+ 0],  m5
+    mova      [rsp+16],  m_dith
+%endif
+%else ; mmsize == 8
+    punpcklbw       m5,  m_dith,  m6
+    punpckhbw   m_dith,  m6
+    punpcklwd       m4,  m5,  m6
+    punpckhwd       m5,  m6
+    punpcklwd       m3,  m_dith,  m6
+    punpckhwd   m_dith,  m6
+    pslld           m4,  12
+    pslld           m5,  12
+    pslld           m3,  12
+    pslld       m_dith,  12
+    mova      [rsp+ 0],  m4
+    mova      [rsp+ 8],  m5
+    mova      [rsp+16],  m3
+    mova      [rsp+24],  m_dith
+%endif ; mmsize == 8/16
+%endif ; %1 == 8
+
+    xor             r5,  r5
+
+%if mmsize == 8 || %1 == 8
+    yuv2planeX_mainloop %1, a
+%else ; mmsize == 16
+    test          dstq, 15
+    jnz .unaligned
+    yuv2planeX_mainloop %1, a
+    REP_RET
+.unaligned:
+    yuv2planeX_mainloop %1, u
+%endif ; mmsize == 8/16
+
+%if %1 == 8
+%if ARCH_X86_32
+    ADD             rsp, pad
+    RET
+%else ; x86-64
+    REP_RET
+%endif ; x86-32/64
+%else ; %1 == 9/10/16
+    REP_RET
+%endif ; %1 == 8/9/10/16
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+yuv2planeX_fn  8,  0, 7
+yuv2planeX_fn  9,  0, 5
+yuv2planeX_fn 10,  0, 5
+%endif
+
+INIT_XMM sse2
+yuv2planeX_fn  8, 10, 7
+yuv2planeX_fn  9,  7, 5
+yuv2planeX_fn 10,  7, 5
+
+INIT_XMM sse4
+yuv2planeX_fn  8, 10, 7
+yuv2planeX_fn  9,  7, 5
+yuv2planeX_fn 10,  7, 5
+yuv2planeX_fn 16,  8, 5
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+yuv2planeX_fn  8, 10, 7
+yuv2planeX_fn  9,  7, 5
+yuv2planeX_fn 10,  7, 5
+%endif
+
+; %1=outout-bpc, %2=alignment (u/a)
+%macro yuv2plane1_mainloop 2
+.loop_%2:
+%if %1 == 8
+    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
+    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
+    psraw           m0, 7
+    psraw           m1, 7
+    packuswb        m0, m1
+    mov%2    [dstq+wq], m0
+%elif %1 == 16
+    paddd           m0, m4, [srcq+wq*4+mmsize*0]
+    paddd           m1, m4, [srcq+wq*4+mmsize*1]
+    paddd           m2, m4, [srcq+wq*4+mmsize*2]
+    paddd           m3, m4, [srcq+wq*4+mmsize*3]
+    psrad           m0, 3
+    psrad           m1, 3
+    psrad           m2, 3
+    psrad           m3, 3
+%if cpuflag(sse4) ; avx/sse4
+    packusdw        m0, m1
+    packusdw        m2, m3
+%else ; mmx/sse2
+    packssdw        m0, m1
+    packssdw        m2, m3
+    paddw           m0, m5
+    paddw           m2, m5
+%endif ; mmx/sse2/sse4/avx
+    mov%2    [dstq+wq*2+mmsize*0], m0
+    mov%2    [dstq+wq*2+mmsize*1], m2
+%else ; %1 == 9/10
+    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
+    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
+    psraw           m0, 15 - %1
+    psraw           m1, 15 - %1
+    pmaxsw          m0, m4
+    pmaxsw          m1, m4
+    pminsw          m0, m3
+    pminsw          m1, m3
+    mov%2    [dstq+wq*2+mmsize*0], m0
+    mov%2    [dstq+wq*2+mmsize*1], m1
+%endif
+    add             wq, mmsize
+    jl .loop_%2
+%endmacro
+
+%macro yuv2plane1_fn 3
+cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
+    movsxdifnidn    wq, wd
+    add             wq, mmsize - 1
+    and             wq, ~(mmsize - 1)
+%if %1 == 8
+    add           dstq, wq
+%else ; %1 != 8
+    lea           dstq, [dstq+wq*2]
+%endif ; %1 == 8
+%if %1 == 16
+    lea           srcq, [srcq+wq*4]
+%else ; %1 != 16
+    lea           srcq, [srcq+wq*2]
+%endif ; %1 == 16
+    neg             wq
+
+%if %1 == 8
+    pxor            m4, m4               ; zero
+
+    ; create registers holding dither
+    movq            m3, [ditherq]        ; dither
+    test       offsetd, offsetd
+    jz              .no_rot
+%if mmsize == 16
+    punpcklqdq      m3, m3
+%endif ; mmsize == 16
+    PALIGNR         m3, m3, 3, m2
+.no_rot:
+%if mmsize == 8
+    mova            m2, m3
+    punpckhbw       m3, m4               ; byte->word
+    punpcklbw       m2, m4               ; byte->word
+%else
+    punpcklbw       m3, m4
+    mova            m2, m3
+%endif
+%elif %1 == 9
+    pxor            m4, m4
+    mova            m3, [pw_512]
+    mova            m2, [pw_32]
+%elif %1 == 10
+    pxor            m4, m4
+    mova            m3, [pw_1024]
+    mova            m2, [pw_16]
+%else ; %1 == 16
+%if cpuflag(sse4) ; sse4/avx
+    mova            m4, [pd_4]
+%else ; mmx/sse2
+    mova            m4, [pd_4min0x40000]
+    mova            m5, [minshort]
+%endif ; mmx/sse2/sse4/avx
+%endif ; %1 == ..
+
+    ; actual pixel scaling
+%if mmsize == 8
+    yuv2plane1_mainloop %1, a
+%else ; mmsize == 16
+    test          dstq, 15
+    jnz .unaligned
+    yuv2plane1_mainloop %1, a
+    REP_RET
+.unaligned:
+    yuv2plane1_mainloop %1, u
+%endif ; mmsize == 8/16
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+yuv2plane1_fn  8, 0, 5
+yuv2plane1_fn 16, 0, 3
+
+INIT_MMX mmxext
+yuv2plane1_fn  9, 0, 3
+yuv2plane1_fn 10, 0, 3
+%endif
+
+INIT_XMM sse2
+yuv2plane1_fn  8, 5, 5
+yuv2plane1_fn  9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 6, 3
+
+INIT_XMM sse4
+yuv2plane1_fn 16, 5, 3
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+yuv2plane1_fn  8, 5, 5
+yuv2plane1_fn  9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 5, 3
+%endif
--- a/externals/ffmpeg/libswscale/x86/rgb2rgb.c
+++ b/externals/ffmpeg/libswscale/x86/rgb2rgb.c
@@ -0,0 +1,200 @@
+/*
+ * software RGB to RGB converter
+ * pluralize by software PAL8 to RGB converter
+ *              software YUV to YUV converter
+ *              software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+#include "libavutil/bswap.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#if HAVE_INLINE_ASM
+
+DECLARE_ASM_CONST(8, uint64_t, mmx_ff)       = 0x00000000000000FFULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_one)      = 0xFFFFFFFFFFFFFFFFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32b)      = 0x000000FF000000FFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32g)      = 0x0000FF000000FF00ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32r)      = 0x00FF000000FF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32)       = 0x00FFFFFF00FFFFFFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24hh)     = 0xffff000000000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24hhh)    = 0xffffffff00000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24hhhh)   = 0xffffffffffff0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
+DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
+DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
+#define mask16b mask15b
+DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
+DECLARE_ASM_CONST(8, uint64_t, red_16mask)   = 0x0000f8000000f800ULL;
+DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
+DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
+DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
+DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;
+
+DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
+DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
+DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
+
+#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
+#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
+#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
+#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
+#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
+#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
+#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
+#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
+#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
+
+// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one.
+
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_AMD3DNOW 0
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AVX 0
+
+//MMX versions
+#undef RENAME
+#define RENAME(a) a ## _mmx
+#include "rgb2rgb_template.c"
+
+// MMXEXT versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _mmxext
+#include "rgb2rgb_template.c"
+
+//SSE2 versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_SSE2
+#define COMPILE_TEMPLATE_SSE2 1
+#define RENAME(a) a ## _sse2
+#include "rgb2rgb_template.c"
+
+//AVX versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_AVX
+#define COMPILE_TEMPLATE_AVX 1
+#define RENAME(a) a ## _avx
+#include "rgb2rgb_template.c"
+
+//3DNOW versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_AVX
+#undef COMPILE_TEMPLATE_AMD3DNOW
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AVX 0
+#define COMPILE_TEMPLATE_AMD3DNOW 1
+#define RENAME(a) a ## _3dnow
+#include "rgb2rgb_template.c"
+
+/*
+ RGB15->RGB16 original by Strepto/Astral
+ ported to gcc & bugfixed : A'rpi
+ MMXEXT, 3DNOW optimization by Nick Kurshev
+ 32-bit C version, and and&add trick by Michael Niedermayer
+*/
+
+#endif /* HAVE_INLINE_ASM */
+
+void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+
+#if ARCH_X86_64
+void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                         const uint8_t *src, int width, int height,
+                         int lumStride, int chromStride, int srcStride);
+#endif
+
+av_cold void rgb2rgb_init_x86(void)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM
+    if (INLINE_MMX(cpu_flags))
+        rgb2rgb_init_mmx();
+    if (INLINE_AMD3DNOW(cpu_flags))
+        rgb2rgb_init_3dnow();
+    if (INLINE_MMXEXT(cpu_flags))
+        rgb2rgb_init_mmxext();
+    if (INLINE_SSE2(cpu_flags))
+        rgb2rgb_init_sse2();
+    if (INLINE_AVX(cpu_flags))
+        rgb2rgb_init_avx();
+#endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+#if ARCH_X86_64
+        uyvytoyuv422 = ff_uyvytoyuv422_sse2;
+#endif
+    }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+#if ARCH_X86_64
+        uyvytoyuv422 = ff_uyvytoyuv422_avx;
+#endif
+    }
+}
--- a/externals/ffmpeg/libswscale/x86/rgb2rgb_template.c
+++ b/externals/ffmpeg/libswscale/x86/rgb2rgb_template.c
--- a/externals/ffmpeg/libswscale/x86/rgb_2_rgb.asm
+++ b/externals/ffmpeg/libswscale/x86/rgb_2_rgb.asm
@@ -0,0 +1,300 @@
+;******************************************************************************
+;* Copyright Nick Kurshev
+;* Copyright Michael (michaelni@gmx.at)
+;* Copyright 2018 Jokyo Images
+;* Copyright Ivo van Poorten
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_mask_shuffle2103_mmx times 8 dw 255
+pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
+pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
+pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
+pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+SECTION .text
+
+%macro RSHIFT_COPY 3
+; %1 dst ; %2 src ; %3 shift
+%if cpuflag(avx)
+    psrldq  %1, %2, %3
+%else
+    mova           %1, %2
+    RSHIFT         %1, %3
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
+;------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
+    mova   m6, [pb_mask_shuffle2103_mmx]
+    mova   m7, m6
+    psllq  m7, 8
+
+    movsxdifnidn wq, wd
+    mov xq, wq
+
+    add        srcq, wq
+    add        dstq, wq
+    neg          wq
+
+;calc scalar loop
+    and xq, mmsize*2 -4
+    je .loop_simd
+
+.loop_scalar:
+   mov          tmpb, [srcq + wq + 2]
+   mov [dstq+wq + 0], tmpb
+   mov          tmpb, [srcq + wq + 1]
+   mov [dstq+wq + 1], tmpb
+   mov          tmpb, [srcq + wq + 0]
+   mov [dstq+wq + 2], tmpb
+   mov          tmpb, [srcq + wq + 3]
+   mov [dstq+wq + 3], tmpb
+   add            wq, 4
+   sub            xq, 4
+   jg .loop_scalar
+
+;check if src_size < mmsize * 2
+cmp wq, 0
+jge .end
+
+.loop_simd:
+    movu     m0, [srcq+wq]
+    movu     m1, [srcq+wq+8]
+
+    pshufw   m3, m0, 177
+    pshufw   m5, m1, 177
+
+    pand     m0, m7
+    pand     m3, m6
+
+    pand     m1, m7
+    pand     m5, m6
+
+    por      m0, m3
+    por      m1, m5
+
+    movu      [dstq+wq], m0
+    movu  [dstq+wq + 8], m1
+
+    add              wq, mmsize*2
+    jl .loop_simd
+
+.end:
+    RET
+
+;------------------------------------------------------------------------------
+; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
+;------------------------------------------------------------------------------
+; %1-4 index shuffle
+%macro SHUFFLE_BYTES 4
+cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
+    VBROADCASTI128    m0, [pb_shuffle%1%2%3%4]
+    movsxdifnidn wq, wd
+    mov xq, wq
+
+    add        srcq, wq
+    add        dstq, wq
+    neg          wq
+
+;calc scalar loop
+    and xq, mmsize-4
+    je .loop_simd
+
+.loop_scalar:
+   mov          tmpb, [srcq + wq + %1]
+   mov [dstq+wq + 0], tmpb
+   mov          tmpb, [srcq + wq + %2]
+   mov [dstq+wq + 1], tmpb
+   mov          tmpb, [srcq + wq + %3]
+   mov [dstq+wq + 2], tmpb
+   mov          tmpb, [srcq + wq + %4]
+   mov [dstq+wq + 3], tmpb
+   add            wq, 4
+   sub            xq, 4
+   jg .loop_scalar
+
+;check if src_size < mmsize
+cmp wq, 0
+jge .end
+
+.loop_simd:
+    movu           m1, [srcq+wq]
+    pshufb         m1, m0
+    movu    [dstq+wq], m1
+    add            wq, mmsize
+    jl .loop_simd
+
+.end:
+    RET
+%endmacro
+
+INIT_XMM ssse3
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
+
+;-----------------------------------------------------------------------------------------------
+; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+;              const uint8_t *src, int width, int height,
+;              int lumStride, int chromStride, int srcStride)
+;-----------------------------------------------------------------------------------------------
+%macro UYVY_TO_YUV422 0
+cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+    pxor         m0, m0
+    pcmpeqw      m1, m1
+    psrlw        m1, 8
+
+    movsxdifnidn            wq, wd
+    movsxdifnidn   lum_strideq, lum_strided
+    movsxdifnidn chrom_strideq, chrom_strided
+    movsxdifnidn   src_strideq, src_strided
+
+    mov     back_wq, wq
+    mov      whalfq, wq
+    shr      whalfq, 1     ; whalf = width / 2
+
+    lea srcq, [srcq + wq * 2]
+    add    ydstq, wq
+    add    udstq, whalfq
+    add    vdstq, whalfq
+
+.loop_line:
+    mov          xq, wq
+    mov       wtwoq, wq
+    add       wtwoq, wtwoq ; wtwo = width * 2
+
+    neg       wq
+    neg    wtwoq
+    neg   whalfq
+
+    ;calc scalar loop count
+    and       xq, mmsize * 2 - 1
+    je .loop_simd
+
+    .loop_scalar:
+        mov             tmpb, [srcq + wtwoq + 0]
+        mov [udstq + whalfq], tmpb
+
+        mov             tmpb, [srcq + wtwoq + 1]
+        mov     [ydstq + wq], tmpb
+
+        mov             tmpb, [srcq + wtwoq + 2]
+        mov [vdstq + whalfq], tmpb
+
+        mov             tmpb, [srcq + wtwoq + 3]
+        mov [ydstq + wq + 1], tmpb
+
+        add      wq, 2
+        add   wtwoq, 4
+        add  whalfq, 1
+        sub      xq, 2
+        jg .loop_scalar
+
+    ; check if simd loop is need
+    cmp      wq, 0
+    jge .end_line
+
+    .loop_simd:
+        movu    m2, [srcq + wtwoq             ]
+        movu    m3, [srcq + wtwoq + mmsize    ]
+        movu    m4, [srcq + wtwoq + mmsize * 2]
+        movu    m5, [srcq + wtwoq + mmsize * 3]
+
+        ; extract y part 1
+        RSHIFT_COPY    m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m6, m1; YxYx YxYx...
+
+        RSHIFT_COPY    m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m7, m1 ; YxYx YxYx...
+
+        packuswb       m6, m7 ; YYYY YYYY...
+        movu [ydstq + wq], m6
+
+        ; extract y part 2
+        RSHIFT_COPY    m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m6, m1; YxYx YxYx...
+
+        RSHIFT_COPY    m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m7, m1 ; YxYx YxYx...
+
+        packuswb                m6, m7 ; YYYY YYYY...
+        movu [ydstq + wq + mmsize], m6
+
+        ; extract uv
+        pand       m2, m1   ; UxVx...
+        pand       m3, m1   ; UxVx...
+        pand       m4, m1   ; UxVx...
+        pand       m5, m1   ; UxVx...
+
+        packuswb   m2, m3   ; UVUV...
+        packuswb   m4, m5   ; UVUV...
+
+        ; U
+        pand       m6, m2, m1 ; UxUx...
+        pand       m7, m4, m1 ; UxUx...
+
+        packuswb m6, m7 ; UUUU
+        movu   [udstq + whalfq], m6
+
+
+        ; V
+        psrlw      m2, 8  ; VxVx...
+        psrlw      m4, 8  ; VxVx...
+        packuswb   m2, m4 ; VVVV
+        movu   [vdstq + whalfq], m2
+
+        add   whalfq, mmsize
+        add    wtwoq, mmsize * 4
+        add       wq, mmsize * 2
+        jl .loop_simd
+
+    .end_line:
+        add        srcq, src_strideq
+        add        ydstq, lum_strideq
+        add        udstq, chrom_strideq
+        add        vdstq, chrom_strideq
+
+        ;restore initial state of line variable
+        mov           wq, back_wq
+        mov          xq, wq
+        mov      whalfq, wq
+        shr      whalfq, 1     ; whalf = width / 2
+        sub          hd, 1
+        jg .loop_line
+
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse2
+UYVY_TO_YUV422
+
+INIT_XMM avx
+UYVY_TO_YUV422
+%endif
--- a/externals/ffmpeg/libswscale/x86/scale.asm
+++ b/externals/ffmpeg/libswscale/x86/scale.asm
@@ -0,0 +1,423 @@
+;******************************************************************************
+;* x86-optimized horizontal line scaling functions
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+max_19bit_int: times 4 dd 0x7ffff
+max_19bit_flt: times 4 dd 524287.0
+minshort:      times 8 dw 0x8000
+unicoeff:      times 4 dd 0x20000000
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; horizontal line scaling
+;
+; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
+;                               (SwsContext *c, int{16,32}_t *dst,
+;                                int dstW, const uint{8,16}_t *src,
+;                                const int16_t *filter,
+;                                const int32_t *filterPos, int filterSize);
+;
+; Scale one horizontal line. Input is either 8-bit width or 16-bit width
+; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
+; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
+; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
+; output pixel is generated from $filterSize input pixels, the position of
+; the first pixel is given in filterPos[nOutputPixel].
+;-----------------------------------------------------------------------------
+
+; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
+%macro SCALE_FUNC 6
+%ifnidn %3, X
+cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
+%else
+cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
+%endif
+%if ARCH_X86_64
+    movsxd        wq, wd
+%define mov32 movsxd
+%else ; x86-32
+%define mov32 mov
+%endif ; x86-64
+%if %2 == 19
+%if mmsize == 8 ; mmx
+    mova          m2, [max_19bit_int]
+%elif cpuflag(sse4)
+    mova          m2, [max_19bit_int]
+%else ; ssse3/sse2
+    mova          m2, [max_19bit_flt]
+%endif ; mmx/sse2/ssse3/sse4
+%endif ; %2 == 19
+%if %1 == 16
+    mova          m6, [minshort]
+    mova          m7, [unicoeff]
+%elif %1 == 8
+    pxor          m3, m3
+%endif ; %1 == 8/16
+
+%if %1 == 8
+%define movlh movd
+%define movbh movh
+%define srcmul 1
+%else ; %1 == 9-16
+%define movlh movq
+%define movbh movu
+%define srcmul 2
+%endif ; %1 == 8/9-16
+
+%ifnidn %3, X
+
+    ; setup loop
+%if %3 == 8
+    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
+%define wshr 1
+%else ; %3 == 4
+%define wshr 0
+%endif ; %3 == 8
+    lea      filterq, [filterq+wq*8]
+%if %2 == 15
+    lea         dstq, [dstq+wq*(2>>wshr)]
+%else ; %2 == 19
+    lea         dstq, [dstq+wq*(4>>wshr)]
+%endif ; %2 == 15/19
+    lea      fltposq, [fltposq+wq*(4>>wshr)]
+    neg           wq
+
+.loop:
+%if %3 == 4 ; filterSize == 4 scaling
+    ; load 2x4 or 4x4 source pixels into m0/m1
+    mov32      pos0q, dword [fltposq+wq*4+ 0]   ; filterPos[0]
+    mov32      pos1q, dword [fltposq+wq*4+ 4]   ; filterPos[1]
+    movlh         m0, [srcq+pos0q*srcmul]       ; src[filterPos[0] + {0,1,2,3}]
+%if mmsize == 8
+    movlh         m1, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
+%else ; mmsize == 16
+%if %1 > 8
+    movhps        m0, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
+%else ; %1 == 8
+    movd          m4, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
+%endif
+    mov32      pos0q, dword [fltposq+wq*4+ 8]   ; filterPos[2]
+    mov32      pos1q, dword [fltposq+wq*4+12]   ; filterPos[3]
+    movlh         m1, [srcq+pos0q*srcmul]       ; src[filterPos[2] + {0,1,2,3}]
+%if %1 > 8
+    movhps        m1, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
+%else ; %1 == 8
+    movd          m5, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
+    punpckldq     m0, m4
+    punpckldq     m1, m5
+%endif ; %1 == 8
+%endif ; mmsize == 8/16
+%if %1 == 8
+    punpcklbw     m0, m3                        ; byte -> word
+    punpcklbw     m1, m3                        ; byte -> word
+%endif ; %1 == 8
+
+    ; multiply with filter coefficients
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+    psubw         m1, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
+    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
+
+    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
+%if mmsize == 8 ; mmx
+    movq          m4, m0
+    punpckldq     m0, m1
+    punpckhdq     m4, m1
+    paddd         m0, m4
+%elif notcpuflag(ssse3) ; sse2
+    mova          m4, m0
+    shufps        m0, m1, 10001000b
+    shufps        m4, m1, 11011101b
+    paddd         m0, m4
+%else ; ssse3/sse4
+    phaddd        m0, m1                        ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
+                                                ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
+                                                ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
+                                                ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
+%endif ; mmx/sse2/ssse3/sse4
+%else ; %3 == 8, i.e. filterSize == 8 scaling
+    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
+    mov32      pos0q, dword [fltposq+wq*2+0]    ; filterPos[0]
+    mov32      pos1q, dword [fltposq+wq*2+4]    ; filterPos[1]
+    movbh         m0, [srcq+ pos0q   *srcmul]   ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
+%if mmsize == 8
+    movbh         m1, [srcq+(pos0q+4)*srcmul]   ; src[filterPos[0] + {4,5,6,7}]
+    movbh         m4, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3}]
+    movbh         m5, [srcq+(pos1q+4)*srcmul]   ; src[filterPos[1] + {4,5,6,7}]
+%else ; mmsize == 16
+    movbh         m1, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
+    mov32      pos0q, dword [fltposq+wq*2+8]    ; filterPos[2]
+    mov32      pos1q, dword [fltposq+wq*2+12]   ; filterPos[3]
+    movbh         m4, [srcq+ pos0q   *srcmul]   ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
+    movbh         m5, [srcq+ pos1q   *srcmul]   ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
+%endif ; mmsize == 8/16
+%if %1 == 8
+    punpcklbw     m0, m3                        ; byte -> word
+    punpcklbw     m1, m3                        ; byte -> word
+    punpcklbw     m4, m3                        ; byte -> word
+    punpcklbw     m5, m3                        ; byte -> word
+%endif ; %1 == 8
+
+    ; multiply
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+    psubw         m1, m6
+    psubw         m4, m6
+    psubw         m5, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
+    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
+    pmaddwd       m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
+    pmaddwd       m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]
+
+    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
+%if mmsize == 8
+    paddd         m0, m1
+    paddd         m4, m5
+    movq          m1, m0
+    punpckldq     m0, m4
+    punpckhdq     m1, m4
+    paddd         m0, m1
+%elif notcpuflag(ssse3) ; sse2
+%if %1 == 8
+%define mex m6
+%else
+%define mex m3
+%endif
+    ; emulate horizontal add as transpose + vertical add
+    mova         mex, m0
+    punpckldq     m0, m1
+    punpckhdq    mex, m1
+    paddd         m0, mex
+    mova          m1, m4
+    punpckldq     m4, m5
+    punpckhdq     m1, m5
+    paddd         m4, m1
+    mova          m1, m0
+    punpcklqdq    m0, m4
+    punpckhqdq    m1, m4
+    paddd         m0, m1
+%else ; ssse3/sse4
+    ; FIXME if we rearrange the filter in pairs of 4, we can
+    ; load pixels likewise and use 2 x paddd + phaddd instead
+    ; of 3 x phaddd here, faster on older cpus
+    phaddd        m0, m1
+    phaddd        m4, m5
+    phaddd        m0, m4                        ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
+                                                ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
+                                                ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
+                                                ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
+%endif ; mmx/sse2/ssse3/sse4
+%endif ; %3 == 4/8
+
+%else ; %3 == X, i.e. any filterSize scaling
+
+%ifidn %4, X4
+%define dlt 4
+%else ; %4 == X || %4 == X8
+%define dlt 0
+%endif ; %4 ==/!= X4
+%if ARCH_X86_64
+%define srcq    r8
+%define pos1q   r7
+%define srcendq r9
+    movsxd  fltsizeq, fltsized                  ; filterSize
+    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
+%else ; x86-32
+%define srcq    srcmemq
+%define pos1q   dstq
+%define srcendq r6m
+    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
+    mov      srcendq, pos0q
+%endif ; x86-32/64
+    lea      fltposq, [fltposq+wq*4]
+%if %2 == 15
+    lea         dstq, [dstq+wq*2]
+%else ; %2 == 19
+    lea         dstq, [dstq+wq*4]
+%endif ; %2 == 15/19
+    movifnidn  dstmp, dstq
+    neg           wq
+
+.loop:
+    mov32      pos0q, dword [fltposq+wq*4+0]    ; filterPos[0]
+    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
+    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
+    pxor          m4, m4
+    pxor          m5, m5
+    mov         srcq, srcmemmp
+
+.innerloop:
+    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
+    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
+%if %1 == 8
+    punpcklbw     m0, m3
+    punpcklbw     m1, m3
+%endif ; %1 == 8
+
+    ; multiply
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+    psubw         m1, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
+    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
+    paddd         m4, m0
+    paddd         m5, m1
+    add      filterq, mmsize
+    add         srcq, srcmul*mmsize/2
+    cmp         srcq, srcendq                   ; while (src += 4) < &src[filterSize]
+    jl .innerloop
+
+%ifidn %4, X4
+    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
+    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
+    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
+%if %1 > 8
+    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
+%else ; %1 == 8
+    movd          m1, [srcq+(pos1q+dlt)*srcmul]
+    punpckldq     m0, m1
+%endif ; %1 == 8
+%if %1 == 8
+    punpcklbw     m0, m3
+%endif ; %1 == 8
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq]
+%endif ; %4 == X4
+
+    lea      filterq, [filterq+(fltsizeq+dlt)*2]
+
+%if mmsize == 8 ; mmx
+    movq          m0, m4
+    punpckldq     m4, m5
+    punpckhdq     m0, m5
+    paddd         m0, m4
+%else ; mmsize == 16
+%if notcpuflag(ssse3) ; sse2
+    mova          m1, m4
+    punpcklqdq    m4, m5
+    punpckhqdq    m1, m5
+    paddd         m4, m1
+%else ; ssse3/sse4
+    phaddd        m4, m5
+%endif ; sse2/ssse3/sse4
+%ifidn %4, X4
+    paddd         m4, m0
+%endif ; %3 == X4
+%if notcpuflag(ssse3) ; sse2
+    pshufd        m4, m4, 11011000b
+    movhlps       m0, m4
+    paddd         m0, m4
+%else ; ssse3/sse4
+    phaddd        m4, m4
+    SWAP           0, 4
+%endif ; sse2/ssse3/sse4
+%endif ; mmsize == 8/16
+%endif ; %3 ==/!= X
+
+%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
+    paddd         m0, m7
+%endif ; %1 == 16
+
+    ; clip, store
+    psrad         m0, 14 + %1 - %2
+%ifidn %3, X
+    movifnidn   dstq, dstmp
+%endif ; %3 == X
+%if %2 == 15
+    packssdw      m0, m0
+%ifnidn %3, X
+    movh [dstq+wq*(2>>wshr)], m0
+%else ; %3 == X
+    movd [dstq+wq*2], m0
+%endif ; %3 ==/!= X
+%else ; %2 == 19
+    PMINSD        m0, m2, m4
+%ifnidn %3, X
+    mova [dstq+wq*(4>>wshr)], m0
+%else ; %3 == X
+    movq [dstq+wq*4], m0
+%endif ; %3 ==/!= X
+%endif ; %2 == 15/19
+%ifnidn %3, X
+    add           wq, (mmsize<<wshr)/4          ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
+                                                ; per iteration. see "shl wq,1" above as for why we do this
+%else ; %3 == X
+    add           wq, 2
+%endif ; %3 ==/!= X
+    jl .loop
+    REP_RET
+%endmacro
+
+; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
+%macro SCALE_FUNCS 3
+SCALE_FUNC %1, %2, 4, 4,  6, %3
+SCALE_FUNC %1, %2, 8, 8,  6, %3
+%if mmsize == 8
+SCALE_FUNC %1, %2, X, X,  7, %3
+%else
+SCALE_FUNC %1, %2, X, X4, 7, %3
+SCALE_FUNC %1, %2, X, X8, 7, %3
+%endif
+%endmacro
+
+; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
+%macro SCALE_FUNCS2 3
+%if notcpuflag(sse4)
+SCALE_FUNCS  8, 15, %1
+SCALE_FUNCS  9, 15, %2
+SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 12, 15, %2
+SCALE_FUNCS 14, 15, %2
+SCALE_FUNCS 16, 15, %3
+%endif ; !sse4
+SCALE_FUNCS  8, 19, %1
+SCALE_FUNCS  9, 19, %2
+SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 12, 19, %2
+SCALE_FUNCS 14, 19, %2
+SCALE_FUNCS 16, 19, %3
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+SCALE_FUNCS2 0, 0, 0
+%endif
+INIT_XMM sse2
+SCALE_FUNCS2 7, 6, 8
+INIT_XMM ssse3
+SCALE_FUNCS2 6, 6, 8
+INIT_XMM sse4
+SCALE_FUNCS2 6, 6, 8
--- a/externals/ffmpeg/libswscale/x86/swscale.c
+++ b/externals/ffmpeg/libswscale/x86/swscale.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+
+const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
+    0x0103010301030103LL,
+    0x0200020002000200LL,};
+
+const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
+    0x0602060206020602LL,
+    0x0004000400040004LL,};
+
+#if HAVE_INLINE_ASM
+
+#define DITHER1XBPP
+
+DECLARE_ASM_CONST(8, uint64_t, bF8)=       0xF8F8F8F8F8F8F8F8LL;
+DECLARE_ASM_CONST(8, uint64_t, bFC)=       0xFCFCFCFCFCFCFCFCLL;
+DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL;
+DECLARE_ASM_CONST(8, uint64_t, w02)=       0x0002000200020002LL;
+
+DECLARE_ASM_CONST(8, uint64_t, b16Mask)=   0x001F001F001F001FLL;
+DECLARE_ASM_CONST(8, uint64_t, g16Mask)=   0x07E007E007E007E0LL;
+DECLARE_ASM_CONST(8, uint64_t, r16Mask)=   0xF800F800F800F800LL;
+DECLARE_ASM_CONST(8, uint64_t, b15Mask)=   0x001F001F001F001FLL;
+DECLARE_ASM_CONST(8, uint64_t, g15Mask)=   0x03E003E003E003E0LL;
+DECLARE_ASM_CONST(8, uint64_t, r15Mask)=   0x7C007C007C007C00LL;
+
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24A)         = 0x00FF0000FF0000FFLL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24B)         = 0xFF0000FF0000FF00LL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_M24C)         = 0x0000FF0000FF0000LL;
+
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
+
+
+//MMX versions
+#if HAVE_MMX_INLINE
+#undef RENAME
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define RENAME(a) a ## _mmx
+#include "swscale_template.c"
+#endif
+
+// MMXEXT versions
+#if HAVE_MMXEXT_INLINE
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _mmxext
+#include "swscale_template.c"
+#endif
+
+void ff_updateMMXDitherTables(SwsContext *c, int dstY)
+{
+    const int dstH= c->dstH;
+    const int flags= c->flags;
+
+    SwsPlane *lumPlane = &c->slice[c->numSlice-2].plane[0];
+    SwsPlane *chrUPlane = &c->slice[c->numSlice-2].plane[1];
+    SwsPlane *alpPlane = &c->slice[c->numSlice-2].plane[3];
+
+    int hasAlpha = c->needAlpha;
+    int32_t *vLumFilterPos= c->vLumFilterPos;
+    int32_t *vChrFilterPos= c->vChrFilterPos;
+    int16_t *vLumFilter= c->vLumFilter;
+    int16_t *vChrFilter= c->vChrFilter;
+    int32_t *lumMmxFilter= c->lumMmxFilter;
+    int32_t *chrMmxFilter= c->chrMmxFilter;
+    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
+    const int vLumFilterSize= c->vLumFilterSize;
+    const int vChrFilterSize= c->vChrFilterSize;
+    const int chrDstY= dstY>>c->chrDstVSubSample;
+    const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
+    const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
+
+    c->blueDither= ff_dither8[dstY&1];
+    if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
+        c->greenDither= ff_dither8[dstY&1];
+    else
+        c->greenDither= ff_dither4[dstY&1];
+    c->redDither= ff_dither8[(dstY+1)&1];
+    if (dstY < dstH - 2) {
+        const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
+        const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
+        const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && hasAlpha) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
+
+        int i;
+        if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
+            const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
+
+            int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
+            for (i = 0; i < neg;            i++)
+                tmpY[i] = lumSrcPtr[neg];
+            for (     ; i < end;            i++)
+                tmpY[i] = lumSrcPtr[i];
+            for (     ; i < vLumFilterSize; i++)
+                tmpY[i] = tmpY[i-1];
+            lumSrcPtr = tmpY;
+
+            if (alpSrcPtr) {
+                const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
+                for (i = 0; i < neg;            i++)
+                    tmpA[i] = alpSrcPtr[neg];
+                for (     ; i < end;            i++)
+                    tmpA[i] = alpSrcPtr[i];
+                for (     ; i < vLumFilterSize; i++)
+                    tmpA[i] = tmpA[i - 1];
+                alpSrcPtr = tmpA;
+            }
+        }
+        if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
+            const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
+            int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
+            for (i = 0; i < neg;            i++) {
+                tmpU[i] = chrUSrcPtr[neg];
+            }
+            for (     ; i < end;            i++) {
+                tmpU[i] = chrUSrcPtr[i];
+            }
+            for (     ; i < vChrFilterSize; i++) {
+                tmpU[i] = tmpU[i - 1];
+            }
+            chrUSrcPtr = tmpU;
+        }
+
+        if (flags & SWS_ACCURATE_RND) {
+            int s= APCK_SIZE / 8;
+            for (i=0; i<vLumFilterSize; i+=2) {
+                *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
+                *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
+                lumMmxFilter[s*i+APCK_COEF/4  ]=
+                lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
+                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1] * (1 << 16) : 0);
+                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
+                    *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
+                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
+                    alpMmxFilter[s*i+APCK_COEF/4  ]=
+                    alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
+                }
+            }
+            for (i=0; i<vChrFilterSize; i+=2) {
+                *(const void**)&chrMmxFilter[s*i              ]= chrUSrcPtr[i  ];
+                *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrUSrcPtr[i+(vChrFilterSize>1)];
+                chrMmxFilter[s*i+APCK_COEF/4  ]=
+                chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
+                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1] * (1 << 16) : 0);
+            }
+        } else {
+            for (i=0; i<vLumFilterSize; i++) {
+                *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
+                lumMmxFilter[4*i+2]=
+                lumMmxFilter[4*i+3]=
+                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
+                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
+                    *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
+                    alpMmxFilter[4*i+2]=
+                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
+                }
+            }
+            for (i=0; i<vChrFilterSize; i++) {
+                *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
+                chrMmxFilter[4*i+2]=
+                chrMmxFilter[4*i+3]=
+                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
+            }
+        }
+    }
+}
+
+#if HAVE_MMXEXT
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    if(((uintptr_t)dest) & 15){
+        yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
+        return;
+    }
+    filterSize--;
+#define MAIN_FUNCTION \
+        "pxor       %%xmm0, %%xmm0 \n\t" \
+        "punpcklbw  %%xmm0, %%xmm3 \n\t" \
+        "movd           %4, %%xmm1 \n\t" \
+        "punpcklwd  %%xmm1, %%xmm1 \n\t" \
+        "punpckldq  %%xmm1, %%xmm1 \n\t" \
+        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
+        "psllw          $3, %%xmm1 \n\t" \
+        "paddw      %%xmm1, %%xmm3 \n\t" \
+        "psraw          $4, %%xmm3 \n\t" \
+        "movdqa     %%xmm3, %%xmm4 \n\t" \
+        "movdqa     %%xmm3, %%xmm7 \n\t" \
+        "movl           %3, %%ecx  \n\t" \
+        "mov                                 %0, %%"FF_REG_d"        \n\t"\
+        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movddup                  8(%%"FF_REG_d"), %%xmm0   \n\t" /* filterCoeff */\
+        "movdqa              (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
+        "movdqa            16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
+        "add                                $16, %%"FF_REG_d"        \n\t"\
+        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
+        "test                         %%"FF_REG_S", %%"FF_REG_S"     \n\t"\
+        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
+        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
+        "paddw                            %%xmm2, %%xmm3      \n\t"\
+        "paddw                            %%xmm5, %%xmm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%xmm3      \n\t"\
+        "psraw                               $3, %%xmm4      \n\t"\
+        "packuswb                         %%xmm4, %%xmm3      \n\t"\
+        "movntdq                          %%xmm3, (%1, %%"FF_REG_c") \n\t"\
+        "add                         $16, %%"FF_REG_c"        \n\t"\
+        "cmp                          %2, %%"FF_REG_c"        \n\t"\
+        "movdqa                   %%xmm7, %%xmm3            \n\t" \
+        "movdqa                   %%xmm7, %%xmm4            \n\t" \
+        "mov                                 %0, %%"FF_REG_d"        \n\t"\
+        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
+        "jb                                  1b             \n\t"
+
+    if (offset) {
+        __asm__ volatile(
+            "movq          %5, %%xmm3  \n\t"
+            "movdqa    %%xmm3, %%xmm4  \n\t"
+            "psrlq        $24, %%xmm3  \n\t"
+            "psllq        $40, %%xmm4  \n\t"
+            "por       %%xmm4, %%xmm3  \n\t"
+            MAIN_FUNCTION
+              :: "g" (filter),
+              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
+              "m"(filterSize), "m"(((uint64_t *) dither)[0])
+              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
+                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
+              );
+    } else {
+        __asm__ volatile(
+            "movq          %5, %%xmm3   \n\t"
+            MAIN_FUNCTION
+              :: "g" (filter),
+              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
+              "m"(filterSize), "m"(((uint64_t *) dither)[0])
+              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
+                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
+              );
+    }
+}
+#endif
+
+#endif /* HAVE_INLINE_ASM */
+
+#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
+void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
+                                                SwsContext *c, int16_t *data, \
+                                                int dstW, const uint8_t *src, \
+                                                const int16_t *filter, \
+                                                const int32_t *filterPos, int filterSize)
+
+#define SCALE_FUNCS(filter_n, opt) \
+    SCALE_FUNC(filter_n,  8, 15, opt); \
+    SCALE_FUNC(filter_n,  9, 15, opt); \
+    SCALE_FUNC(filter_n, 10, 15, opt); \
+    SCALE_FUNC(filter_n, 12, 15, opt); \
+    SCALE_FUNC(filter_n, 14, 15, opt); \
+    SCALE_FUNC(filter_n, 16, 15, opt); \
+    SCALE_FUNC(filter_n,  8, 19, opt); \
+    SCALE_FUNC(filter_n,  9, 19, opt); \
+    SCALE_FUNC(filter_n, 10, 19, opt); \
+    SCALE_FUNC(filter_n, 12, 19, opt); \
+    SCALE_FUNC(filter_n, 14, 19, opt); \
+    SCALE_FUNC(filter_n, 16, 19, opt)
+
+#define SCALE_FUNCS_MMX(opt) \
+    SCALE_FUNCS(4, opt); \
+    SCALE_FUNCS(8, opt); \
+    SCALE_FUNCS(X, opt)
+
+#define SCALE_FUNCS_SSE(opt) \
+    SCALE_FUNCS(4, opt); \
+    SCALE_FUNCS(8, opt); \
+    SCALE_FUNCS(X4, opt); \
+    SCALE_FUNCS(X8, opt)
+
+#if ARCH_X86_32
+SCALE_FUNCS_MMX(mmx);
+#endif
+SCALE_FUNCS_SSE(sse2);
+SCALE_FUNCS_SSE(ssse3);
+SCALE_FUNCS_SSE(sse4);
+
+#define VSCALEX_FUNC(size, opt) \
+void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
+                                        const int16_t **src, uint8_t *dest, int dstW, \
+                                        const uint8_t *dither, int offset)
+#define VSCALEX_FUNCS(opt) \
+    VSCALEX_FUNC(8,  opt); \
+    VSCALEX_FUNC(9,  opt); \
+    VSCALEX_FUNC(10, opt)
+
+#if ARCH_X86_32
+VSCALEX_FUNCS(mmxext);
+#endif
+VSCALEX_FUNCS(sse2);
+VSCALEX_FUNCS(sse4);
+VSCALEX_FUNC(16, sse4);
+VSCALEX_FUNCS(avx);
+
+#define VSCALE_FUNC(size, opt) \
+void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
+                                        const uint8_t *dither, int offset)
+#define VSCALE_FUNCS(opt1, opt2) \
+    VSCALE_FUNC(8,  opt1); \
+    VSCALE_FUNC(9,  opt2); \
+    VSCALE_FUNC(10, opt2); \
+    VSCALE_FUNC(16, opt1)
+
+#if ARCH_X86_32
+VSCALE_FUNCS(mmx, mmxext);
+#endif
+VSCALE_FUNCS(sse2, sse2);
+VSCALE_FUNC(16, sse4);
+VSCALE_FUNCS(avx, avx);
+
+#define INPUT_Y_FUNC(fmt, opt) \
+void ff_ ## fmt ## ToY_  ## opt(uint8_t *dst, const uint8_t *src, \
+                                const uint8_t *unused1, const uint8_t *unused2, \
+                                int w, uint32_t *unused)
+#define INPUT_UV_FUNC(fmt, opt) \
+void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
+                                const uint8_t *unused0, \
+                                const uint8_t *src1, \
+                                const uint8_t *src2, \
+                                int w, uint32_t *unused)
+#define INPUT_FUNC(fmt, opt) \
+    INPUT_Y_FUNC(fmt, opt); \
+    INPUT_UV_FUNC(fmt, opt)
+#define INPUT_FUNCS(opt) \
+    INPUT_FUNC(uyvy, opt); \
+    INPUT_FUNC(yuyv, opt); \
+    INPUT_UV_FUNC(nv12, opt); \
+    INPUT_UV_FUNC(nv21, opt); \
+    INPUT_FUNC(rgba, opt); \
+    INPUT_FUNC(bgra, opt); \
+    INPUT_FUNC(argb, opt); \
+    INPUT_FUNC(abgr, opt); \
+    INPUT_FUNC(rgb24, opt); \
+    INPUT_FUNC(bgr24, opt)
+
+#if ARCH_X86_32
+INPUT_FUNCS(mmx);
+#endif
+INPUT_FUNCS(sse2);
+INPUT_FUNCS(ssse3);
+INPUT_FUNCS(avx);
+
+av_cold void ff_sws_init_swscale_x86(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_MMX_INLINE
+    if (INLINE_MMX(cpu_flags))
+        sws_init_swscale_mmx(c);
+#endif
+#if HAVE_MMXEXT_INLINE
+    if (INLINE_MMXEXT(cpu_flags))
+        sws_init_swscale_mmxext(c);
+    if (cpu_flags & AV_CPU_FLAG_SSE3){
+        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+            c->yuv2planeX = yuv2yuvX_sse3;
+    }
+#endif
+
+#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
+    if (c->srcBpc == 8) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale8to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 9) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale9to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 10) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale10to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 12) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale12to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth<16)) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+    } else { /* c->srcBpc == 16 */ \
+        av_assert0(c->srcBpc == 16);\
+        hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale16to19_ ## filtersize ## _ ## opt1; \
+    } \
+} while (0)
+#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+    switch (filtersize) { \
+    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+    default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
+    }
+#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
+switch(c->dstBpc){ \
+    case 16:                          do_16_case;                          break; \
+    case 10: if (!isBE(c->dstFormat) && c->dstFormat != AV_PIX_FMT_P010LE) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
+    case 9:  if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_  ## opt; break; \
+    case 8: if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_  ## opt; break; \
+    }
+#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
+    switch(c->dstBpc){ \
+    case 16: if (!isBE(c->dstFormat))            vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
+    case 10: if (!isBE(c->dstFormat) && c->dstFormat != AV_PIX_FMT_P010LE && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
+    case 9:  if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_  ## opt2;  break; \
+    case 8:                                      vscalefn = ff_yuv2plane1_8_  ## opt1;  break; \
+    default: av_assert0(c->dstBpc>8); \
+    }
+#define case_rgb(x, X, opt) \
+        case AV_PIX_FMT_ ## X: \
+            c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
+            if (!c->chrSrcHSubSample) \
+                c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
+            break
+#if ARCH_X86_32
+    if (EXTERNAL_MMX(cpu_flags)) {
+        ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
+        ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
+
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_YA8:
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            if (c->needAlpha)
+                c->alpToYV12 = ff_uyvyToY_mmx;
+            break;
+        case AV_PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            c->chrToYV12 = ff_yuyvToUV_mmx;
+            break;
+        case AV_PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_mmx;
+            c->chrToYV12 = ff_uyvyToUV_mmx;
+            break;
+        case AV_PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_mmx;
+            break;
+        case AV_PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_mmx;
+            break;
+        case_rgb(rgb24, RGB24, mmx);
+        case_rgb(bgr24, BGR24, mmx);
+        case_rgb(bgra,  BGRA,  mmx);
+        case_rgb(rgba,  RGBA,  mmx);
+        case_rgb(abgr,  ABGR,  mmx);
+        case_rgb(argb,  ARGB,  mmx);
+        default:
+            break;
+        }
+    }
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
+    }
+#endif /* ARCH_X86_32 */
+#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+    switch (filtersize) { \
+    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+    default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
+             else                ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
+             break; \
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
+                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_YA8:
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            if (c->needAlpha)
+                c->alpToYV12 = ff_uyvyToY_sse2;
+            break;
+        case AV_PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            c->chrToYV12 = ff_yuyvToUV_sse2;
+            break;
+        case AV_PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_sse2;
+            c->chrToYV12 = ff_uyvyToUV_sse2;
+            break;
+        case AV_PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_sse2;
+            break;
+        case AV_PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_sse2;
+            break;
+        case_rgb(rgb24, RGB24, sse2);
+        case_rgb(bgr24, BGR24, sse2);
+        case_rgb(bgra,  BGRA,  sse2);
+        case_rgb(rgba,  RGBA,  sse2);
+        case_rgb(abgr,  ABGR,  sse2);
+        case_rgb(argb,  ARGB,  sse2);
+        default:
+            break;
+        }
+    }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
+        switch (c->srcFormat) {
+        case_rgb(rgb24, RGB24, ssse3);
+        case_rgb(bgr24, BGR24, ssse3);
+        default:
+            break;
+        }
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        /* Xto15 don't need special sse4 functions */
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
+                            if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
+                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        if (c->dstBpc == 16 && !isBE(c->dstFormat))
+            c->yuv2plane1 = ff_yuv2plane1_16_sse4;
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
+                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
+
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_YUYV422:
+            c->chrToYV12 = ff_yuyvToUV_avx;
+            break;
+        case AV_PIX_FMT_UYVY422:
+            c->chrToYV12 = ff_uyvyToUV_avx;
+            break;
+        case AV_PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_avx;
+            break;
+        case AV_PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_avx;
+            break;
+        case_rgb(rgb24, RGB24, avx);
+        case_rgb(bgr24, BGR24, avx);
+        case_rgb(bgra,  BGRA,  avx);
+        case_rgb(rgba,  RGBA,  avx);
+        case_rgb(abgr,  ABGR,  avx);
+        case_rgb(argb,  ARGB,  avx);
+        default:
+            break;
+        }
+    }
+}
--- a/externals/ffmpeg/libswscale/x86/swscale_template.c
+++ b/externals/ffmpeg/libswscale/x86/swscale_template.c
--- a/externals/ffmpeg/libswscale/x86/w64xmmtest.c
+++ b/externals/ffmpeg/libswscale/x86/w64xmmtest.c
@@ -0,0 +1,31 @@
+/*
+ * check XMM registers for clobbers on Win64
+ * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/w64xmmtest.h"
+#include "libswscale/swscale.h"
+
+wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
+               const int srcStride[], int srcSliceY, int srcSliceH,
+               uint8_t *const dst[], const int dstStride[]))
+{
+    testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
+                    srcSliceH, dst, dstStride);
+}
--- a/externals/ffmpeg/libswscale/x86/yuv2rgb.c
+++ b/externals/ffmpeg/libswscale/x86/yuv2rgb.c
@@ -0,0 +1,155 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2009 Konstantin Shishkov
+ *
+ * MMX/MMXEXT template stuff (needed for fast movntq support),
+ * 1,4,8bpp support and context / deglobalize stuff
+ * by Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "config.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+
+#if HAVE_X86ASM
+
+#define DITHER1XBPP // only for MMX
+
+/* hope these constant values are cache line aligned */
+DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw)   = 0x00ff00ff00ff00ffULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
+DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
+DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
+DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
+
+//MMX versions
+#if HAVE_MMX
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define RENAME(a) a ## _mmx
+#include "yuv2rgb_template.c"
+#endif /* HAVE_MMX */
+
+// MMXEXT versions
+#if HAVE_MMXEXT
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _mmxext
+#include "yuv2rgb_template.c"
+#endif /* HAVE_MMXEXT */
+
+//SSSE3 versions
+#if HAVE_SSSE3
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define RENAME(a) a ## _ssse3
+#include "yuv2rgb_template.c"
+#endif
+
+#endif /* HAVE_X86ASM */
+
+av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_RGB32:
+            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if CONFIG_SWSCALE_ALPHA
+                return yuva420_rgb32_ssse3;
+#endif
+                break;
+            } else
+                return yuv420_rgb32_ssse3;
+        case AV_PIX_FMT_BGR32:
+            if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if CONFIG_SWSCALE_ALPHA
+                return yuva420_bgr32_ssse3;
+#endif
+                break;
+            } else
+                return yuv420_bgr32_ssse3;
+        case AV_PIX_FMT_RGB24:
+            return yuv420_rgb24_ssse3;
+        case AV_PIX_FMT_BGR24:
+            return yuv420_bgr24_ssse3;
+        case AV_PIX_FMT_RGB565:
+            return yuv420_rgb16_ssse3;
+        case AV_PIX_FMT_RGB555:
+            return yuv420_rgb15_ssse3;
+        }
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_RGB24:
+            return yuv420_rgb24_mmxext;
+        case AV_PIX_FMT_BGR24:
+            return yuv420_bgr24_mmxext;
+        }
+    }
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        switch (c->dstFormat) {
+            case AV_PIX_FMT_RGB32:
+                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if CONFIG_SWSCALE_ALPHA
+                    return yuva420_rgb32_mmx;
+#endif
+                    break;
+                } else
+                    return yuv420_rgb32_mmx;
+            case AV_PIX_FMT_BGR32:
+                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if CONFIG_SWSCALE_ALPHA
+                    return yuva420_bgr32_mmx;
+#endif
+                    break;
+                } else
+                    return yuv420_bgr32_mmx;
+            case AV_PIX_FMT_RGB24:
+                return yuv420_rgb24_mmx;
+            case AV_PIX_FMT_BGR24:
+                return yuv420_bgr24_mmx;
+            case AV_PIX_FMT_RGB565:
+                return yuv420_rgb16_mmx;
+            case AV_PIX_FMT_RGB555:
+                return yuv420_rgb15_mmx;
+        }
+    }
+
+#endif /* HAVE_X86ASM */
+    return NULL;
+}
--- a/externals/ffmpeg/libswscale/x86/yuv2rgb_template.c
+++ b/externals/ffmpeg/libswscale/x86/yuv2rgb_template.c
@@ -0,0 +1,195 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2001-2007 Michael Niedermayer
+ *           (c) 2010 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/x86/asm.h"
+#include "libswscale/swscale_internal.h"
+
+#define YUV2RGB_LOOP(depth)                                          \
+    h_size = (c->dstW + 7) & ~7;                                     \
+    if (h_size * depth > FFABS(dstStride[0]))                        \
+        h_size -= 8;                                                 \
+                                                                     \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                        \
+                                                                     \
+    for (y = 0; y < srcSliceH; y++) {                                \
+        uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
+        const uint8_t *py = src[0] +               y * srcStride[0]; \
+        const uint8_t *pu = src[1] +   (y >> vshift) * srcStride[1]; \
+        const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
+        x86_reg index = -h_size / 2;                                 \
+
+extern void RENAME(ff_yuv_420_rgb24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_bgr24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+
+#if !COMPILE_TEMPLATE_MMXEXT
+extern void RENAME(ff_yuv_420_rgb15)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_rgb16)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuv_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                     const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                     const uint8_t *py_2index);
+extern void RENAME(ff_yuva_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                      const uint8_t *py_2index, const uint8_t *pa_2index);
+extern void RENAME(ff_yuva_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
+                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
+                                      const uint8_t *py_2index, const uint8_t *pa_2index);
+
+static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(2)
+
+#ifdef DITHER1XBPP
+        c->blueDither  = ff_dither8[y       & 1];
+        c->greenDither = ff_dither8[y       & 1];
+        c->redDither   = ff_dither8[(y + 1) & 1];
+#endif
+
+        RENAME(ff_yuv_420_rgb15)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+    }
+    return srcSliceH;
+}
+
+static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(2)
+
+#ifdef DITHER1XBPP
+        c->blueDither  = ff_dither8[y       & 1];
+        c->greenDither = ff_dither4[y       & 1];
+        c->redDither   = ff_dither8[(y + 1) & 1];
+#endif
+
+        RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+    }
+    return srcSliceH;
+}
+
+static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(4)
+
+        RENAME(ff_yuv_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+    }
+    return srcSliceH;
+}
+
+static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(4)
+
+        RENAME(ff_yuv_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+    }
+    return srcSliceH;
+}
+
+static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+    YUV2RGB_LOOP(4)
+
+        const uint8_t *pa = src[3] + y * srcStride[3];
+        RENAME(ff_yuva_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+    }
+    return srcSliceH;
+}
+
+static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
+                                        int srcStride[],
+                                        int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(4)
+
+        const uint8_t *pa = src[3] + y * srcStride[3];
+        RENAME(ff_yuva_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
+    }
+    return srcSliceH;
+}
+#endif
+
+static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(3)
+
+        RENAME(ff_yuv_420_rgb24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+    }
+    return srcSliceH;
+}
+
+static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
+                                        int srcStride[],
+                                        int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(3)
+
+        RENAME(ff_yuv_420_bgr24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
+    }
+    return srcSliceH;
+}
+
--- a/externals/ffmpeg/libswscale/x86/yuv_2_rgb.asm
+++ b/externals/ffmpeg/libswscale/x86/yuv_2_rgb.asm
@@ -0,0 +1,383 @@
+;******************************************************************************
+;* software YUV to RGB converter
+;*
+;* Copyright (C) 2001-2007 Michael Niedermayer
+;*           (c) 2010 Konstantin Shishkov
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; below variables are named like mask_dwXY, which means to preserve dword No.X & No.Y
+mask_dw036 : db -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0
+mask_dw147 : db  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1
+mask_dw25  : db  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0
+rgb24_shuf1: db  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5, 10, 11
+rgb24_shuf2: db 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5
+rgb24_shuf3: db  4,  5, 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15
+pw_00ff: times 8 dw 255
+pb_f8:   times 16 db 248
+pb_e0:   times 16 db 224
+pb_03:   times 16 db 3
+pb_07:   times 16 db 7
+
+mask_1101: dw -1, -1,  0, -1
+mask_0010: dw  0,  0, -1,  0
+mask_0110: dw  0, -1, -1,  0
+mask_1001: dw -1,  0,  0, -1
+mask_0100: dw  0, -1,  0,  0
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+;
+; YUV420/YUVA420 to RGB/BGR 15/16/24/32
+; R = Y + ((vrCoff * (v - 128)) >> 8)
+; G = Y - ((ugCoff * (u - 128) + vgCoff * (v - 128)) >> 8)
+; B = Y + ((ubCoff * (u - 128)) >> 8)
+;
+;-----------------------------------------------------------------------------
+
+%macro MOV_H2L 1
+%if mmsize == 8
+    psrlq %1, 32
+%else ; mmsize == 16
+    psrldq %1, 8
+%endif
+%endmacro
+
+%macro yuv2rgb_fn 3
+
+%if %3 == 32
+    %ifidn %1, yuva
+    %define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index, pa_2index
+    %define GPR_num 7
+    %endif
+%else
+    %define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index
+    %define GPR_num 6
+%endif
+
+%define m_green m2
+%define m_alpha m3
+%define m_y m6
+%define m_u m0
+%define m_v m1
+%ifidn %2, rgb
+%define m_red m1
+%define m_blue m0
+%else
+%define m_red m0
+%define m_blue m1
+%endif
+
+%if mmsize == 8
+%define time_num 1
+%define reg_num 8
+%define y_offset [pointer_c_ditherq + 8  * 8]
+%define u_offset [pointer_c_ditherq + 9  * 8]
+%define v_offset [pointer_c_ditherq + 10 * 8]
+%define ug_coff  [pointer_c_ditherq + 7  * 8]
+%define vg_coff  [pointer_c_ditherq + 6  * 8]
+%define y_coff   [pointer_c_ditherq + 3  * 8]
+%define ub_coff  [pointer_c_ditherq + 5  * 8]
+%define vr_coff  [pointer_c_ditherq + 4  * 8]
+%elif mmsize == 16
+%define time_num 2
+%if ARCH_X86_32
+%define reg_num 8
+%define my_offset [pointer_c_ditherq + 8  * 8]
+%define mu_offset [pointer_c_ditherq + 9  * 8]
+%define mv_offset [pointer_c_ditherq + 10 * 8]
+%define mug_coff  [pointer_c_ditherq + 7  * 8]
+%define mvg_coff  [pointer_c_ditherq + 6  * 8]
+%define my_coff   [pointer_c_ditherq + 3  * 8]
+%define mub_coff  [pointer_c_ditherq + 5  * 8]
+%define mvr_coff  [pointer_c_ditherq + 4  * 8]
+%else ; ARCH_X86_64
+%define reg_num 16
+%define y_offset m8
+%define u_offset m9
+%define v_offset m10
+%define ug_coff  m11
+%define vg_coff  m12
+%define y_coff   m13
+%define ub_coff  m14
+%define vr_coff  m15
+%endif ; ARCH_X86_32/64
+%endif ; coeff define mmsize == 8/16
+
+cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
+
+%if ARCH_X86_64
+    movsxd indexq, indexd
+%if mmsize == 16
+    VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8]
+    VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8]
+    VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
+    VBROADCASTSD ug_coff,  [pointer_c_ditherq + 7  * 8]
+    VBROADCASTSD vg_coff,  [pointer_c_ditherq + 6  * 8]
+    VBROADCASTSD y_coff,   [pointer_c_ditherq + 3  * 8]
+    VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8]
+    VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8]
+%endif
+%endif
+    movu m_y, [py_2indexq + 2 * indexq]
+    movh m_u, [pu_indexq  +     indexq]
+    movh m_v, [pv_indexq  +     indexq]
+.loop0:
+    pxor m4, m4
+    mova m7, m6
+    punpcklbw m0, m4
+    punpcklbw m1, m4
+    mova m2, [pw_00ff]
+    pand m6, m2
+    psrlw m7, 8
+    psllw m0, 3
+    psllw m1, 3
+    psllw m6, 3
+    psllw m7, 3
+%if (ARCH_X86_32 && mmsize == 16)
+    VBROADCASTSD m2, mu_offset
+    VBROADCASTSD m3, mv_offset
+    VBROADCASTSD m4, my_offset
+    psubsw m0, m2 ; U = U - 128
+    psubsw m1, m3 ; V = V - 128
+    psubw  m6, m4
+    psubw  m7, m4
+    VBROADCASTSD m2, mug_coff
+    VBROADCASTSD m3, mvg_coff
+    VBROADCASTSD m4, my_coff
+    VBROADCASTSD m5, mub_coff
+    pmulhw m2, m0
+    pmulhw m3, m1
+    pmulhw m6, m4
+    pmulhw m7, m4
+    pmulhw m0, m5
+    VBROADCASTSD m4, mvr_coff
+    pmulhw m1, m4
+%else ; ARCH_X86_64 || mmsize == 8
+    psubsw m0, u_offset ; U = U - 128
+    psubsw m1, v_offset ; V = V - 128
+    psubw  m6, y_offset
+    psubw  m7, y_offset
+    mova m2, m0
+    mova m3, m1
+    pmulhw m2, ug_coff
+    pmulhw m3, vg_coff
+    pmulhw m6, y_coff
+    pmulhw m7, y_coff
+    pmulhw m0, ub_coff
+    pmulhw m1, vr_coff
+%endif
+    paddsw m2, m3
+    mova m3, m7
+    mova m5, m7
+    paddsw m3, m0 ; B1 B3 B5 B7 ...
+    paddsw m5, m1 ; R1 R3 R5 R7 ...
+    paddsw m7, m2 ; G1 G3 G4 G7 ...
+    paddsw m0, m6 ; B0 B2 B4 B6 ...
+    paddsw m1, m6 ; R0 R2 R4 R6 ...
+    paddsw m2, m6 ; G0 G2 G4 G6 ...
+
+%if %3 == 24 ; PACK RGB24
+%define depth 3
+    packuswb m0, m3 ; R0 R2 R4 R6 ... R1 R3 R5 R7 ...
+    packuswb m1, m5 ; B0 B2 B4 B6 ... B1 B3 B5 B7 ...
+    packuswb m2, m7 ; G0 G2 G4 G6 ... G1 G3 G5 G7 ...
+    mova m3, m_red
+    mova m6, m_blue
+    MOV_H2L m_red
+    punpcklbw m3, m2     ; R0 G0 R2 G2 R4 G4 R6 G6 R8 G8 ...
+    punpcklbw m6, m_red  ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
+    mova m5, m3
+    punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
+%if  mmsize == 8
+    punpcklwd m3 ,m6     ; R0 G0 B0 R1 R2 G2 B2 R3
+    punpckhwd m5, m6     ; R4 G4 B4 R5 R6 G6 B6 R7
+%if cpuflag(mmxext)
+    pshufw m1, m2, 0xc6
+    pshufw m6, m3, 0x84
+    pshufw m7, m5, 0x38
+    pand m6, [mask_1101] ; R0 G0 B0 R1 -- -- R2 G2
+    movq m0, m1
+    pand m7, [mask_0110] ; -- -- R6 G6 B6 R7 -- --
+    movq m2, m1
+    pand m1, [mask_0100] ; -- -- G3 B3 -- -- -- --
+    psrlq m3, 48         ; B2 R3 -- -- -- -- -- --
+    pand m0, [mask_0010] ; -- -- -- -- G1 B1 -- --
+    psllq m5, 32         ; -- -- -- -- R4 G4 B4 R5
+    pand m2, [mask_1001] ; G5 B5 -- -- -- -- G7 B7
+    por m1, m3
+    por m0, m6
+    por m1, m5
+    por m2, m7
+    movntq [imageq], m0
+    movntq [imageq + 8], m1
+    movntq [imageq + 16], m2
+%else ; cpuflag(mmx)
+    movd [imageq], m3      ; R0 G0 R2 G2
+    movd [imageq + 4], m2  ; G1 B1
+    psrlq m3, 32
+    psrlq m2, 16
+    movd [imageq + 6], m3  ; R2 G2 B2 R3
+    movd [imageq + 10], m2 ; G3 B3
+    psrlq m2, 16
+    movd [imageq + 12], m5 ; R4 G4 B4 R5
+    movd [imageq + 16], m2 ; G5 B5
+    psrlq m5, 32
+    movd [imageq + 20], m2 ; -- -- G7 B7
+    movd [imageq + 18], m5 ; R6 G6 B6 R7
+%endif ; mmsize = 8
+%else ; mmsize == 16
+    pshufb m3, [rgb24_shuf1] ; r0  g0  r6  g6  r12 g12 r2  g2  r8  g8  r14 g14 r4  g4  r10 g10
+    pshufb m6, [rgb24_shuf2] ; b10 r11 b0  r1  b6  r7  b12 r13 b2  r3  b8  r9  b14 r15 b4  r5
+    pshufb m2, [rgb24_shuf3] ; g5  b5  g11 b11 g1  b1  g7  b7  g13 b13 g3  b3  g9  b9  g15 b15
+    mova   m7, [mask_dw036]
+    mova   m4, [mask_dw147]
+    mova   m5, [mask_dw25]
+    pand   m0, m7, m3      ; r0  g0  --- --- --- --- r2  g2  --- --- --- --- r4  g4  --- ---
+    pand   m1, m4, m6      ; --- --- b0  r1  --- --- --- --- b2  r3  --- --- --- --- b4  r5
+    por    m0, m1
+    pand   m1, m5, m2      ; --- --- --- --- g1  b1  --- --- --- --- g3  b3  --- --- --- ---
+    por    m0, m1          ; r0  g0  b0  r1  g1  b1  r2  g2  b2  r3  g3  b3  r4  g4  b4  r5
+    pand   m1, m7, m2      ; g5  b5  --- --- --- --- g7  b7  --- --- --- --- g9  b9  --- ---
+    pand   m7, m6          ; b10 r11 --- --- --- --- b12 r13 --- --- --- --- b14 r15 --- ---
+    pand   m6, m5          ; --- --- --- --- b6  r7  --- --- --- --- b8  r9  --- --- --- ---
+    por    m1, m6
+    pand   m6, m4, m3      ; --- --- r6  g6  --- --- --- --- r8  g8  --- --- --- --- r10 g10
+    pand   m2, m4          ; --- --- g11 b11 --- --- --- --- g13 b13 --- --- --- --- g15 b15
+    pand   m3, m5          ; --- --- --- --- r12 g12 --- --- --- --- r14 g14 --- --- --- ---
+    por    m2, m7
+    por    m1, m6          ; g5  b5  r6  g6  b6  r7  g7  b7  r8  g8  b8  r9  g9  b9  r10 g10
+    por    m2, m3          ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
+    mova [imageq], m0
+    mova [imageq + 16], m1
+    mova [imageq + 32], m2
+%endif ; mmsize = 16
+%else ; PACK RGB15/16/32
+    packuswb m0, m1
+    packuswb m3, m5
+    packuswb m2, m2
+    mova m1, m0
+    packuswb m7, m7
+    punpcklbw m0, m3 ; B0 B1 B2 B3 ... B7
+    punpckhbw m1, m3 ; R0 R1 R2 R3 ... R7
+    punpcklbw m2, m7 ; G0 G1 G2 G3 ... G7
+%if %3 == 32 ; PACK RGB32
+%define depth 4
+%ifidn %1, yuv
+    pcmpeqd m3, m3 ; Set alpha empty
+%else
+    mova m3, [pa_2indexq + 2 * indexq] ; Load alpha
+%endif
+    mova m5, m_blue
+    mova m6, m_red
+    punpckhbw m5,     m_green
+    punpcklbw m_blue, m_green
+    punpckhbw m6,     m_alpha
+    punpcklbw m_red,  m_alpha
+    mova m_green, m_blue
+    mova m_alpha, m5
+    punpcklwd m_blue, m_red
+    punpckhwd m_green, m_red
+    punpcklwd m5, m6
+    punpckhwd m_alpha, m6
+    mova [imageq + 0], m_blue
+    mova [imageq + 8  * time_num], m_green
+    mova [imageq + 16 * time_num], m5
+    mova [imageq + 24 * time_num], m_alpha
+%else ; PACK RGB15/16
+%define depth 2
+%if cpuflag(ssse3)
+    %define red_dither m3
+    %define green_dither m4
+    %define blue_dither m5
+    VBROADCASTSD red_dither,   [pointer_c_ditherq + 0 * 8]
+    VBROADCASTSD green_dither, [pointer_c_ditherq + 1 * 8]
+    VBROADCASTSD blue_dither,  [pointer_c_ditherq + 2 * 8]
+%else ; cpuflag(mmx/mmxext)
+%define blue_dither  [pointer_c_ditherq + 2  * 8]
+%define green_dither [pointer_c_ditherq + 1  * 8]
+%define red_dither   [pointer_c_ditherq + 0  * 8]
+%endif
+%if %3 == 15
+%define gmask pb_03
+%define isRGB15 1
+%else
+%define gmask pb_07
+%define isRGB15 0
+%endif
+    paddusb m0, blue_dither
+    paddusb m2, green_dither
+    paddusb m1, red_dither
+    pand m0, [pb_f8]
+    pand m1, [pb_f8]
+    mova m3, m2
+    psllw m2, 3 - isRGB15
+    psrlw m3, 5 + isRGB15
+    psrlw m0, 3
+    psrlw m1, isRGB15
+    pand m2, [pb_e0]
+    pand m3, [gmask]
+    por m0, m2
+    por m1, m3
+    mova m2, m0
+    punpcklbw m0, m1
+    punpckhbw m2, m1
+    mova [imageq], m0
+    mova [imageq + 8 * time_num], m2
+%endif ; PACK RGB15/16
+%endif ; PACK RGB15/16/32
+
+movu m_y, [py_2indexq + 2 * indexq + 8 * time_num]
+movh m_v, [pv_indexq  +     indexq + 4 * time_num]
+movh m_u, [pu_indexq  +     indexq + 4 * time_num]
+add imageq, 8 * depth * time_num
+add indexq, 4 * time_num
+js .loop0
+
+REP_RET
+
+%endmacro
+
+INIT_MMX mmx
+yuv2rgb_fn yuv,  rgb, 24
+yuv2rgb_fn yuv,  bgr, 24
+yuv2rgb_fn yuv,  rgb, 32
+yuv2rgb_fn yuv,  bgr, 32
+yuv2rgb_fn yuva, rgb, 32
+yuv2rgb_fn yuva, bgr, 32
+yuv2rgb_fn yuv,  rgb, 15
+yuv2rgb_fn yuv,  rgb, 16
+
+INIT_MMX mmxext
+yuv2rgb_fn yuv, rgb, 24
+yuv2rgb_fn yuv, bgr, 24
+
+INIT_XMM ssse3
+yuv2rgb_fn yuv,  rgb, 24
+yuv2rgb_fn yuv,  bgr, 24
+yuv2rgb_fn yuv,  rgb, 32
+yuv2rgb_fn yuv,  bgr, 32
+yuv2rgb_fn yuva, rgb, 32
+yuv2rgb_fn yuva, bgr, 32
+yuv2rgb_fn yuv,  rgb, 15
+yuv2rgb_fn yuv,  rgb, 16