      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 
     13 #if defined (_M_X64)
     14 #include <emmintrin.h>
     15 #include <tmmintrin.h>  // For _mm_maddubs_epi16
     16 #endif
     17 
     18 #ifdef __cplusplus
     19 namespace libyuv {
     20 extern "C" {
     21 #endif
     22 
     23 // This module is for Visual C.
     24 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
     25 
     26 #define YG 74  /* (int8)(1.164 * 64 + 0.5) */
     27 
     28 #define UB 127  /* min(127,(int8)(2.018 * 64)) */
     29 #define UG -25  /* (int8)(-0.391 * 64 - 0.5) */
     30 #define UR 0
     31 
     32 #define VB 0
     33 #define VG -52  /* (int8)(-0.813 * 64 - 0.5) */
     34 #define VR 102  /* (int8)(1.596 * 64 + 0.5) */
     35 
      36 // Bias values that cancel the 128 offset in U and V.
     37 #define BB UB * 128 + VB * 128
     38 #define BG UG * 128 + VG * 128
     39 #define BR UR * 128 + VR * 128
     40 
     41 static const vec8 kUVToB = {
     42   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
     43 };
     44 
     45 static const vec8 kUVToR = {
     46   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
     47 };
     48 
     49 static const vec8 kUVToG = {
     50   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
     51 };
     52 
     53 static const vec8 kVUToB = {
     54   VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
     55 };
     56 
     57 static const vec8 kVUToR = {
     58   VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
     59 };
     60 
     61 static const vec8 kVUToG = {
     62   VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
     63 };
     64 
     65 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
     66 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
     67 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
     68 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
     69 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
     70 
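// The constants above implement a 6 bit fixed-point YUV to RGB conversion.
// The helpers below are a reference-only editorial sketch, not part of the
// original source (the names ClampReference and YuvPixelReference are
// hypothetical), showing the same math for one pixel while ignoring the
// 16 bit saturation of the SIMD path.
static __inline uint8 ClampReference(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static __inline void YuvPixelReference(uint8 y, uint8 u, uint8 v,
                                       uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)y - 16) * YG;  // luma term, scaled by 64.
  // Subtracting the bias turns u and v into (u - 128) and (v - 128) terms.
  *b = ClampReference((u * UB + v * VB - (BB) + y1) >> 6);
  *g = ClampReference((u * UG + v * VG - (BG) + y1) >> 6);
  *r = ClampReference((u * UR + v * VR - (BR) + y1) >> 6);
}
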
     71 // 64 bit
     72 #if defined(_M_X64)
     73 
     74 // Aligned destination version.
     75 __declspec(align(16))
     76 void I422ToARGBRow_SSSE3(const uint8* y_buf,
     77                          const uint8* u_buf,
     78                          const uint8* v_buf,
     79                          uint8* dst_argb,
     80                          int width) {
     81 
     82   __m128i xmm0, xmm1, xmm2, xmm3;
     83   const __m128i xmm5 = _mm_set1_epi8(-1);
     84   const __m128i xmm4 = _mm_setzero_si128();
     85   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
     86 
     87   while (width > 0) {
     88     xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
     89     xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
     90     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
     91     xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
     92     xmm1 = _mm_load_si128(&xmm0);
     93     xmm2 = _mm_load_si128(&xmm0);
     94     xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
     95     xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
     96     xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
     97     xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
     98     xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
     99     xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
    100     xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    101     xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
    102     xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
    103     xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
    104     xmm0 = _mm_adds_epi16(xmm0, xmm3);
    105     xmm1 = _mm_adds_epi16(xmm1, xmm3);
    106     xmm2 = _mm_adds_epi16(xmm2, xmm3);
    107     xmm0 = _mm_srai_epi16(xmm0, 6);
    108     xmm1 = _mm_srai_epi16(xmm1, 6);
    109     xmm2 = _mm_srai_epi16(xmm2, 6);
    110     xmm0 = _mm_packus_epi16(xmm0, xmm0);
    111     xmm1 = _mm_packus_epi16(xmm1, xmm1);
    112     xmm2 = _mm_packus_epi16(xmm2, xmm2);
    113     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    114     xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
    115     xmm1 = _mm_load_si128(&xmm0);
    116     xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    117     xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
    118 
    119     _mm_store_si128((__m128i *)dst_argb, xmm0);
    120     _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
    121 
    122     y_buf += 8;
    123     u_buf += 4;
    124     dst_argb += 32;
    125     width -= 8;
    126   }
    127 }
    128 
    129 // Unaligned destination version.
    130 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
    131                                    const uint8* u_buf,
    132                                    const uint8* v_buf,
    133                                    uint8* dst_argb,
    134                                    int width) {
    135 
    136   __m128i xmm0, xmm1, xmm2, xmm3;
    137   const __m128i xmm5 = _mm_set1_epi8(-1);
    138   const __m128i xmm4 = _mm_setzero_si128();
    139   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
    140 
    141   while (width > 0) {
    142     xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
    143     xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
    144     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    145     xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    146     xmm1 = _mm_load_si128(&xmm0);
    147     xmm2 = _mm_load_si128(&xmm0);
    148     xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
    149     xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
    150     xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
    151     xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
    152     xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
    153     xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
    154     xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    155     xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
    156     xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
    157     xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
    158     xmm0 = _mm_adds_epi16(xmm0, xmm3);
    159     xmm1 = _mm_adds_epi16(xmm1, xmm3);
    160     xmm2 = _mm_adds_epi16(xmm2, xmm3);
    161     xmm0 = _mm_srai_epi16(xmm0, 6);
    162     xmm1 = _mm_srai_epi16(xmm1, 6);
    163     xmm2 = _mm_srai_epi16(xmm2, 6);
    164     xmm0 = _mm_packus_epi16(xmm0, xmm0);
    165     xmm1 = _mm_packus_epi16(xmm1, xmm1);
    166     xmm2 = _mm_packus_epi16(xmm2, xmm2);
    167     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    168     xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
    169     xmm1 = _mm_load_si128(&xmm0);
    170     xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    171     xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
    172 
    173     _mm_storeu_si128((__m128i *)dst_argb, xmm0);
    174     _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
    175 
    176     y_buf += 8;
    177     u_buf += 4;
    178     dst_argb += 32;
    179     width -= 8;
    180   }
    181 }
    182 // 32 bit
    183 #else  // defined(_M_X64)
    184 
    185 #ifdef HAS_ARGBTOYROW_SSSE3
    186 
    187 // Constants for ARGB.
    188 static const vec8 kARGBToY = {
    189   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
    190 };
    191 
     192 // JPEG full range.
    193 static const vec8 kARGBToYJ = {
    194   15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
    195 };
    196 
    197 static const vec8 kARGBToU = {
    198   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
    199 };
    200 
    201 static const vec8 kARGBToUJ = {
    202   127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
    203 };
    204 
    205 static const vec8 kARGBToV = {
    206   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
    207 };
    208 
    209 static const vec8 kARGBToVJ = {
    210   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
    211 };
    212 
     213 // vpermd to restore linear order after the per-lane vphaddw + vpackuswb.
    214 static const lvec32 kPermdARGBToY_AVX = {
    215   0, 4, 1, 5, 2, 6, 3, 7
    216 };
    217 
     218 // vpshufb to reorder words for vphaddw + vpackuswb when results are kept packed as shorts.
    219 static const lvec8 kShufARGBToUV_AVX = {
    220   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    221   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    222 };
    223 
    224 // Constants for BGRA.
    225 static const vec8 kBGRAToY = {
    226   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
    227 };
    228 
    229 static const vec8 kBGRAToU = {
    230   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
    231 };
    232 
    233 static const vec8 kBGRAToV = {
    234   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
    235 };
    236 
    237 // Constants for ABGR.
    238 static const vec8 kABGRToY = {
    239   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
    240 };
    241 
    242 static const vec8 kABGRToU = {
    243   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
    244 };
    245 
    246 static const vec8 kABGRToV = {
    247   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
    248 };
    249 
    250 // Constants for RGBA.
    251 static const vec8 kRGBAToY = {
    252   0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
    253 };
    254 
    255 static const vec8 kRGBAToU = {
    256   0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
    257 };
    258 
    259 static const vec8 kRGBAToV = {
    260   0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
    261 };
    262 
    263 static const uvec8 kAddY16 = {
    264   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
    265 };
    266 
    267 static const vec16 kAddYJ64 = {
    268   64, 64, 64, 64, 64, 64, 64, 64
    269 };
    270 
    271 static const uvec8 kAddUV128 = {
    272   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
    273   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
    274 };
    275 
    276 static const uvec16 kAddUVJ128 = {
    277   0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
    278 };
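
// Reference-only editorial sketch (not part of the original source; the
// helper names are hypothetical) of the per-pixel math encoded by kARGBToY,
// kARGBToU and kARGBToV together with kAddY16 and kAddUV128. ARGB pixels are
// stored in memory as B, G, R, A bytes. Negative intermediates are assumed to
// shift arithmetically, matching psraw.
static __inline uint8 ARGBToYReference(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}

static __inline uint8 ARGBToUReference(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}

static __inline uint8 ARGBToVReference(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}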
    279 
    280 // Shuffle table for converting RGB24 to ARGB.
    281 static const uvec8 kShuffleMaskRGB24ToARGB = {
    282   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
    283 };
    284 
    285 // Shuffle table for converting RAW to ARGB.
    286 static const uvec8 kShuffleMaskRAWToARGB = {
    287   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
    288 };
    289 
    290 // Shuffle table for converting ARGB to RGB24.
    291 static const uvec8 kShuffleMaskARGBToRGB24 = {
    292   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
    293 };
    294 
    295 // Shuffle table for converting ARGB to RAW.
    296 static const uvec8 kShuffleMaskARGBToRAW = {
    297   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
    298 };
    299 
    300 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
    301 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    302   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
    303 };
    304 
     305 // Shuffle table for converting ARGBToRAW for I422ToRAW.  First 8 + next 4
    306 static const uvec8 kShuffleMaskARGBToRAW_0 = {
    307   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
    308 };
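
// Reference-only editorial sketch of how the pshufb shuffle tables above are
// applied (ShuffleReference is a hypothetical name, not part of the original
// source): output byte i is src[mask[i] & 15], and a mask byte with the high
// bit set (128u in the tables) produces zero.
static __inline void ShuffleReference(const uint8* src, const uint8* mask,
                                      uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (uint8)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
  }
}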
    309 
    310 // Duplicates gray value 3 times and fills in alpha opaque.
    311 __declspec(naked) __declspec(align(16))
    312 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
    313   __asm {
    314     mov        eax, [esp + 4]        // src_y
    315     mov        edx, [esp + 8]        // dst_argb
    316     mov        ecx, [esp + 12]       // pix
    317     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
    318     pslld      xmm5, 24
    319 
    320     align      4
    321   convertloop:
    322     movq       xmm0, qword ptr [eax]
    323     lea        eax,  [eax + 8]
    324     punpcklbw  xmm0, xmm0
    325     movdqa     xmm1, xmm0
    326     punpcklwd  xmm0, xmm0
    327     punpckhwd  xmm1, xmm1
    328     por        xmm0, xmm5
    329     por        xmm1, xmm5
    330     movdqa     [edx], xmm0
    331     movdqa     [edx + 16], xmm1
    332     lea        edx, [edx + 32]
    333     sub        ecx, 8
    334     jg         convertloop
    335     ret
    336   }
    337 }
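
// Reference-only editorial sketch of the gray expansion above: one Y byte
// becomes one opaque ARGB pixel (I400PixelReference is a hypothetical name,
// not part of the original source).
static __inline uint32 I400PixelReference(uint8 y) {
  return 0xff000000u | ((uint32)y << 16) | ((uint32)y << 8) | y;
}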
    338 
    339 __declspec(naked) __declspec(align(16))
    340 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
    341                                   int pix) {
    342   __asm {
    343     mov        eax, [esp + 4]        // src_y
    344     mov        edx, [esp + 8]        // dst_argb
    345     mov        ecx, [esp + 12]       // pix
    346     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
    347     pslld      xmm5, 24
    348 
    349     align      4
    350   convertloop:
    351     movq       xmm0, qword ptr [eax]
    352     lea        eax,  [eax + 8]
    353     punpcklbw  xmm0, xmm0
    354     movdqa     xmm1, xmm0
    355     punpcklwd  xmm0, xmm0
    356     punpckhwd  xmm1, xmm1
    357     por        xmm0, xmm5
    358     por        xmm1, xmm5
    359     movdqu     [edx], xmm0
    360     movdqu     [edx + 16], xmm1
    361     lea        edx, [edx + 32]
    362     sub        ecx, 8
    363     jg         convertloop
    364     ret
    365   }
    366 }
    367 
    368 __declspec(naked) __declspec(align(16))
    369 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
    370   __asm {
    371     mov       eax, [esp + 4]   // src_rgb24
    372     mov       edx, [esp + 8]   // dst_argb
    373     mov       ecx, [esp + 12]  // pix
    374     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    375     pslld     xmm5, 24
    376     movdqa    xmm4, kShuffleMaskRGB24ToARGB
    377 
    378     align      4
    379  convertloop:
    380     movdqu    xmm0, [eax]
    381     movdqu    xmm1, [eax + 16]
    382     movdqu    xmm3, [eax + 32]
    383     lea       eax, [eax + 48]
    384     movdqa    xmm2, xmm3
     385     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
    386     pshufb    xmm2, xmm4
    387     por       xmm2, xmm5
     388     palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
    389     pshufb    xmm0, xmm4
    390     movdqa    [edx + 32], xmm2
    391     por       xmm0, xmm5
    392     pshufb    xmm1, xmm4
    393     movdqa    [edx], xmm0
    394     por       xmm1, xmm5
    395     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    396     pshufb    xmm3, xmm4
    397     movdqa    [edx + 16], xmm1
    398     por       xmm3, xmm5
    399     sub       ecx, 16
    400     movdqa    [edx + 48], xmm3
    401     lea       edx, [edx + 64]
    402     jg        convertloop
    403     ret
    404   }
    405 }
    406 
    407 __declspec(naked) __declspec(align(16))
    408 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
    409                         int pix) {
    410   __asm {
    411     mov       eax, [esp + 4]   // src_raw
    412     mov       edx, [esp + 8]   // dst_argb
    413     mov       ecx, [esp + 12]  // pix
    414     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    415     pslld     xmm5, 24
    416     movdqa    xmm4, kShuffleMaskRAWToARGB
    417 
    418     align      4
    419  convertloop:
    420     movdqu    xmm0, [eax]
    421     movdqu    xmm1, [eax + 16]
    422     movdqu    xmm3, [eax + 32]
    423     lea       eax, [eax + 48]
    424     movdqa    xmm2, xmm3
     425     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
    426     pshufb    xmm2, xmm4
    427     por       xmm2, xmm5
     428     palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
    429     pshufb    xmm0, xmm4
    430     movdqa    [edx + 32], xmm2
    431     por       xmm0, xmm5
    432     pshufb    xmm1, xmm4
    433     movdqa    [edx], xmm0
    434     por       xmm1, xmm5
    435     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    436     pshufb    xmm3, xmm4
    437     movdqa    [edx + 16], xmm1
    438     por       xmm3, xmm5
    439     sub       ecx, 16
    440     movdqa    [edx + 48], xmm3
    441     lea       edx, [edx + 64]
    442     jg        convertloop
    443     ret
    444   }
    445 }
    446 
    447 // pmul method to replicate bits.
    448 // Math to replicate bits:
    449 // (v << 8) | (v << 3)
    450 // v * 256 + v * 8
    451 // v * (256 + 8)
    452 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
    453 // 20 instructions.
    454 __declspec(naked) __declspec(align(16))
    455 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
    456                           int pix) {
    457   __asm {
    458     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    459     movd      xmm5, eax
    460     pshufd    xmm5, xmm5, 0
    461     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    462     movd      xmm6, eax
    463     pshufd    xmm6, xmm6, 0
    464     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    465     psllw     xmm3, 11
    466     pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    467     psllw     xmm4, 10
    468     psrlw     xmm4, 5
    469     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    470     psllw     xmm7, 8
    471 
    472     mov       eax, [esp + 4]   // src_rgb565
    473     mov       edx, [esp + 8]   // dst_argb
    474     mov       ecx, [esp + 12]  // pix
    475     sub       edx, eax
    476     sub       edx, eax
    477 
    478     align      4
    479  convertloop:
    480     movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
    481     movdqa    xmm1, xmm0
    482     movdqa    xmm2, xmm0
    483     pand      xmm1, xmm3    // R in upper 5 bits
    484     psllw     xmm2, 11      // B in upper 5 bits
    485     pmulhuw   xmm1, xmm5    // * (256 + 8)
    486     pmulhuw   xmm2, xmm5    // * (256 + 8)
    487     psllw     xmm1, 8
    488     por       xmm1, xmm2    // RB
    489     pand      xmm0, xmm4    // G in middle 6 bits
    490     pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
    491     por       xmm0, xmm7    // AG
    492     movdqa    xmm2, xmm1
    493     punpcklbw xmm1, xmm0
    494     punpckhbw xmm2, xmm0
    495     movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    496     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    497     lea       eax, [eax + 16]
    498     sub       ecx, 8
    499     jg        convertloop
    500     ret
    501   }
    502 }
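
// Reference-only editorial sketch of the pmul bit replication above for one
// RGB565 pixel (RGB565PixelReference is a hypothetical name, not part of the
// original source). b5 * (256 + 8) is (b5 << 8) | (b5 << 3); taking the top
// 8 bits of that 13 bit product gives (b5 << 3) | (b5 >> 2), which is what
// pmulhuw extracts when the channel is kept in the upper bits of a word.
static __inline void RGB565PixelReference(uint16 pixel565,
                                          uint8* b, uint8* g, uint8* r) {
  int b5 = pixel565 & 0x1f;          // 5 bits of blue
  int g6 = (pixel565 >> 5) & 0x3f;   // 6 bits of green
  int r5 = (pixel565 >> 11) & 0x1f;  // 5 bits of red
  *b = (uint8)((b5 << 3) | (b5 >> 2));
  *g = (uint8)((g6 << 2) | (g6 >> 4));  // 6 bit case: v * (256 + 4)
  *r = (uint8)((r5 << 3) | (r5 >> 2));
}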
    503 
    504 // 24 instructions
    505 __declspec(naked) __declspec(align(16))
    506 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
    507                             int pix) {
    508   __asm {
    509     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    510     movd      xmm5, eax
    511     pshufd    xmm5, xmm5, 0
    512     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    513     movd      xmm6, eax
    514     pshufd    xmm6, xmm6, 0
    515     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    516     psllw     xmm3, 11
    517     movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    518     psrlw     xmm4, 6
    519     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    520     psllw     xmm7, 8
    521 
    522     mov       eax, [esp + 4]   // src_argb1555
    523     mov       edx, [esp + 8]   // dst_argb
    524     mov       ecx, [esp + 12]  // pix
    525     sub       edx, eax
    526     sub       edx, eax
    527 
    528     align      4
    529  convertloop:
    530     movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
    531     movdqa    xmm1, xmm0
    532     movdqa    xmm2, xmm0
    533     psllw     xmm1, 1       // R in upper 5 bits
    534     psllw     xmm2, 11      // B in upper 5 bits
    535     pand      xmm1, xmm3
    536     pmulhuw   xmm2, xmm5    // * (256 + 8)
    537     pmulhuw   xmm1, xmm5    // * (256 + 8)
    538     psllw     xmm1, 8
    539     por       xmm1, xmm2    // RB
    540     movdqa    xmm2, xmm0
    541     pand      xmm0, xmm4    // G in middle 5 bits
    542     psraw     xmm2, 8       // A
    543     pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
    544     pand      xmm2, xmm7
    545     por       xmm0, xmm2    // AG
    546     movdqa    xmm2, xmm1
    547     punpcklbw xmm1, xmm0
    548     punpckhbw xmm2, xmm0
    549     movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    550     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    551     lea       eax, [eax + 16]
    552     sub       ecx, 8
    553     jg        convertloop
    554     ret
    555   }
    556 }
    557 
    558 // 18 instructions.
    559 __declspec(naked) __declspec(align(16))
    560 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
    561                             int pix) {
    562   __asm {
    563     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    564     movd      xmm4, eax
    565     pshufd    xmm4, xmm4, 0
    566     movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    567     pslld     xmm5, 4
    568     mov       eax, [esp + 4]   // src_argb4444
    569     mov       edx, [esp + 8]   // dst_argb
    570     mov       ecx, [esp + 12]  // pix
    571     sub       edx, eax
    572     sub       edx, eax
    573 
    574     align      4
    575  convertloop:
    576     movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
    577     movdqa    xmm2, xmm0
    578     pand      xmm0, xmm4    // mask low nibbles
    579     pand      xmm2, xmm5    // mask high nibbles
    580     movdqa    xmm1, xmm0
    581     movdqa    xmm3, xmm2
    582     psllw     xmm1, 4
    583     psrlw     xmm3, 4
    584     por       xmm0, xmm1
    585     por       xmm2, xmm3
    586     movdqa    xmm1, xmm0
    587     punpcklbw xmm0, xmm2
    588     punpckhbw xmm1, xmm2
    589     movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    590     movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    591     lea       eax, [eax + 16]
    592     sub       ecx, 8
    593     jg        convertloop
    594     ret
    595   }
    596 }
    597 
    598 __declspec(naked) __declspec(align(16))
    599 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
    600   __asm {
    601     mov       eax, [esp + 4]   // src_argb
    602     mov       edx, [esp + 8]   // dst_rgb
    603     mov       ecx, [esp + 12]  // pix
    604     movdqa    xmm6, kShuffleMaskARGBToRGB24
    605 
    606     align      4
    607  convertloop:
    608     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
    609     movdqu    xmm1, [eax + 16]
    610     movdqu    xmm2, [eax + 32]
    611     movdqu    xmm3, [eax + 48]
    612     lea       eax, [eax + 64]
    613     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    614     pshufb    xmm1, xmm6
    615     pshufb    xmm2, xmm6
    616     pshufb    xmm3, xmm6
    617     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    618     psrldq    xmm1, 4      // 8 bytes from 1
    619     pslldq    xmm4, 12     // 4 bytes from 1 for 0
    620     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    621     por       xmm0, xmm4   // 4 bytes from 1 for 0
    622     pslldq    xmm5, 8      // 8 bytes from 2 for 1
    623     movdqu    [edx], xmm0  // store 0
    624     por       xmm1, xmm5   // 8 bytes from 2 for 1
    625     psrldq    xmm2, 8      // 4 bytes from 2
    626     pslldq    xmm3, 4      // 12 bytes from 3 for 2
    627     por       xmm2, xmm3   // 12 bytes from 3 for 2
    628     movdqu    [edx + 16], xmm1   // store 1
    629     movdqu    [edx + 32], xmm2   // store 2
    630     lea       edx, [edx + 48]
    631     sub       ecx, 16
    632     jg        convertloop
    633     ret
    634   }
    635 }
    636 
    637 __declspec(naked) __declspec(align(16))
    638 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
    639   __asm {
    640     mov       eax, [esp + 4]   // src_argb
    641     mov       edx, [esp + 8]   // dst_rgb
    642     mov       ecx, [esp + 12]  // pix
    643     movdqa    xmm6, kShuffleMaskARGBToRAW
    644 
    645     align      4
    646  convertloop:
    647     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
    648     movdqu    xmm1, [eax + 16]
    649     movdqu    xmm2, [eax + 32]
    650     movdqu    xmm3, [eax + 48]
    651     lea       eax, [eax + 64]
    652     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    653     pshufb    xmm1, xmm6
    654     pshufb    xmm2, xmm6
    655     pshufb    xmm3, xmm6
    656     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    657     psrldq    xmm1, 4      // 8 bytes from 1
    658     pslldq    xmm4, 12     // 4 bytes from 1 for 0
    659     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    660     por       xmm0, xmm4   // 4 bytes from 1 for 0
    661     pslldq    xmm5, 8      // 8 bytes from 2 for 1
    662     movdqu    [edx], xmm0  // store 0
    663     por       xmm1, xmm5   // 8 bytes from 2 for 1
    664     psrldq    xmm2, 8      // 4 bytes from 2
    665     pslldq    xmm3, 4      // 12 bytes from 3 for 2
    666     por       xmm2, xmm3   // 12 bytes from 3 for 2
    667     movdqu    [edx + 16], xmm1   // store 1
    668     movdqu    [edx + 32], xmm2   // store 2
    669     lea       edx, [edx + 48]
    670     sub       ecx, 16
    671     jg        convertloop
    672     ret
    673   }
    674 }
    675 
    676 __declspec(naked) __declspec(align(16))
    677 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
    678   __asm {
    679     mov       eax, [esp + 4]   // src_argb
    680     mov       edx, [esp + 8]   // dst_rgb
    681     mov       ecx, [esp + 12]  // pix
    682     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    683     psrld     xmm3, 27
    684     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    685     psrld     xmm4, 26
    686     pslld     xmm4, 5
    687     pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    688     pslld     xmm5, 11
    689 
    690     align      4
    691  convertloop:
    692     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    693     movdqa    xmm1, xmm0    // B
    694     movdqa    xmm2, xmm0    // G
    695     pslld     xmm0, 8       // R
    696     psrld     xmm1, 3       // B
    697     psrld     xmm2, 5       // G
    698     psrad     xmm0, 16      // R
    699     pand      xmm1, xmm3    // B
    700     pand      xmm2, xmm4    // G
    701     pand      xmm0, xmm5    // R
    702     por       xmm1, xmm2    // BG
    703     por       xmm0, xmm1    // BGR
    704     packssdw  xmm0, xmm0
    705     lea       eax, [eax + 16]
    706     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    707     lea       edx, [edx + 8]
    708     sub       ecx, 4
    709     jg        convertloop
    710     ret
    711   }
    712 }
    713 
    714 // TODO(fbarchard): Improve sign extension/packing.
    715 __declspec(naked) __declspec(align(16))
    716 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
    717   __asm {
    718     mov       eax, [esp + 4]   // src_argb
    719     mov       edx, [esp + 8]   // dst_rgb
    720     mov       ecx, [esp + 12]  // pix
    721     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    722     psrld     xmm4, 27
    723     movdqa    xmm5, xmm4       // generate mask 0x000003e0
    724     pslld     xmm5, 5
    725     movdqa    xmm6, xmm4       // generate mask 0x00007c00
    726     pslld     xmm6, 10
    727     pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    728     pslld     xmm7, 15
    729 
    730     align      4
    731  convertloop:
    732     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    733     movdqa    xmm1, xmm0    // B
    734     movdqa    xmm2, xmm0    // G
    735     movdqa    xmm3, xmm0    // R
    736     psrad     xmm0, 16      // A
    737     psrld     xmm1, 3       // B
    738     psrld     xmm2, 6       // G
    739     psrld     xmm3, 9       // R
    740     pand      xmm0, xmm7    // A
    741     pand      xmm1, xmm4    // B
    742     pand      xmm2, xmm5    // G
    743     pand      xmm3, xmm6    // R
    744     por       xmm0, xmm1    // BA
    745     por       xmm2, xmm3    // GR
    746     por       xmm0, xmm2    // BGRA
    747     packssdw  xmm0, xmm0
    748     lea       eax, [eax + 16]
    749     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    750     lea       edx, [edx + 8]
    751     sub       ecx, 4
    752     jg        convertloop
    753     ret
    754   }
    755 }
    756 
    757 __declspec(naked) __declspec(align(16))
    758 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
    759   __asm {
    760     mov       eax, [esp + 4]   // src_argb
    761     mov       edx, [esp + 8]   // dst_rgb
    762     mov       ecx, [esp + 12]  // pix
    763     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    764     psllw     xmm4, 12
    765     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    766     psrlw     xmm3, 8
    767 
    768     align      4
    769  convertloop:
    770     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    771     movdqa    xmm1, xmm0
    772     pand      xmm0, xmm3    // low nibble
    773     pand      xmm1, xmm4    // high nibble
     774     psrld     xmm0, 4
     775     psrld     xmm1, 8
    776     por       xmm0, xmm1
    777     packuswb  xmm0, xmm0
    778     lea       eax, [eax + 16]
    779     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    780     lea       edx, [edx + 8]
    781     sub       ecx, 4
    782     jg        convertloop
    783     ret
    784   }
    785 }
    786 
    787 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
    788 __declspec(naked) __declspec(align(16))
    789 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    790   __asm {
    791     mov        eax, [esp + 4]   /* src_argb */
    792     mov        edx, [esp + 8]   /* dst_y */
    793     mov        ecx, [esp + 12]  /* pix */
    794     movdqa     xmm5, kAddY16
    795     movdqa     xmm4, kARGBToY
    796 
    797     align      4
    798  convertloop:
    799     movdqa     xmm0, [eax]
    800     movdqa     xmm1, [eax + 16]
    801     movdqa     xmm2, [eax + 32]
    802     movdqa     xmm3, [eax + 48]
    803     pmaddubsw  xmm0, xmm4
    804     pmaddubsw  xmm1, xmm4
    805     pmaddubsw  xmm2, xmm4
    806     pmaddubsw  xmm3, xmm4
    807     lea        eax, [eax + 64]
    808     phaddw     xmm0, xmm1
    809     phaddw     xmm2, xmm3
    810     psrlw      xmm0, 7
    811     psrlw      xmm2, 7
    812     packuswb   xmm0, xmm2
    813     paddb      xmm0, xmm5
    814     sub        ecx, 16
    815     movdqa     [edx], xmm0
    816     lea        edx, [edx + 16]
    817     jg         convertloop
    818     ret
    819   }
    820 }
    821 
    822 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
    823 __declspec(naked) __declspec(align(16))
    824 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    825   __asm {
    826     mov        eax, [esp + 4]   /* src_argb */
    827     mov        edx, [esp + 8]   /* dst_y */
    828     mov        ecx, [esp + 12]  /* pix */
    829     movdqa     xmm4, kARGBToYJ
    830     movdqa     xmm5, kAddYJ64
    831 
    832     align      4
    833  convertloop:
    834     movdqa     xmm0, [eax]
    835     movdqa     xmm1, [eax + 16]
    836     movdqa     xmm2, [eax + 32]
    837     movdqa     xmm3, [eax + 48]
    838     pmaddubsw  xmm0, xmm4
    839     pmaddubsw  xmm1, xmm4
    840     pmaddubsw  xmm2, xmm4
    841     pmaddubsw  xmm3, xmm4
    842     lea        eax, [eax + 64]
    843     phaddw     xmm0, xmm1
    844     phaddw     xmm2, xmm3
    845     paddw      xmm0, xmm5  // Add .5 for rounding.
    846     paddw      xmm2, xmm5
    847     psrlw      xmm0, 7
    848     psrlw      xmm2, 7
    849     packuswb   xmm0, xmm2
    850     sub        ecx, 16
    851     movdqa     [edx], xmm0
    852     lea        edx, [edx + 16]
    853     jg         convertloop
    854     ret
    855   }
    856 }
    857 
    858 #ifdef HAS_ARGBTOYROW_AVX2
    859 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
    860 __declspec(naked) __declspec(align(32))
    861 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
    862   __asm {
    863     mov        eax, [esp + 4]   /* src_argb */
    864     mov        edx, [esp + 8]   /* dst_y */
    865     mov        ecx, [esp + 12]  /* pix */
    866     vbroadcastf128 ymm4, kARGBToY
    867     vbroadcastf128 ymm5, kAddY16
    868     vmovdqa    ymm6, kPermdARGBToY_AVX
    869 
    870     align      4
    871  convertloop:
    872     vmovdqu    ymm0, [eax]
    873     vmovdqu    ymm1, [eax + 32]
    874     vmovdqu    ymm2, [eax + 64]
    875     vmovdqu    ymm3, [eax + 96]
    876     vpmaddubsw ymm0, ymm0, ymm4
    877     vpmaddubsw ymm1, ymm1, ymm4
    878     vpmaddubsw ymm2, ymm2, ymm4
    879     vpmaddubsw ymm3, ymm3, ymm4
    880     lea        eax, [eax + 128]
    881     vphaddw    ymm0, ymm0, ymm1  // mutates.
    882     vphaddw    ymm2, ymm2, ymm3
    883     vpsrlw     ymm0, ymm0, 7
    884     vpsrlw     ymm2, ymm2, 7
    885     vpackuswb  ymm0, ymm0, ymm2  // mutates.
    886     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    887     vpaddb     ymm0, ymm0, ymm5
    888     sub        ecx, 32
    889     vmovdqu    [edx], ymm0
    890     lea        edx, [edx + 32]
    891     jg         convertloop
    892     vzeroupper
    893     ret
    894   }
    895 }
    896 #endif  //  HAS_ARGBTOYROW_AVX2
    897 
    898 #ifdef HAS_ARGBTOYROW_AVX2
    899 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
    900 __declspec(naked) __declspec(align(32))
    901 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
    902   __asm {
    903     mov        eax, [esp + 4]   /* src_argb */
    904     mov        edx, [esp + 8]   /* dst_y */
    905     mov        ecx, [esp + 12]  /* pix */
    906     vbroadcastf128 ymm4, kARGBToYJ
    907     vbroadcastf128 ymm5, kAddYJ64
    908     vmovdqa    ymm6, kPermdARGBToY_AVX
    909 
    910     align      4
    911  convertloop:
    912     vmovdqu    ymm0, [eax]
    913     vmovdqu    ymm1, [eax + 32]
    914     vmovdqu    ymm2, [eax + 64]
    915     vmovdqu    ymm3, [eax + 96]
    916     vpmaddubsw ymm0, ymm0, ymm4
    917     vpmaddubsw ymm1, ymm1, ymm4
    918     vpmaddubsw ymm2, ymm2, ymm4
    919     vpmaddubsw ymm3, ymm3, ymm4
    920     lea        eax, [eax + 128]
    921     vphaddw    ymm0, ymm0, ymm1  // mutates.
    922     vphaddw    ymm2, ymm2, ymm3
    923     vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    924     vpaddw     ymm2, ymm2, ymm5
    925     vpsrlw     ymm0, ymm0, 7
    926     vpsrlw     ymm2, ymm2, 7
    927     vpackuswb  ymm0, ymm0, ymm2  // mutates.
    928     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    929     sub        ecx, 32
    930     vmovdqu    [edx], ymm0
    931     lea        edx, [edx + 32]
    932     jg         convertloop
    933 
    934     vzeroupper
    935     ret
    936   }
    937 }
     938 #endif  //  HAS_ARGBTOYROW_AVX2
    939 
    940 __declspec(naked) __declspec(align(16))
    941 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    942   __asm {
    943     mov        eax, [esp + 4]   /* src_argb */
    944     mov        edx, [esp + 8]   /* dst_y */
    945     mov        ecx, [esp + 12]  /* pix */
    946     movdqa     xmm5, kAddY16
    947     movdqa     xmm4, kARGBToY
    948 
    949     align      4
    950  convertloop:
    951     movdqu     xmm0, [eax]
    952     movdqu     xmm1, [eax + 16]
    953     movdqu     xmm2, [eax + 32]
    954     movdqu     xmm3, [eax + 48]
    955     pmaddubsw  xmm0, xmm4
    956     pmaddubsw  xmm1, xmm4
    957     pmaddubsw  xmm2, xmm4
    958     pmaddubsw  xmm3, xmm4
    959     lea        eax, [eax + 64]
    960     phaddw     xmm0, xmm1
    961     phaddw     xmm2, xmm3
    962     psrlw      xmm0, 7
    963     psrlw      xmm2, 7
    964     packuswb   xmm0, xmm2
    965     paddb      xmm0, xmm5
    966     sub        ecx, 16
    967     movdqu     [edx], xmm0
    968     lea        edx, [edx + 16]
    969     jg         convertloop
    970     ret
    971   }
    972 }
    973 
    974 __declspec(naked) __declspec(align(16))
    975 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    976   __asm {
    977     mov        eax, [esp + 4]   /* src_argb */
    978     mov        edx, [esp + 8]   /* dst_y */
    979     mov        ecx, [esp + 12]  /* pix */
    980     movdqa     xmm4, kARGBToYJ
    981     movdqa     xmm5, kAddYJ64
    982 
    983     align      4
    984  convertloop:
    985     movdqu     xmm0, [eax]
    986     movdqu     xmm1, [eax + 16]
    987     movdqu     xmm2, [eax + 32]
    988     movdqu     xmm3, [eax + 48]
    989     pmaddubsw  xmm0, xmm4
    990     pmaddubsw  xmm1, xmm4
    991     pmaddubsw  xmm2, xmm4
    992     pmaddubsw  xmm3, xmm4
    993     lea        eax, [eax + 64]
    994     phaddw     xmm0, xmm1
    995     phaddw     xmm2, xmm3
    996     paddw      xmm0, xmm5
    997     paddw      xmm2, xmm5
    998     psrlw      xmm0, 7
    999     psrlw      xmm2, 7
   1000     packuswb   xmm0, xmm2
   1001     sub        ecx, 16
   1002     movdqu     [edx], xmm0
   1003     lea        edx, [edx + 16]
   1004     jg         convertloop
   1005     ret
   1006   }
   1007 }
   1008 
   1009 __declspec(naked) __declspec(align(16))
   1010 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1011   __asm {
   1012     mov        eax, [esp + 4]   /* src_argb */
   1013     mov        edx, [esp + 8]   /* dst_y */
   1014     mov        ecx, [esp + 12]  /* pix */
   1015     movdqa     xmm5, kAddY16
   1016     movdqa     xmm4, kBGRAToY
   1017 
   1018     align      4
   1019  convertloop:
   1020     movdqa     xmm0, [eax]
   1021     movdqa     xmm1, [eax + 16]
   1022     movdqa     xmm2, [eax + 32]
   1023     movdqa     xmm3, [eax + 48]
   1024     pmaddubsw  xmm0, xmm4
   1025     pmaddubsw  xmm1, xmm4
   1026     pmaddubsw  xmm2, xmm4
   1027     pmaddubsw  xmm3, xmm4
   1028     lea        eax, [eax + 64]
   1029     phaddw     xmm0, xmm1
   1030     phaddw     xmm2, xmm3
   1031     psrlw      xmm0, 7
   1032     psrlw      xmm2, 7
   1033     packuswb   xmm0, xmm2
   1034     paddb      xmm0, xmm5
   1035     sub        ecx, 16
   1036     movdqa     [edx], xmm0
   1037     lea        edx, [edx + 16]
   1038     jg         convertloop
   1039     ret
   1040   }
   1041 }
   1042 
   1043 __declspec(naked) __declspec(align(16))
   1044 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1045   __asm {
   1046     mov        eax, [esp + 4]   /* src_argb */
   1047     mov        edx, [esp + 8]   /* dst_y */
   1048     mov        ecx, [esp + 12]  /* pix */
   1049     movdqa     xmm5, kAddY16
   1050     movdqa     xmm4, kBGRAToY
   1051 
   1052     align      4
   1053  convertloop:
   1054     movdqu     xmm0, [eax]
   1055     movdqu     xmm1, [eax + 16]
   1056     movdqu     xmm2, [eax + 32]
   1057     movdqu     xmm3, [eax + 48]
   1058     pmaddubsw  xmm0, xmm4
   1059     pmaddubsw  xmm1, xmm4
   1060     pmaddubsw  xmm2, xmm4
   1061     pmaddubsw  xmm3, xmm4
   1062     lea        eax, [eax + 64]
   1063     phaddw     xmm0, xmm1
   1064     phaddw     xmm2, xmm3
   1065     psrlw      xmm0, 7
   1066     psrlw      xmm2, 7
   1067     packuswb   xmm0, xmm2
   1068     paddb      xmm0, xmm5
   1069     sub        ecx, 16
   1070     movdqu     [edx], xmm0
   1071     lea        edx, [edx + 16]
   1072     jg         convertloop
   1073     ret
   1074   }
   1075 }
   1076 
   1077 __declspec(naked) __declspec(align(16))
   1078 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1079   __asm {
   1080     mov        eax, [esp + 4]   /* src_argb */
   1081     mov        edx, [esp + 8]   /* dst_y */
   1082     mov        ecx, [esp + 12]  /* pix */
   1083     movdqa     xmm5, kAddY16
   1084     movdqa     xmm4, kABGRToY
   1085 
   1086     align      4
   1087  convertloop:
   1088     movdqa     xmm0, [eax]
   1089     movdqa     xmm1, [eax + 16]
   1090     movdqa     xmm2, [eax + 32]
   1091     movdqa     xmm3, [eax + 48]
   1092     pmaddubsw  xmm0, xmm4
   1093     pmaddubsw  xmm1, xmm4
   1094     pmaddubsw  xmm2, xmm4
   1095     pmaddubsw  xmm3, xmm4
   1096     lea        eax, [eax + 64]
   1097     phaddw     xmm0, xmm1
   1098     phaddw     xmm2, xmm3
   1099     psrlw      xmm0, 7
   1100     psrlw      xmm2, 7
   1101     packuswb   xmm0, xmm2
   1102     paddb      xmm0, xmm5
   1103     sub        ecx, 16
   1104     movdqa     [edx], xmm0
   1105     lea        edx, [edx + 16]
   1106     jg         convertloop
   1107     ret
   1108   }
   1109 }
   1110 
   1111 __declspec(naked) __declspec(align(16))
   1112 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1113   __asm {
   1114     mov        eax, [esp + 4]   /* src_argb */
   1115     mov        edx, [esp + 8]   /* dst_y */
   1116     mov        ecx, [esp + 12]  /* pix */
   1117     movdqa     xmm5, kAddY16
   1118     movdqa     xmm4, kABGRToY
   1119 
   1120     align      4
   1121  convertloop:
   1122     movdqu     xmm0, [eax]
   1123     movdqu     xmm1, [eax + 16]
   1124     movdqu     xmm2, [eax + 32]
   1125     movdqu     xmm3, [eax + 48]
   1126     pmaddubsw  xmm0, xmm4
   1127     pmaddubsw  xmm1, xmm4
   1128     pmaddubsw  xmm2, xmm4
   1129     pmaddubsw  xmm3, xmm4
   1130     lea        eax, [eax + 64]
   1131     phaddw     xmm0, xmm1
   1132     phaddw     xmm2, xmm3
   1133     psrlw      xmm0, 7
   1134     psrlw      xmm2, 7
   1135     packuswb   xmm0, xmm2
   1136     paddb      xmm0, xmm5
   1137     sub        ecx, 16
   1138     movdqu     [edx], xmm0
   1139     lea        edx, [edx + 16]
   1140     jg         convertloop
   1141     ret
   1142   }
   1143 }
   1144 
   1145 __declspec(naked) __declspec(align(16))
   1146 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1147   __asm {
   1148     mov        eax, [esp + 4]   /* src_argb */
   1149     mov        edx, [esp + 8]   /* dst_y */
   1150     mov        ecx, [esp + 12]  /* pix */
   1151     movdqa     xmm5, kAddY16
   1152     movdqa     xmm4, kRGBAToY
   1153 
   1154     align      4
   1155  convertloop:
   1156     movdqa     xmm0, [eax]
   1157     movdqa     xmm1, [eax + 16]
   1158     movdqa     xmm2, [eax + 32]
   1159     movdqa     xmm3, [eax + 48]
   1160     pmaddubsw  xmm0, xmm4
   1161     pmaddubsw  xmm1, xmm4
   1162     pmaddubsw  xmm2, xmm4
   1163     pmaddubsw  xmm3, xmm4
   1164     lea        eax, [eax + 64]
   1165     phaddw     xmm0, xmm1
   1166     phaddw     xmm2, xmm3
   1167     psrlw      xmm0, 7
   1168     psrlw      xmm2, 7
   1169     packuswb   xmm0, xmm2
   1170     paddb      xmm0, xmm5
   1171     sub        ecx, 16
   1172     movdqa     [edx], xmm0
   1173     lea        edx, [edx + 16]
   1174     jg         convertloop
   1175     ret
   1176   }
   1177 }
   1178 
   1179 __declspec(naked) __declspec(align(16))
   1180 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1181   __asm {
   1182     mov        eax, [esp + 4]   /* src_argb */
   1183     mov        edx, [esp + 8]   /* dst_y */
   1184     mov        ecx, [esp + 12]  /* pix */
   1185     movdqa     xmm5, kAddY16
   1186     movdqa     xmm4, kRGBAToY
   1187 
   1188     align      4
   1189  convertloop:
   1190     movdqu     xmm0, [eax]
   1191     movdqu     xmm1, [eax + 16]
   1192     movdqu     xmm2, [eax + 32]
   1193     movdqu     xmm3, [eax + 48]
   1194     pmaddubsw  xmm0, xmm4
   1195     pmaddubsw  xmm1, xmm4
   1196     pmaddubsw  xmm2, xmm4
   1197     pmaddubsw  xmm3, xmm4
   1198     lea        eax, [eax + 64]
   1199     phaddw     xmm0, xmm1
   1200     phaddw     xmm2, xmm3
   1201     psrlw      xmm0, 7
   1202     psrlw      xmm2, 7
   1203     packuswb   xmm0, xmm2
   1204     paddb      xmm0, xmm5
   1205     sub        ecx, 16
   1206     movdqu     [edx], xmm0
   1207     lea        edx, [edx + 16]
   1208     jg         convertloop
   1209     ret
   1210   }
   1211 }
   1212 
   1213 __declspec(naked) __declspec(align(16))
   1214 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1215                        uint8* dst_u, uint8* dst_v, int width) {
   1216   __asm {
   1217     push       esi
   1218     push       edi
   1219     mov        eax, [esp + 8 + 4]   // src_argb
   1220     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1221     mov        edx, [esp + 8 + 12]  // dst_u
   1222     mov        edi, [esp + 8 + 16]  // dst_v
   1223     mov        ecx, [esp + 8 + 20]  // pix
   1224     movdqa     xmm7, kARGBToU
   1225     movdqa     xmm6, kARGBToV
   1226     movdqa     xmm5, kAddUV128
   1227     sub        edi, edx             // stride from u to v
   1228 
   1229     align      4
   1230  convertloop:
   1231     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1232     movdqa     xmm0, [eax]
   1233     movdqa     xmm1, [eax + 16]
   1234     movdqa     xmm2, [eax + 32]
   1235     movdqa     xmm3, [eax + 48]
   1236     pavgb      xmm0, [eax + esi]
   1237     pavgb      xmm1, [eax + esi + 16]
   1238     pavgb      xmm2, [eax + esi + 32]
   1239     pavgb      xmm3, [eax + esi + 48]
   1240     lea        eax,  [eax + 64]
   1241     movdqa     xmm4, xmm0
   1242     shufps     xmm0, xmm1, 0x88
   1243     shufps     xmm4, xmm1, 0xdd
   1244     pavgb      xmm0, xmm4
   1245     movdqa     xmm4, xmm2
   1246     shufps     xmm2, xmm3, 0x88
   1247     shufps     xmm4, xmm3, 0xdd
   1248     pavgb      xmm2, xmm4
   1249 
   1250     // step 2 - convert to U and V
   1251     // from here down is very similar to Y code except
    1252     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1253     movdqa     xmm1, xmm0
   1254     movdqa     xmm3, xmm2
   1255     pmaddubsw  xmm0, xmm7  // U
   1256     pmaddubsw  xmm2, xmm7
   1257     pmaddubsw  xmm1, xmm6  // V
   1258     pmaddubsw  xmm3, xmm6
   1259     phaddw     xmm0, xmm2
   1260     phaddw     xmm1, xmm3
   1261     psraw      xmm0, 8
   1262     psraw      xmm1, 8
   1263     packsswb   xmm0, xmm1
   1264     paddb      xmm0, xmm5            // -> unsigned
   1265 
   1266     // step 3 - store 8 U and 8 V values
   1267     sub        ecx, 16
   1268     movlps     qword ptr [edx], xmm0 // U
   1269     movhps     qword ptr [edx + edi], xmm0 // V
   1270     lea        edx, [edx + 8]
   1271     jg         convertloop
   1272 
   1273     pop        edi
   1274     pop        esi
   1275     ret
   1276   }
   1277 }
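
// Reference-only editorial sketch of the three steps above for one U/V output
// pair (ARGBToUVBlockReference is a hypothetical name, not part of the
// original source): average a 2x2 block of ARGB pixels with the same rounding
// pavgb uses, then apply the fixed-point U and V coefficients.
static __inline void ARGBToUVBlockReference(const uint8* row0,
                                            const uint8* row1,
                                            uint8* u, uint8* v) {
  // row0 and row1 each point at two horizontally adjacent BGRA pixels on
  // vertically adjacent rows.
  int c[3];  // averaged B, G, R
  int i;
  for (i = 0; i < 3; ++i) {
    // pavgb vertically, then pavgb horizontally: (x + y + 1) >> 1 each time.
    int left = (row0[i] + row1[i] + 1) >> 1;
    int right = (row0[i + 4] + row1[i + 4] + 1) >> 1;
    c[i] = (left + right + 1) >> 1;
  }
  *u = (uint8)(((112 * c[0] - 74 * c[1] - 38 * c[2]) >> 8) + 128);
  *v = (uint8)(((-18 * c[0] - 94 * c[1] + 112 * c[2]) >> 8) + 128);
}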
   1278 
   1279 __declspec(naked) __declspec(align(16))
   1280 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1281                         uint8* dst_u, uint8* dst_v, int width) {
   1282   __asm {
   1283     push       esi
   1284     push       edi
   1285     mov        eax, [esp + 8 + 4]   // src_argb
   1286     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1287     mov        edx, [esp + 8 + 12]  // dst_u
   1288     mov        edi, [esp + 8 + 16]  // dst_v
   1289     mov        ecx, [esp + 8 + 20]  // pix
   1290     movdqa     xmm7, kARGBToUJ
   1291     movdqa     xmm6, kARGBToVJ
   1292     movdqa     xmm5, kAddUVJ128
   1293     sub        edi, edx             // stride from u to v
   1294 
   1295     align      4
   1296  convertloop:
   1297     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1298     movdqa     xmm0, [eax]
   1299     movdqa     xmm1, [eax + 16]
   1300     movdqa     xmm2, [eax + 32]
   1301     movdqa     xmm3, [eax + 48]
   1302     pavgb      xmm0, [eax + esi]
   1303     pavgb      xmm1, [eax + esi + 16]
   1304     pavgb      xmm2, [eax + esi + 32]
   1305     pavgb      xmm3, [eax + esi + 48]
   1306     lea        eax,  [eax + 64]
   1307     movdqa     xmm4, xmm0
   1308     shufps     xmm0, xmm1, 0x88
   1309     shufps     xmm4, xmm1, 0xdd
   1310     pavgb      xmm0, xmm4
   1311     movdqa     xmm4, xmm2
   1312     shufps     xmm2, xmm3, 0x88
   1313     shufps     xmm4, xmm3, 0xdd
   1314     pavgb      xmm2, xmm4
   1315 
   1316     // step 2 - convert to U and V
   1317     // from here down is very similar to Y code except
    1318     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1319     movdqa     xmm1, xmm0
   1320     movdqa     xmm3, xmm2
   1321     pmaddubsw  xmm0, xmm7  // U
   1322     pmaddubsw  xmm2, xmm7
   1323     pmaddubsw  xmm1, xmm6  // V
   1324     pmaddubsw  xmm3, xmm6
   1325     phaddw     xmm0, xmm2
   1326     phaddw     xmm1, xmm3
   1327     paddw      xmm0, xmm5            // +.5 rounding -> unsigned
   1328     paddw      xmm1, xmm5
   1329     psraw      xmm0, 8
   1330     psraw      xmm1, 8
   1331     packsswb   xmm0, xmm1
   1332 
   1333     // step 3 - store 8 U and 8 V values
   1334     sub        ecx, 16
   1335     movlps     qword ptr [edx], xmm0 // U
   1336     movhps     qword ptr [edx + edi], xmm0 // V
   1337     lea        edx, [edx + 8]
   1338     jg         convertloop
   1339 
   1340     pop        edi
   1341     pop        esi
   1342     ret
   1343   }
   1344 }
   1345 
   1346 #ifdef HAS_ARGBTOUVROW_AVX2
   1347 __declspec(naked) __declspec(align(32))
   1348 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
   1349                       uint8* dst_u, uint8* dst_v, int width) {
   1350   __asm {
   1351     push       esi
   1352     push       edi
   1353     mov        eax, [esp + 8 + 4]   // src_argb
   1354     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1355     mov        edx, [esp + 8 + 12]  // dst_u
   1356     mov        edi, [esp + 8 + 16]  // dst_v
   1357     mov        ecx, [esp + 8 + 20]  // pix
   1358     vbroadcastf128 ymm5, kAddUV128
   1359     vbroadcastf128 ymm6, kARGBToV
   1360     vbroadcastf128 ymm7, kARGBToU
   1361     sub        edi, edx             // stride from u to v
   1362 
   1363     align      4
   1364  convertloop:
   1365     /* step 1 - subsample 32x2 argb pixels to 16x1 */
   1366     vmovdqu    ymm0, [eax]
   1367     vmovdqu    ymm1, [eax + 32]
   1368     vmovdqu    ymm2, [eax + 64]
   1369     vmovdqu    ymm3, [eax + 96]
   1370     vpavgb     ymm0, ymm0, [eax + esi]
   1371     vpavgb     ymm1, ymm1, [eax + esi + 32]
   1372     vpavgb     ymm2, ymm2, [eax + esi + 64]
   1373     vpavgb     ymm3, ymm3, [eax + esi + 96]
   1374     lea        eax,  [eax + 128]
   1375     vshufps    ymm4, ymm0, ymm1, 0x88
   1376     vshufps    ymm0, ymm0, ymm1, 0xdd
   1377     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
   1378     vshufps    ymm4, ymm2, ymm3, 0x88
   1379     vshufps    ymm2, ymm2, ymm3, 0xdd
   1380     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
   1381 
   1382     // step 2 - convert to U and V
   1383     // from here down is very similar to Y code except
    1384     // instead of 32 different pixels, it's 16 pixels of U and 16 of V
   1385     vpmaddubsw ymm1, ymm0, ymm7  // U
   1386     vpmaddubsw ymm3, ymm2, ymm7
   1387     vpmaddubsw ymm0, ymm0, ymm6  // V
   1388     vpmaddubsw ymm2, ymm2, ymm6
   1389     vphaddw    ymm1, ymm1, ymm3  // mutates
   1390     vphaddw    ymm0, ymm0, ymm2
   1391     vpsraw     ymm1, ymm1, 8
   1392     vpsraw     ymm0, ymm0, 8
   1393     vpacksswb  ymm0, ymm1, ymm0  // mutates
   1394     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
   1395     vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
   1396     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
   1397 
   1398     // step 3 - store 16 U and 16 V values
   1399     sub         ecx, 32
   1400     vextractf128 [edx], ymm0, 0 // U
   1401     vextractf128 [edx + edi], ymm0, 1 // V
   1402     lea        edx, [edx + 16]
   1403     jg         convertloop
   1404 
   1405     pop        edi
   1406     pop        esi
   1407     vzeroupper
   1408     ret
   1409   }
   1410 }
   1411 #endif  // HAS_ARGBTOUVROW_AVX2
   1412 
   1413 __declspec(naked) __declspec(align(16))
   1414 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1415                                  uint8* dst_u, uint8* dst_v, int width) {
   1416   __asm {
   1417     push       esi
   1418     push       edi
   1419     mov        eax, [esp + 8 + 4]   // src_argb
   1420     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1421     mov        edx, [esp + 8 + 12]  // dst_u
   1422     mov        edi, [esp + 8 + 16]  // dst_v
   1423     mov        ecx, [esp + 8 + 20]  // pix
   1424     movdqa     xmm7, kARGBToU
   1425     movdqa     xmm6, kARGBToV
   1426     movdqa     xmm5, kAddUV128
   1427     sub        edi, edx             // stride from u to v
   1428 
   1429     align      4
   1430  convertloop:
   1431     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1432     movdqu     xmm0, [eax]
   1433     movdqu     xmm1, [eax + 16]
   1434     movdqu     xmm2, [eax + 32]
   1435     movdqu     xmm3, [eax + 48]
   1436     movdqu     xmm4, [eax + esi]
   1437     pavgb      xmm0, xmm4
   1438     movdqu     xmm4, [eax + esi + 16]
   1439     pavgb      xmm1, xmm4
   1440     movdqu     xmm4, [eax + esi + 32]
   1441     pavgb      xmm2, xmm4
   1442     movdqu     xmm4, [eax + esi + 48]
   1443     pavgb      xmm3, xmm4
   1444     lea        eax,  [eax + 64]
   1445     movdqa     xmm4, xmm0
   1446     shufps     xmm0, xmm1, 0x88
   1447     shufps     xmm4, xmm1, 0xdd
   1448     pavgb      xmm0, xmm4
   1449     movdqa     xmm4, xmm2
   1450     shufps     xmm2, xmm3, 0x88
   1451     shufps     xmm4, xmm3, 0xdd
   1452     pavgb      xmm2, xmm4
   1453 
   1454     // step 2 - convert to U and V
   1455     // from here down is very similar to Y code except
    1456     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1457     movdqa     xmm1, xmm0
   1458     movdqa     xmm3, xmm2
   1459     pmaddubsw  xmm0, xmm7  // U
   1460     pmaddubsw  xmm2, xmm7
   1461     pmaddubsw  xmm1, xmm6  // V
   1462     pmaddubsw  xmm3, xmm6
   1463     phaddw     xmm0, xmm2
   1464     phaddw     xmm1, xmm3
   1465     psraw      xmm0, 8
   1466     psraw      xmm1, 8
   1467     packsswb   xmm0, xmm1
   1468     paddb      xmm0, xmm5            // -> unsigned
   1469 
   1470     // step 3 - store 8 U and 8 V values
   1471     sub        ecx, 16
   1472     movlps     qword ptr [edx], xmm0 // U
   1473     movhps     qword ptr [edx + edi], xmm0 // V
   1474     lea        edx, [edx + 8]
   1475     jg         convertloop
   1476 
   1477     pop        edi
   1478     pop        esi
   1479     ret
   1480   }
   1481 }
   1482 
   1483 __declspec(naked) __declspec(align(16))
   1484 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1485                                  uint8* dst_u, uint8* dst_v, int width) {
   1486   __asm {
   1487     push       esi
   1488     push       edi
   1489     mov        eax, [esp + 8 + 4]   // src_argb
   1490     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1491     mov        edx, [esp + 8 + 12]  // dst_u
   1492     mov        edi, [esp + 8 + 16]  // dst_v
   1493     mov        ecx, [esp + 8 + 20]  // pix
   1494     movdqa     xmm7, kARGBToUJ
   1495     movdqa     xmm6, kARGBToVJ
   1496     movdqa     xmm5, kAddUVJ128
   1497     sub        edi, edx             // stride from u to v
   1498 
   1499     align      4
   1500  convertloop:
   1501     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1502     movdqu     xmm0, [eax]
   1503     movdqu     xmm1, [eax + 16]
   1504     movdqu     xmm2, [eax + 32]
   1505     movdqu     xmm3, [eax + 48]
   1506     movdqu     xmm4, [eax + esi]
   1507     pavgb      xmm0, xmm4
   1508     movdqu     xmm4, [eax + esi + 16]
   1509     pavgb      xmm1, xmm4
   1510     movdqu     xmm4, [eax + esi + 32]
   1511     pavgb      xmm2, xmm4
   1512     movdqu     xmm4, [eax + esi + 48]
   1513     pavgb      xmm3, xmm4
   1514     lea        eax,  [eax + 64]
   1515     movdqa     xmm4, xmm0
   1516     shufps     xmm0, xmm1, 0x88
   1517     shufps     xmm4, xmm1, 0xdd
   1518     pavgb      xmm0, xmm4
   1519     movdqa     xmm4, xmm2
   1520     shufps     xmm2, xmm3, 0x88
   1521     shufps     xmm4, xmm3, 0xdd
   1522     pavgb      xmm2, xmm4
   1523 
   1524     // step 2 - convert to U and V
   1525     // from here down is very similar to Y code except
    1526     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1527     movdqa     xmm1, xmm0
   1528     movdqa     xmm3, xmm2
   1529     pmaddubsw  xmm0, xmm7  // U
   1530     pmaddubsw  xmm2, xmm7
   1531     pmaddubsw  xmm1, xmm6  // V
   1532     pmaddubsw  xmm3, xmm6
   1533     phaddw     xmm0, xmm2
   1534     phaddw     xmm1, xmm3
   1535     paddw      xmm0, xmm5            // +.5 rounding -> unsigned
   1536     paddw      xmm1, xmm5
   1537     psraw      xmm0, 8
   1538     psraw      xmm1, 8
   1539     packsswb   xmm0, xmm1
   1540 
   1541     // step 3 - store 8 U and 8 V values
   1542     sub        ecx, 16
   1543     movlps     qword ptr [edx], xmm0 // U
   1544     movhps     qword ptr [edx + edi], xmm0 // V
   1545     lea        edx, [edx + 8]
   1546     jg         convertloop
   1547 
   1548     pop        edi
   1549     pop        esi
   1550     ret
   1551   }
   1552 }
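// The J ("JPEG", full-range) variant above differs from the row before it
// only in its coefficient tables (kARGBToUJ / kARGBToVJ / kAddUVJ128) and in
// where the +128 bias lands.  Roughly, reusing the placeholder names from the
// sketch above:
//
//   non-J:  u = ((b * ub + g * ug + r * ur) >> 8) + 128;   // truncate, then bias
//   J:      u = (b * ub + g * ug + r * ur + 0x8080) >> 8;  // bias first: +0.5 rounding
//
// i.e. the paddw with kAddUVJ128 before psraw folds the bias and a half-LSB
// rounding term into the 16-bit domain.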
   1553 
   1554 __declspec(naked) __declspec(align(16))
   1555 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
   1556                           uint8* dst_u, uint8* dst_v, int width) {
   1557   __asm {
   1558     push       edi
   1559     mov        eax, [esp + 4 + 4]   // src_argb
   1560     mov        edx, [esp + 4 + 8]   // dst_u
   1561     mov        edi, [esp + 4 + 12]  // dst_v
   1562     mov        ecx, [esp + 4 + 16]  // pix
   1563     movdqa     xmm7, kARGBToU
   1564     movdqa     xmm6, kARGBToV
   1565     movdqa     xmm5, kAddUV128
   1566     sub        edi, edx             // stride from u to v
   1567 
   1568     align      4
   1569  convertloop:
   1570     /* convert to U and V */
   1571     movdqa     xmm0, [eax]          // U
   1572     movdqa     xmm1, [eax + 16]
   1573     movdqa     xmm2, [eax + 32]
   1574     movdqa     xmm3, [eax + 48]
   1575     pmaddubsw  xmm0, xmm7
   1576     pmaddubsw  xmm1, xmm7
   1577     pmaddubsw  xmm2, xmm7
   1578     pmaddubsw  xmm3, xmm7
   1579     phaddw     xmm0, xmm1
   1580     phaddw     xmm2, xmm3
   1581     psraw      xmm0, 8
   1582     psraw      xmm2, 8
   1583     packsswb   xmm0, xmm2
   1584     paddb      xmm0, xmm5
   1585     sub        ecx,  16
   1586     movdqa     [edx], xmm0
   1587 
   1588     movdqa     xmm0, [eax]          // V
   1589     movdqa     xmm1, [eax + 16]
   1590     movdqa     xmm2, [eax + 32]
   1591     movdqa     xmm3, [eax + 48]
   1592     pmaddubsw  xmm0, xmm6
   1593     pmaddubsw  xmm1, xmm6
   1594     pmaddubsw  xmm2, xmm6
   1595     pmaddubsw  xmm3, xmm6
   1596     phaddw     xmm0, xmm1
   1597     phaddw     xmm2, xmm3
   1598     psraw      xmm0, 8
   1599     psraw      xmm2, 8
   1600     packsswb   xmm0, xmm2
   1601     paddb      xmm0, xmm5
   1602     lea        eax,  [eax + 64]
   1603     movdqa     [edx + edi], xmm0
   1604     lea        edx,  [edx + 16]
   1605     jg         convertloop
   1606 
   1607     pop        edi
   1608     ret
   1609   }
   1610 }
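// For 4:4:4 there is no subsampling: every ARGB pixel produces its own U and
// V sample.  Sketched with the same placeholder coefficients as above:
//
//   u[x] = (uint8)(((b[x] * ub + g[x] * ug + r[x] * ur) >> 8) + 128);
//   v[x] = (uint8)(((b[x] * vb + g[x] * vg + r[x] * vr) >> 8) + 128);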
   1611 
   1612 __declspec(naked) __declspec(align(16))
   1613 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
   1614                                     uint8* dst_u, uint8* dst_v, int width) {
   1615   __asm {
   1616     push       edi
   1617     mov        eax, [esp + 4 + 4]   // src_argb
   1618     mov        edx, [esp + 4 + 8]   // dst_u
   1619     mov        edi, [esp + 4 + 12]  // dst_v
   1620     mov        ecx, [esp + 4 + 16]  // pix
   1621     movdqa     xmm7, kARGBToU
   1622     movdqa     xmm6, kARGBToV
   1623     movdqa     xmm5, kAddUV128
   1624     sub        edi, edx             // stride from u to v
   1625 
   1626     align      4
   1627  convertloop:
   1628     /* convert to U and V */
   1629     movdqu     xmm0, [eax]          // U
   1630     movdqu     xmm1, [eax + 16]
   1631     movdqu     xmm2, [eax + 32]
   1632     movdqu     xmm3, [eax + 48]
   1633     pmaddubsw  xmm0, xmm7
   1634     pmaddubsw  xmm1, xmm7
   1635     pmaddubsw  xmm2, xmm7
   1636     pmaddubsw  xmm3, xmm7
   1637     phaddw     xmm0, xmm1
   1638     phaddw     xmm2, xmm3
   1639     psraw      xmm0, 8
   1640     psraw      xmm2, 8
   1641     packsswb   xmm0, xmm2
   1642     paddb      xmm0, xmm5
   1643     sub        ecx,  16
   1644     movdqu     [edx], xmm0
   1645 
   1646     movdqu     xmm0, [eax]          // V
   1647     movdqu     xmm1, [eax + 16]
   1648     movdqu     xmm2, [eax + 32]
   1649     movdqu     xmm3, [eax + 48]
   1650     pmaddubsw  xmm0, xmm6
   1651     pmaddubsw  xmm1, xmm6
   1652     pmaddubsw  xmm2, xmm6
   1653     pmaddubsw  xmm3, xmm6
   1654     phaddw     xmm0, xmm1
   1655     phaddw     xmm2, xmm3
   1656     psraw      xmm0, 8
   1657     psraw      xmm2, 8
   1658     packsswb   xmm0, xmm2
   1659     paddb      xmm0, xmm5
   1660     lea        eax,  [eax + 64]
   1661     movdqu     [edx + edi], xmm0
   1662     lea        edx,  [edx + 16]
   1663     jg         convertloop
   1664 
   1665     pop        edi
   1666     ret
   1667   }
   1668 }
   1669 
   1670 __declspec(naked) __declspec(align(16))
   1671 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
   1672                           uint8* dst_u, uint8* dst_v, int width) {
   1673   __asm {
   1674     push       edi
   1675     mov        eax, [esp + 4 + 4]   // src_argb
   1676     mov        edx, [esp + 4 + 8]   // dst_u
   1677     mov        edi, [esp + 4 + 12]  // dst_v
   1678     mov        ecx, [esp + 4 + 16]  // pix
   1679     movdqa     xmm7, kARGBToU
   1680     movdqa     xmm6, kARGBToV
   1681     movdqa     xmm5, kAddUV128
   1682     sub        edi, edx             // stride from u to v
   1683 
   1684     align      4
   1685  convertloop:
   1686     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1687     movdqa     xmm0, [eax]
   1688     movdqa     xmm1, [eax + 16]
   1689     movdqa     xmm2, [eax + 32]
   1690     movdqa     xmm3, [eax + 48]
   1691     lea        eax,  [eax + 64]
   1692     movdqa     xmm4, xmm0
   1693     shufps     xmm0, xmm1, 0x88
   1694     shufps     xmm4, xmm1, 0xdd
   1695     pavgb      xmm0, xmm4
   1696     movdqa     xmm4, xmm2
   1697     shufps     xmm2, xmm3, 0x88
   1698     shufps     xmm4, xmm3, 0xdd
   1699     pavgb      xmm2, xmm4
   1700 
   1701     // step 2 - convert to U and V
   1702     // from here down is very similar to Y code except
    1703     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1704     movdqa     xmm1, xmm0
   1705     movdqa     xmm3, xmm2
   1706     pmaddubsw  xmm0, xmm7  // U
   1707     pmaddubsw  xmm2, xmm7
   1708     pmaddubsw  xmm1, xmm6  // V
   1709     pmaddubsw  xmm3, xmm6
   1710     phaddw     xmm0, xmm2
   1711     phaddw     xmm1, xmm3
   1712     psraw      xmm0, 8
   1713     psraw      xmm1, 8
   1714     packsswb   xmm0, xmm1
   1715     paddb      xmm0, xmm5            // -> unsigned
   1716 
   1717     // step 3 - store 8 U and 8 V values
   1718     sub        ecx, 16
   1719     movlps     qword ptr [edx], xmm0 // U
   1720     movhps     qword ptr [edx + edi], xmm0 // V
   1721     lea        edx, [edx + 8]
   1722     jg         convertloop
   1723 
   1724     pop        edi
   1725     ret
   1726   }
   1727 }
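// The 4:2:2 rows average only horizontally: pairs of pixels on a single row,
// with no second-row pavgb as in the 4:2:0 code above.  Sketch of the
// subsample step for one output sample:
//
//   int b = (p[0] + p[4] + 1) >> 1;  // rounding average of B for pixels x, x + 1
//   int g = (p[1] + p[5] + 1) >> 1;
//   int r = (p[2] + p[6] + 1) >> 1;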
   1728 
   1729 __declspec(naked) __declspec(align(16))
   1730 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
   1731                                     uint8* dst_u, uint8* dst_v, int width) {
   1732   __asm {
   1733     push       edi
   1734     mov        eax, [esp + 4 + 4]   // src_argb
   1735     mov        edx, [esp + 4 + 8]   // dst_u
   1736     mov        edi, [esp + 4 + 12]  // dst_v
   1737     mov        ecx, [esp + 4 + 16]  // pix
   1738     movdqa     xmm7, kARGBToU
   1739     movdqa     xmm6, kARGBToV
   1740     movdqa     xmm5, kAddUV128
   1741     sub        edi, edx             // stride from u to v
   1742 
   1743     align      4
   1744  convertloop:
   1745     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1746     movdqu     xmm0, [eax]
   1747     movdqu     xmm1, [eax + 16]
   1748     movdqu     xmm2, [eax + 32]
   1749     movdqu     xmm3, [eax + 48]
   1750     lea        eax,  [eax + 64]
   1751     movdqa     xmm4, xmm0
   1752     shufps     xmm0, xmm1, 0x88
   1753     shufps     xmm4, xmm1, 0xdd
   1754     pavgb      xmm0, xmm4
   1755     movdqa     xmm4, xmm2
   1756     shufps     xmm2, xmm3, 0x88
   1757     shufps     xmm4, xmm3, 0xdd
   1758     pavgb      xmm2, xmm4
   1759 
   1760     // step 2 - convert to U and V
   1761     // from here down is very similar to Y code except
    1762     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1763     movdqa     xmm1, xmm0
   1764     movdqa     xmm3, xmm2
   1765     pmaddubsw  xmm0, xmm7  // U
   1766     pmaddubsw  xmm2, xmm7
   1767     pmaddubsw  xmm1, xmm6  // V
   1768     pmaddubsw  xmm3, xmm6
   1769     phaddw     xmm0, xmm2
   1770     phaddw     xmm1, xmm3
   1771     psraw      xmm0, 8
   1772     psraw      xmm1, 8
   1773     packsswb   xmm0, xmm1
   1774     paddb      xmm0, xmm5            // -> unsigned
   1775 
   1776     // step 3 - store 8 U and 8 V values
   1777     sub        ecx, 16
   1778     movlps     qword ptr [edx], xmm0 // U
   1779     movhps     qword ptr [edx + edi], xmm0 // V
   1780     lea        edx, [edx + 8]
   1781     jg         convertloop
   1782 
   1783     pop        edi
   1784     ret
   1785   }
   1786 }
   1787 
   1788 __declspec(naked) __declspec(align(16))
   1789 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1790                        uint8* dst_u, uint8* dst_v, int width) {
   1791   __asm {
   1792     push       esi
   1793     push       edi
   1794     mov        eax, [esp + 8 + 4]   // src_argb
   1795     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1796     mov        edx, [esp + 8 + 12]  // dst_u
   1797     mov        edi, [esp + 8 + 16]  // dst_v
   1798     mov        ecx, [esp + 8 + 20]  // pix
   1799     movdqa     xmm7, kBGRAToU
   1800     movdqa     xmm6, kBGRAToV
   1801     movdqa     xmm5, kAddUV128
   1802     sub        edi, edx             // stride from u to v
   1803 
   1804     align      4
   1805  convertloop:
   1806     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1807     movdqa     xmm0, [eax]
   1808     movdqa     xmm1, [eax + 16]
   1809     movdqa     xmm2, [eax + 32]
   1810     movdqa     xmm3, [eax + 48]
   1811     pavgb      xmm0, [eax + esi]
   1812     pavgb      xmm1, [eax + esi + 16]
   1813     pavgb      xmm2, [eax + esi + 32]
   1814     pavgb      xmm3, [eax + esi + 48]
   1815     lea        eax,  [eax + 64]
   1816     movdqa     xmm4, xmm0
   1817     shufps     xmm0, xmm1, 0x88
   1818     shufps     xmm4, xmm1, 0xdd
   1819     pavgb      xmm0, xmm4
   1820     movdqa     xmm4, xmm2
   1821     shufps     xmm2, xmm3, 0x88
   1822     shufps     xmm4, xmm3, 0xdd
   1823     pavgb      xmm2, xmm4
   1824 
   1825     // step 2 - convert to U and V
   1826     // from here down is very similar to Y code except
    1827     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1828     movdqa     xmm1, xmm0
   1829     movdqa     xmm3, xmm2
   1830     pmaddubsw  xmm0, xmm7  // U
   1831     pmaddubsw  xmm2, xmm7
   1832     pmaddubsw  xmm1, xmm6  // V
   1833     pmaddubsw  xmm3, xmm6
   1834     phaddw     xmm0, xmm2
   1835     phaddw     xmm1, xmm3
   1836     psraw      xmm0, 8
   1837     psraw      xmm1, 8
   1838     packsswb   xmm0, xmm1
   1839     paddb      xmm0, xmm5            // -> unsigned
   1840 
   1841     // step 3 - store 8 U and 8 V values
   1842     sub        ecx, 16
   1843     movlps     qword ptr [edx], xmm0 // U
   1844     movhps     qword ptr [edx + edi], xmm0 // V
   1845     lea        edx, [edx + 8]
   1846     jg         convertloop
   1847 
   1848     pop        edi
   1849     pop        esi
   1850     ret
   1851   }
   1852 }
   1853 
   1854 __declspec(naked) __declspec(align(16))
   1855 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1856                                  uint8* dst_u, uint8* dst_v, int width) {
   1857   __asm {
   1858     push       esi
   1859     push       edi
   1860     mov        eax, [esp + 8 + 4]   // src_argb
   1861     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1862     mov        edx, [esp + 8 + 12]  // dst_u
   1863     mov        edi, [esp + 8 + 16]  // dst_v
   1864     mov        ecx, [esp + 8 + 20]  // pix
   1865     movdqa     xmm7, kBGRAToU
   1866     movdqa     xmm6, kBGRAToV
   1867     movdqa     xmm5, kAddUV128
   1868     sub        edi, edx             // stride from u to v
   1869 
   1870     align      4
   1871  convertloop:
   1872     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1873     movdqu     xmm0, [eax]
   1874     movdqu     xmm1, [eax + 16]
   1875     movdqu     xmm2, [eax + 32]
   1876     movdqu     xmm3, [eax + 48]
   1877     movdqu     xmm4, [eax + esi]
   1878     pavgb      xmm0, xmm4
   1879     movdqu     xmm4, [eax + esi + 16]
   1880     pavgb      xmm1, xmm4
   1881     movdqu     xmm4, [eax + esi + 32]
   1882     pavgb      xmm2, xmm4
   1883     movdqu     xmm4, [eax + esi + 48]
   1884     pavgb      xmm3, xmm4
   1885     lea        eax,  [eax + 64]
   1886     movdqa     xmm4, xmm0
   1887     shufps     xmm0, xmm1, 0x88
   1888     shufps     xmm4, xmm1, 0xdd
   1889     pavgb      xmm0, xmm4
   1890     movdqa     xmm4, xmm2
   1891     shufps     xmm2, xmm3, 0x88
   1892     shufps     xmm4, xmm3, 0xdd
   1893     pavgb      xmm2, xmm4
   1894 
   1895     // step 2 - convert to U and V
   1896     // from here down is very similar to Y code except
    1897     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1898     movdqa     xmm1, xmm0
   1899     movdqa     xmm3, xmm2
   1900     pmaddubsw  xmm0, xmm7  // U
   1901     pmaddubsw  xmm2, xmm7
   1902     pmaddubsw  xmm1, xmm6  // V
   1903     pmaddubsw  xmm3, xmm6
   1904     phaddw     xmm0, xmm2
   1905     phaddw     xmm1, xmm3
   1906     psraw      xmm0, 8
   1907     psraw      xmm1, 8
   1908     packsswb   xmm0, xmm1
   1909     paddb      xmm0, xmm5            // -> unsigned
   1910 
   1911     // step 3 - store 8 U and 8 V values
   1912     sub        ecx, 16
   1913     movlps     qword ptr [edx], xmm0 // U
   1914     movhps     qword ptr [edx + edi], xmm0 // V
   1915     lea        edx, [edx + 8]
   1916     jg         convertloop
   1917 
   1918     pop        edi
   1919     pop        esi
   1920     ret
   1921   }
   1922 }
   1923 
   1924 __declspec(naked) __declspec(align(16))
   1925 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1926                        uint8* dst_u, uint8* dst_v, int width) {
   1927   __asm {
   1928     push       esi
   1929     push       edi
   1930     mov        eax, [esp + 8 + 4]   // src_argb
   1931     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1932     mov        edx, [esp + 8 + 12]  // dst_u
   1933     mov        edi, [esp + 8 + 16]  // dst_v
   1934     mov        ecx, [esp + 8 + 20]  // pix
   1935     movdqa     xmm7, kABGRToU
   1936     movdqa     xmm6, kABGRToV
   1937     movdqa     xmm5, kAddUV128
   1938     sub        edi, edx             // stride from u to v
   1939 
   1940     align      4
   1941  convertloop:
   1942     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1943     movdqa     xmm0, [eax]
   1944     movdqa     xmm1, [eax + 16]
   1945     movdqa     xmm2, [eax + 32]
   1946     movdqa     xmm3, [eax + 48]
   1947     pavgb      xmm0, [eax + esi]
   1948     pavgb      xmm1, [eax + esi + 16]
   1949     pavgb      xmm2, [eax + esi + 32]
   1950     pavgb      xmm3, [eax + esi + 48]
   1951     lea        eax,  [eax + 64]
   1952     movdqa     xmm4, xmm0
   1953     shufps     xmm0, xmm1, 0x88
   1954     shufps     xmm4, xmm1, 0xdd
   1955     pavgb      xmm0, xmm4
   1956     movdqa     xmm4, xmm2
   1957     shufps     xmm2, xmm3, 0x88
   1958     shufps     xmm4, xmm3, 0xdd
   1959     pavgb      xmm2, xmm4
   1960 
   1961     // step 2 - convert to U and V
   1962     // from here down is very similar to Y code except
    1963     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1964     movdqa     xmm1, xmm0
   1965     movdqa     xmm3, xmm2
   1966     pmaddubsw  xmm0, xmm7  // U
   1967     pmaddubsw  xmm2, xmm7
   1968     pmaddubsw  xmm1, xmm6  // V
   1969     pmaddubsw  xmm3, xmm6
   1970     phaddw     xmm0, xmm2
   1971     phaddw     xmm1, xmm3
   1972     psraw      xmm0, 8
   1973     psraw      xmm1, 8
   1974     packsswb   xmm0, xmm1
   1975     paddb      xmm0, xmm5            // -> unsigned
   1976 
   1977     // step 3 - store 8 U and 8 V values
   1978     sub        ecx, 16
   1979     movlps     qword ptr [edx], xmm0 // U
   1980     movhps     qword ptr [edx + edi], xmm0 // V
   1981     lea        edx, [edx + 8]
   1982     jg         convertloop
   1983 
   1984     pop        edi
   1985     pop        esi
   1986     ret
   1987   }
   1988 }
   1989 
   1990 __declspec(naked) __declspec(align(16))
   1991 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1992                                  uint8* dst_u, uint8* dst_v, int width) {
   1993   __asm {
   1994     push       esi
   1995     push       edi
   1996     mov        eax, [esp + 8 + 4]   // src_argb
   1997     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1998     mov        edx, [esp + 8 + 12]  // dst_u
   1999     mov        edi, [esp + 8 + 16]  // dst_v
   2000     mov        ecx, [esp + 8 + 20]  // pix
   2001     movdqa     xmm7, kABGRToU
   2002     movdqa     xmm6, kABGRToV
   2003     movdqa     xmm5, kAddUV128
   2004     sub        edi, edx             // stride from u to v
   2005 
   2006     align      4
   2007  convertloop:
   2008     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   2009     movdqu     xmm0, [eax]
   2010     movdqu     xmm1, [eax + 16]
   2011     movdqu     xmm2, [eax + 32]
   2012     movdqu     xmm3, [eax + 48]
   2013     movdqu     xmm4, [eax + esi]
   2014     pavgb      xmm0, xmm4
   2015     movdqu     xmm4, [eax + esi + 16]
   2016     pavgb      xmm1, xmm4
   2017     movdqu     xmm4, [eax + esi + 32]
   2018     pavgb      xmm2, xmm4
   2019     movdqu     xmm4, [eax + esi + 48]
   2020     pavgb      xmm3, xmm4
   2021     lea        eax,  [eax + 64]
   2022     movdqa     xmm4, xmm0
   2023     shufps     xmm0, xmm1, 0x88
   2024     shufps     xmm4, xmm1, 0xdd
   2025     pavgb      xmm0, xmm4
   2026     movdqa     xmm4, xmm2
   2027     shufps     xmm2, xmm3, 0x88
   2028     shufps     xmm4, xmm3, 0xdd
   2029     pavgb      xmm2, xmm4
   2030 
   2031     // step 2 - convert to U and V
   2032     // from here down is very similar to Y code except
    2033     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   2034     movdqa     xmm1, xmm0
   2035     movdqa     xmm3, xmm2
   2036     pmaddubsw  xmm0, xmm7  // U
   2037     pmaddubsw  xmm2, xmm7
   2038     pmaddubsw  xmm1, xmm6  // V
   2039     pmaddubsw  xmm3, xmm6
   2040     phaddw     xmm0, xmm2
   2041     phaddw     xmm1, xmm3
   2042     psraw      xmm0, 8
   2043     psraw      xmm1, 8
   2044     packsswb   xmm0, xmm1
   2045     paddb      xmm0, xmm5            // -> unsigned
   2046 
   2047     // step 3 - store 8 U and 8 V values
   2048     sub        ecx, 16
   2049     movlps     qword ptr [edx], xmm0 // U
   2050     movhps     qword ptr [edx + edi], xmm0 // V
   2051     lea        edx, [edx + 8]
   2052     jg         convertloop
   2053 
   2054     pop        edi
   2055     pop        esi
   2056     ret
   2057   }
   2058 }
   2059 
   2060 __declspec(naked) __declspec(align(16))
   2061 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   2062                        uint8* dst_u, uint8* dst_v, int width) {
   2063   __asm {
   2064     push       esi
   2065     push       edi
   2066     mov        eax, [esp + 8 + 4]   // src_argb
   2067     mov        esi, [esp + 8 + 8]   // src_stride_argb
   2068     mov        edx, [esp + 8 + 12]  // dst_u
   2069     mov        edi, [esp + 8 + 16]  // dst_v
   2070     mov        ecx, [esp + 8 + 20]  // pix
   2071     movdqa     xmm7, kRGBAToU
   2072     movdqa     xmm6, kRGBAToV
   2073     movdqa     xmm5, kAddUV128
   2074     sub        edi, edx             // stride from u to v
   2075 
   2076     align      4
   2077  convertloop:
   2078     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   2079     movdqa     xmm0, [eax]
   2080     movdqa     xmm1, [eax + 16]
   2081     movdqa     xmm2, [eax + 32]
   2082     movdqa     xmm3, [eax + 48]
   2083     pavgb      xmm0, [eax + esi]
   2084     pavgb      xmm1, [eax + esi + 16]
   2085     pavgb      xmm2, [eax + esi + 32]
   2086     pavgb      xmm3, [eax + esi + 48]
   2087     lea        eax,  [eax + 64]
   2088     movdqa     xmm4, xmm0
   2089     shufps     xmm0, xmm1, 0x88
   2090     shufps     xmm4, xmm1, 0xdd
   2091     pavgb      xmm0, xmm4
   2092     movdqa     xmm4, xmm2
   2093     shufps     xmm2, xmm3, 0x88
   2094     shufps     xmm4, xmm3, 0xdd
   2095     pavgb      xmm2, xmm4
   2096 
   2097     // step 2 - convert to U and V
   2098     // from here down is very similar to Y code except
    2099     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   2100     movdqa     xmm1, xmm0
   2101     movdqa     xmm3, xmm2
   2102     pmaddubsw  xmm0, xmm7  // U
   2103     pmaddubsw  xmm2, xmm7
   2104     pmaddubsw  xmm1, xmm6  // V
   2105     pmaddubsw  xmm3, xmm6
   2106     phaddw     xmm0, xmm2
   2107     phaddw     xmm1, xmm3
   2108     psraw      xmm0, 8
   2109     psraw      xmm1, 8
   2110     packsswb   xmm0, xmm1
   2111     paddb      xmm0, xmm5            // -> unsigned
   2112 
   2113     // step 3 - store 8 U and 8 V values
   2114     sub        ecx, 16
   2115     movlps     qword ptr [edx], xmm0 // U
   2116     movhps     qword ptr [edx + edi], xmm0 // V
   2117     lea        edx, [edx + 8]
   2118     jg         convertloop
   2119 
   2120     pop        edi
   2121     pop        esi
   2122     ret
   2123   }
   2124 }
   2125 
   2126 __declspec(naked) __declspec(align(16))
   2127 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   2128                                  uint8* dst_u, uint8* dst_v, int width) {
   2129   __asm {
   2130     push       esi
   2131     push       edi
   2132     mov        eax, [esp + 8 + 4]   // src_argb
   2133     mov        esi, [esp + 8 + 8]   // src_stride_argb
   2134     mov        edx, [esp + 8 + 12]  // dst_u
   2135     mov        edi, [esp + 8 + 16]  // dst_v
   2136     mov        ecx, [esp + 8 + 20]  // pix
   2137     movdqa     xmm7, kRGBAToU
   2138     movdqa     xmm6, kRGBAToV
   2139     movdqa     xmm5, kAddUV128
   2140     sub        edi, edx             // stride from u to v
   2141 
   2142     align      4
   2143  convertloop:
   2144     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   2145     movdqu     xmm0, [eax]
   2146     movdqu     xmm1, [eax + 16]
   2147     movdqu     xmm2, [eax + 32]
   2148     movdqu     xmm3, [eax + 48]
   2149     movdqu     xmm4, [eax + esi]
   2150     pavgb      xmm0, xmm4
   2151     movdqu     xmm4, [eax + esi + 16]
   2152     pavgb      xmm1, xmm4
   2153     movdqu     xmm4, [eax + esi + 32]
   2154     pavgb      xmm2, xmm4
   2155     movdqu     xmm4, [eax + esi + 48]
   2156     pavgb      xmm3, xmm4
   2157     lea        eax,  [eax + 64]
   2158     movdqa     xmm4, xmm0
   2159     shufps     xmm0, xmm1, 0x88
   2160     shufps     xmm4, xmm1, 0xdd
   2161     pavgb      xmm0, xmm4
   2162     movdqa     xmm4, xmm2
   2163     shufps     xmm2, xmm3, 0x88
   2164     shufps     xmm4, xmm3, 0xdd
   2165     pavgb      xmm2, xmm4
   2166 
   2167     // step 2 - convert to U and V
   2168     // from here down is very similar to Y code except
    2169     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   2170     movdqa     xmm1, xmm0
   2171     movdqa     xmm3, xmm2
   2172     pmaddubsw  xmm0, xmm7  // U
   2173     pmaddubsw  xmm2, xmm7
   2174     pmaddubsw  xmm1, xmm6  // V
   2175     pmaddubsw  xmm3, xmm6
   2176     phaddw     xmm0, xmm2
   2177     phaddw     xmm1, xmm3
   2178     psraw      xmm0, 8
   2179     psraw      xmm1, 8
   2180     packsswb   xmm0, xmm1
   2181     paddb      xmm0, xmm5            // -> unsigned
   2182 
   2183     // step 3 - store 8 U and 8 V values
   2184     sub        ecx, 16
   2185     movlps     qword ptr [edx], xmm0 // U
   2186     movhps     qword ptr [edx + edi], xmm0 // V
   2187     lea        edx, [edx + 8]
   2188     jg         convertloop
   2189 
   2190     pop        edi
   2191     pop        esi
   2192     ret
   2193   }
   2194 }
   2195 #endif  // HAS_ARGBTOYROW_SSSE3
   2196 
   2197 #ifdef HAS_I422TOARGBROW_AVX2
   2198 
   2199 static const lvec8 kUVToB_AVX = {
   2200   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
   2201   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
   2202 };
   2203 static const lvec8 kUVToR_AVX = {
   2204   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
   2205   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
   2206 };
   2207 static const lvec8 kUVToG_AVX = {
   2208   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
   2209   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
   2210 };
   2211 static const lvec16 kYToRgb_AVX = {
   2212   YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
   2213 };
   2214 static const lvec16 kYSub16_AVX = {
   2215   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
   2216 };
   2217 static const lvec16 kUVBiasB_AVX = {
   2218   BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
   2219 };
   2220 static const lvec16 kUVBiasG_AVX = {
   2221   BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
   2222 };
   2223 static const lvec16 kUVBiasR_AVX = {
   2224   BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
   2225 };
   2226 
   2227 // 16 pixels
   2228 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2229 __declspec(naked) __declspec(align(16))
   2230 void I422ToARGBRow_AVX2(const uint8* y_buf,
   2231                         const uint8* u_buf,
   2232                         const uint8* v_buf,
   2233                         uint8* dst_argb,
   2234                         int width) {
   2235   __asm {
   2236     push       esi
   2237     push       edi
   2238     mov        eax, [esp + 8 + 4]   // Y
   2239     mov        esi, [esp + 8 + 8]   // U
   2240     mov        edi, [esp + 8 + 12]  // V
   2241     mov        edx, [esp + 8 + 16]  // argb
   2242     mov        ecx, [esp + 8 + 20]  // width
   2243     sub        edi, esi
   2244     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2245     vpxor      ymm4, ymm4, ymm4
   2246 
   2247     align      4
   2248  convertloop:
   2249     vmovq      xmm0, qword ptr [esi]          //  U
   2250     vmovq      xmm1, qword ptr [esi + edi]    //  V
   2251     lea        esi,  [esi + 8]
   2252     vpunpcklbw ymm0, ymm0, ymm1               // UV
   2253     vpermq     ymm0, ymm0, 0xd8
   2254     vpunpcklwd ymm0, ymm0, ymm0              // UVUV
   2255     vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
   2256     vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
   2257     vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
   2258     vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
   2259     vpsubw     ymm1, ymm1, kUVBiasG_AVX
   2260     vpsubw     ymm0, ymm0, kUVBiasR_AVX
   2261 
   2262     // Step 2: Find Y contribution to 16 R,G,B values
   2263     vmovdqu    xmm3, [eax]                  // NOLINT
   2264     lea        eax, [eax + 16]
   2265     vpermq     ymm3, ymm3, 0xd8
   2266     vpunpcklbw ymm3, ymm3, ymm4
   2267     vpsubsw    ymm3, ymm3, kYSub16_AVX
   2268     vpmullw    ymm3, ymm3, kYToRgb_AVX
   2269     vpaddsw    ymm2, ymm2, ymm3           // B += Y
   2270     vpaddsw    ymm1, ymm1, ymm3           // G += Y
   2271     vpaddsw    ymm0, ymm0, ymm3           // R += Y
   2272     vpsraw     ymm2, ymm2, 6
   2273     vpsraw     ymm1, ymm1, 6
   2274     vpsraw     ymm0, ymm0, 6
   2275     vpackuswb  ymm2, ymm2, ymm2           // B
   2276     vpackuswb  ymm1, ymm1, ymm1           // G
   2277     vpackuswb  ymm0, ymm0, ymm0           // R
   2278 
   2279     // Step 3: Weave into ARGB
   2280     vpunpcklbw ymm2, ymm2, ymm1           // BG
   2281     vpermq     ymm2, ymm2, 0xd8
   2282     vpunpcklbw ymm0, ymm0, ymm5           // RA
   2283     vpermq     ymm0, ymm0, 0xd8
   2284     vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
   2285     vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
   2286     vmovdqu    [edx], ymm1
   2287     vmovdqu    [edx + 32], ymm2
   2288     lea        edx,  [edx + 64]
   2289     sub        ecx, 16
   2290     jg         convertloop
   2291     vzeroupper
   2292 
   2293     pop        edi
   2294     pop        esi
   2295     ret
   2296   }
   2297 }
   2298 #endif  // HAS_I422TOARGBROW_AVX2
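// Note on the vpermq 0xd8 shuffles in the AVX2 row above: 256-bit vpunpck*
// instructions interleave within each 128-bit lane, so the 16 interleaved UV
// bytes would otherwise sit in the low lane only.  The permute spreads them
// across both lanes before the in-lane vpunpcklwd widens each pair to UVUV.
// A minimal intrinsics sketch of the idiom (illustrative only, not code that
// belongs to this file):
//
//   __m256i uv = _mm256_unpacklo_epi8(u, v);    // interleave U,V bytes per lane
//   uv = _mm256_permute4x64_epi64(uv, 0xd8);    // redistribute across lanes
//   uv = _mm256_unpacklo_epi16(uv, uv);         // duplicate each UV to UVUV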
   2299 
   2300 #ifdef HAS_I422TOARGBROW_SSSE3
   2301 
   2302 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
   2303 
   2304 // Read 8 UV from 444.
   2305 #define READYUV444 __asm {                                                     \
   2306     __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
   2307     __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
   2308     __asm lea        esi,  [esi + 8]                                           \
   2309     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   2310   }
   2311 
   2312 // Read 4 UV from 422, upsample to 8 UV.
   2313 #define READYUV422 __asm {                                                     \
   2314     __asm movd       xmm0, [esi]          /* U */                              \
   2315     __asm movd       xmm1, [esi + edi]    /* V */                              \
   2316     __asm lea        esi,  [esi + 4]                                           \
   2317     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   2318     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   2319   }
   2320 
   2321 // Read 2 UV from 411, upsample to 8 UV.
   2322 #define READYUV411 __asm {                                                     \
   2323     __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
   2324     __asm movd       xmm0, ebx                                                 \
   2325     __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
   2326     __asm movd       xmm1, ebx                                                 \
   2327     __asm lea        esi,  [esi + 2]                                           \
   2328     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   2329     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   2330     __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
   2331   }
   2332 
   2333 // Read 4 UV from NV12, upsample to 8 UV.
   2334 #define READNV12 __asm {                                                       \
   2335     __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
   2336     __asm lea        esi,  [esi + 8]                                           \
   2337     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   2338   }
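// Each READ* macro above leaves 8 interleaved UV byte pairs in xmm0, one pair
// per output pixel, whatever the source chroma layout (replication upsample,
// no filtering):
//
//   444:  U0V0 U1V1 U2V2 U3V3 U4V4 U5V5 U6V6 U7V7   // one UV per pixel
//   422:  U0V0 U0V0 U1V1 U1V1 U2V2 U2V2 U3V3 U3V3   // each UV repeated 2x
//   411:  U0V0 U0V0 U0V0 U0V0 U1V1 U1V1 U1V1 U1V1   // each UV repeated 4x
//   NV12: reads already-interleaved UV pairs, then repeats each 2x as for 422.
//
// In scalar terms the 422 case of punpcklwd xmm0, xmm0 is simply:
//
//   for (int i = 0; i < 4; ++i) {
//     uv_out[2 * i + 0] = uv_in[i];  // chroma pair reused for pixel 2i
//     uv_out[2 * i + 1] = uv_in[i];  // and for pixel 2i + 1
//   }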
   2339 
   2340 // Convert 8 pixels: 8 UV and 8 Y.
   2341 #define YUVTORGB __asm {                                                       \
   2342     /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
   2343     __asm movdqa     xmm1, xmm0                                                \
   2344     __asm movdqa     xmm2, xmm0                                                \
   2345     __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
   2346     __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
   2347     __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
   2348     __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
   2349     __asm psubw      xmm1, kUVBiasG                                            \
   2350     __asm psubw      xmm2, kUVBiasR                                            \
   2351     /* Step 2: Find Y contribution to 8 R,G,B values */                        \
   2352     __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
   2353     __asm lea        eax, [eax + 8]                                            \
   2354     __asm punpcklbw  xmm3, xmm4                                                \
   2355     __asm psubsw     xmm3, kYSub16                                             \
   2356     __asm pmullw     xmm3, kYToRgb                                             \
   2357     __asm paddsw     xmm0, xmm3           /* B += Y */                         \
   2358     __asm paddsw     xmm1, xmm3           /* G += Y */                         \
   2359     __asm paddsw     xmm2, xmm3           /* R += Y */                         \
   2360     __asm psraw      xmm0, 6                                                   \
   2361     __asm psraw      xmm1, 6                                                   \
   2362     __asm psraw      xmm2, 6                                                   \
   2363     __asm packuswb   xmm0, xmm0           /* B */                              \
   2364     __asm packuswb   xmm1, xmm1           /* G */                              \
   2365     __asm packuswb   xmm2, xmm2           /* R */                              \
   2366   }
   2367 
   2368 // Convert 8 pixels: 8 VU and 8 Y.
   2369 #define YVUTORGB __asm {                                                       \
   2370     /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
   2371     __asm movdqa     xmm1, xmm0                                                \
   2372     __asm movdqa     xmm2, xmm0                                                \
   2373     __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
   2374     __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
   2375     __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
   2376     __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
   2377     __asm psubw      xmm1, kUVBiasG                                            \
   2378     __asm psubw      xmm2, kUVBiasR                                            \
   2379     /* Step 2: Find Y contribution to 8 R,G,B values */                        \
   2380     __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
   2381     __asm lea        eax, [eax + 8]                                            \
   2382     __asm punpcklbw  xmm3, xmm4                                                \
   2383     __asm psubsw     xmm3, kYSub16                                             \
   2384     __asm pmullw     xmm3, kYToRgb                                             \
   2385     __asm paddsw     xmm0, xmm3           /* B += Y */                         \
   2386     __asm paddsw     xmm1, xmm3           /* G += Y */                         \
   2387     __asm paddsw     xmm2, xmm3           /* R += Y */                         \
   2388     __asm psraw      xmm0, 6                                                   \
   2389     __asm psraw      xmm1, 6                                                   \
   2390     __asm psraw      xmm2, 6                                                   \
   2391     __asm packuswb   xmm0, xmm0           /* B */                              \
   2392     __asm packuswb   xmm1, xmm1           /* G */                              \
   2393     __asm packuswb   xmm2, xmm2           /* R */                              \
   2394   }
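// Scalar sketch of the fixed-point math in YUVTORGB / YVUTORGB above, written
// with the YG / UB..VR / BB..BR constants defined near the top of this file
// (a sketch only; the SIMD code also saturates at each paddsw step):
//
//   static uint8 Clamp255(int v) {
//     return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
//   }
//
//   // One pixel; u, v are the (possibly replicated) chroma bytes, y is luma.
//   int y1 = (y - 16) * YG;  // luma term in 6-bit fixed point
//   int b  = Clamp255((u * UB + v * VB - BB + y1) >> 6);
//   int g  = Clamp255((u * UG + v * VG - BG + y1) >> 6);
//   int r  = Clamp255((u * UR + v * VR - BR + y1) >> 6);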
   2395 
   2396 // 8 pixels, dest aligned 16.
   2397 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
   2398 __declspec(naked) __declspec(align(16))
   2399 void I444ToARGBRow_SSSE3(const uint8* y_buf,
   2400                          const uint8* u_buf,
   2401                          const uint8* v_buf,
   2402                          uint8* dst_argb,
   2403                          int width) {
   2404   __asm {
   2405     push       esi
   2406     push       edi
   2407     mov        eax, [esp + 8 + 4]   // Y
   2408     mov        esi, [esp + 8 + 8]   // U
   2409     mov        edi, [esp + 8 + 12]  // V
   2410     mov        edx, [esp + 8 + 16]  // argb
   2411     mov        ecx, [esp + 8 + 20]  // width
   2412     sub        edi, esi
   2413     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2414     pxor       xmm4, xmm4
   2415 
   2416     align      4
   2417  convertloop:
   2418     READYUV444
   2419     YUVTORGB
   2420 
   2421     // Step 3: Weave into ARGB
   2422     punpcklbw  xmm0, xmm1           // BG
   2423     punpcklbw  xmm2, xmm5           // RA
   2424     movdqa     xmm1, xmm0
   2425     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2426     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2427     movdqa     [edx], xmm0
   2428     movdqa     [edx + 16], xmm1
   2429     lea        edx,  [edx + 32]
   2430     sub        ecx, 8
   2431     jg         convertloop
   2432 
   2433     pop        edi
   2434     pop        esi
   2435     ret
   2436   }
   2437 }
   2438 
    2439 // 8 pixels, dest unaligned.
    2440 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
   2441 __declspec(naked) __declspec(align(16))
   2442 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
   2443                           const uint8* u_buf,
   2444                           const uint8* v_buf,
   2445                           uint8* dst_rgb24,
   2446                           int width) {
   2447   __asm {
   2448     push       esi
   2449     push       edi
   2450     mov        eax, [esp + 8 + 4]   // Y
   2451     mov        esi, [esp + 8 + 8]   // U
   2452     mov        edi, [esp + 8 + 12]  // V
   2453     mov        edx, [esp + 8 + 16]  // rgb24
   2454     mov        ecx, [esp + 8 + 20]  // width
   2455     sub        edi, esi
   2456     pxor       xmm4, xmm4
   2457     movdqa     xmm5, kShuffleMaskARGBToRGB24_0
   2458     movdqa     xmm6, kShuffleMaskARGBToRGB24
   2459 
   2460     align      4
   2461  convertloop:
   2462     READYUV422
   2463     YUVTORGB
   2464 
   2465     // Step 3: Weave into RRGB
   2466     punpcklbw  xmm0, xmm1           // BG
   2467     punpcklbw  xmm2, xmm2           // RR
   2468     movdqa     xmm1, xmm0
   2469     punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
   2470     punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
   2471     pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
   2472     pshufb     xmm1, xmm6           // Pack into first 12 bytes.
   2473     palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
   2474     movq       qword ptr [edx], xmm0  // First 8 bytes
    2475     movdqu     [edx + 8], xmm1      // Last 16 bytes; 24 bytes total = 8 RGB pixels.
   2476     lea        edx,  [edx + 24]
   2477     sub        ecx, 8
   2478     jg         convertloop
   2479 
   2480     pop        edi
   2481     pop        esi
   2482     ret
   2483   }
   2484 }
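// The RGB24 row above first weaves B,G,R,R (a second R copy where alpha would
// go), then uses pshufb + palignr to squeeze eight 4-byte pixels into 24
// contiguous bytes.  Net effect per pixel, as a sketch (the RAW row below is
// the same except R and B trade places in the output):
//
//   dst_rgb24[3 * i + 0] = b;
//   dst_rgb24[3 * i + 1] = g;
//   dst_rgb24[3 * i + 2] = r;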
   2485 
    2486 // 8 pixels, dest unaligned.
    2487 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
   2488 __declspec(naked) __declspec(align(16))
   2489 void I422ToRAWRow_SSSE3(const uint8* y_buf,
   2490                         const uint8* u_buf,
   2491                         const uint8* v_buf,
   2492                         uint8* dst_raw,
   2493                         int width) {
   2494   __asm {
   2495     push       esi
   2496     push       edi
   2497     mov        eax, [esp + 8 + 4]   // Y
   2498     mov        esi, [esp + 8 + 8]   // U
   2499     mov        edi, [esp + 8 + 12]  // V
   2500     mov        edx, [esp + 8 + 16]  // raw
   2501     mov        ecx, [esp + 8 + 20]  // width
   2502     sub        edi, esi
   2503     pxor       xmm4, xmm4
   2504     movdqa     xmm5, kShuffleMaskARGBToRAW_0
   2505     movdqa     xmm6, kShuffleMaskARGBToRAW
   2506 
   2507     align      4
   2508  convertloop:
   2509     READYUV422
   2510     YUVTORGB
   2511 
   2512     // Step 3: Weave into RRGB
   2513     punpcklbw  xmm0, xmm1           // BG
   2514     punpcklbw  xmm2, xmm2           // RR
   2515     movdqa     xmm1, xmm0
   2516     punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
   2517     punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
   2518     pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
   2519     pshufb     xmm1, xmm6           // Pack into first 12 bytes.
   2520     palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
   2521     movq       qword ptr [edx], xmm0  // First 8 bytes
    2522     movdqu     [edx + 8], xmm1      // Last 16 bytes; 24 bytes total = 8 RGB pixels.
   2523     lea        edx,  [edx + 24]
   2524     sub        ecx, 8
   2525     jg         convertloop
   2526 
   2527     pop        edi
   2528     pop        esi
   2529     ret
   2530   }
   2531 }
   2532 
   2533 // 8 pixels, dest unaligned.
    2534 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
   2535 __declspec(naked) __declspec(align(16))
   2536 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
   2537                            const uint8* u_buf,
   2538                            const uint8* v_buf,
   2539                            uint8* rgb565_buf,
   2540                            int width) {
   2541   __asm {
   2542     push       esi
   2543     push       edi
   2544     mov        eax, [esp + 8 + 4]   // Y
   2545     mov        esi, [esp + 8 + 8]   // U
   2546     mov        edi, [esp + 8 + 12]  // V
   2547     mov        edx, [esp + 8 + 16]  // rgb565
   2548     mov        ecx, [esp + 8 + 20]  // width
   2549     sub        edi, esi
   2550     pxor       xmm4, xmm4
   2551     pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
   2552     psrld      xmm5, 27
   2553     pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
   2554     psrld      xmm6, 26
   2555     pslld      xmm6, 5
   2556     pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
   2557     pslld      xmm7, 11
   2558 
   2559     align      4
   2560  convertloop:
   2561     READYUV422
   2562     YUVTORGB
   2563 
   2564     // Step 3: Weave into RRGB
   2565     punpcklbw  xmm0, xmm1           // BG
   2566     punpcklbw  xmm2, xmm2           // RR
   2567     movdqa     xmm1, xmm0
   2568     punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
   2569     punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
   2570 
   2571     // Step 3b: RRGB -> RGB565
   2572     movdqa     xmm3, xmm0    // B  first 4 pixels of argb
   2573     movdqa     xmm2, xmm0    // G
   2574     pslld      xmm0, 8       // R
   2575     psrld      xmm3, 3       // B
   2576     psrld      xmm2, 5       // G
   2577     psrad      xmm0, 16      // R
   2578     pand       xmm3, xmm5    // B
   2579     pand       xmm2, xmm6    // G
   2580     pand       xmm0, xmm7    // R
   2581     por        xmm3, xmm2    // BG
   2582     por        xmm0, xmm3    // BGR
   2583     movdqa     xmm3, xmm1    // B  next 4 pixels of argb
   2584     movdqa     xmm2, xmm1    // G
   2585     pslld      xmm1, 8       // R
   2586     psrld      xmm3, 3       // B
   2587     psrld      xmm2, 5       // G
   2588     psrad      xmm1, 16      // R
   2589     pand       xmm3, xmm5    // B
   2590     pand       xmm2, xmm6    // G
   2591     pand       xmm1, xmm7    // R
   2592     por        xmm3, xmm2    // BG
   2593     por        xmm1, xmm3    // BGR
   2594     packssdw   xmm0, xmm1
   2595     sub        ecx, 8
   2596     movdqu     [edx], xmm0   // store 8 pixels of RGB565
   2597     lea        edx, [edx + 16]
   2598     jg         convertloop
   2599 
   2600     pop        edi
   2601     pop        esi
   2602     ret
   2603   }
   2604 }
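// Sketch of the Step 3b packing above: each 32-bit B,G,R,R word is reduced to
// one 16-bit 5:6:5 value with B in bits 0..4, G in bits 5..10 and R in bits
// 11..15.  Scalar equivalent for one pixel (illustrative only):
//
//   uint16 rgb565 = (uint16)(((b >> 3) & 0x001f) |
//                            (((g >> 2) << 5) & 0x07e0) |
//                            (((r >> 3) << 11) & 0xf800));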
   2605 
   2606 // 8 pixels, dest aligned 16.
   2607 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2608 __declspec(naked) __declspec(align(16))
   2609 void I422ToARGBRow_SSSE3(const uint8* y_buf,
   2610                          const uint8* u_buf,
   2611                          const uint8* v_buf,
   2612                          uint8* dst_argb,
   2613                          int width) {
   2614   __asm {
   2615     push       esi
   2616     push       edi
   2617     mov        eax, [esp + 8 + 4]   // Y
   2618     mov        esi, [esp + 8 + 8]   // U
   2619     mov        edi, [esp + 8 + 12]  // V
   2620     mov        edx, [esp + 8 + 16]  // argb
   2621     mov        ecx, [esp + 8 + 20]  // width
   2622     sub        edi, esi
   2623     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2624     pxor       xmm4, xmm4
   2625 
   2626     align      4
   2627  convertloop:
   2628     READYUV422
   2629     YUVTORGB
   2630 
   2631     // Step 3: Weave into ARGB
   2632     punpcklbw  xmm0, xmm1           // BG
   2633     punpcklbw  xmm2, xmm5           // RA
   2634     movdqa     xmm1, xmm0
   2635     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2636     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2637     movdqa     [edx], xmm0
   2638     movdqa     [edx + 16], xmm1
   2639     lea        edx,  [edx + 32]
   2640     sub        ecx, 8
   2641     jg         convertloop
   2642 
   2643     pop        edi
   2644     pop        esi
   2645     ret
   2646   }
   2647 }
   2648 
   2649 // 8 pixels, dest aligned 16.
   2650 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
    2651 // Similar to I420 but duplicates UV once more.
   2652 __declspec(naked) __declspec(align(16))
   2653 void I411ToARGBRow_SSSE3(const uint8* y_buf,
   2654                          const uint8* u_buf,
   2655                          const uint8* v_buf,
   2656                          uint8* dst_argb,
   2657                          int width) {
   2658   __asm {
   2659     push       ebx
   2660     push       esi
   2661     push       edi
   2662     mov        eax, [esp + 12 + 4]   // Y
   2663     mov        esi, [esp + 12 + 8]   // U
   2664     mov        edi, [esp + 12 + 12]  // V
   2665     mov        edx, [esp + 12 + 16]  // argb
   2666     mov        ecx, [esp + 12 + 20]  // width
   2667     sub        edi, esi
   2668     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2669     pxor       xmm4, xmm4
   2670 
   2671     align      4
   2672  convertloop:
   2673     READYUV411  // modifies EBX
   2674     YUVTORGB
   2675 
   2676     // Step 3: Weave into ARGB
   2677     punpcklbw  xmm0, xmm1           // BG
   2678     punpcklbw  xmm2, xmm5           // RA
   2679     movdqa     xmm1, xmm0
   2680     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2681     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2682     movdqa     [edx], xmm0
   2683     movdqa     [edx + 16], xmm1
   2684     lea        edx,  [edx + 32]
   2685     sub        ecx, 8
   2686     jg         convertloop
   2687 
   2688     pop        edi
   2689     pop        esi
   2690     pop        ebx
   2691     ret
   2692   }
   2693 }
   2694 
   2695 // 8 pixels, dest aligned 16.
   2696 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2697 __declspec(naked) __declspec(align(16))
   2698 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
   2699                          const uint8* uv_buf,
   2700                          uint8* dst_argb,
   2701                          int width) {
   2702   __asm {
   2703     push       esi
   2704     mov        eax, [esp + 4 + 4]   // Y
   2705     mov        esi, [esp + 4 + 8]   // UV
   2706     mov        edx, [esp + 4 + 12]  // argb
   2707     mov        ecx, [esp + 4 + 16]  // width
   2708     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2709     pxor       xmm4, xmm4
   2710 
   2711     align      4
   2712  convertloop:
   2713     READNV12
   2714     YUVTORGB
   2715 
   2716     // Step 3: Weave into ARGB
   2717     punpcklbw  xmm0, xmm1           // BG
   2718     punpcklbw  xmm2, xmm5           // RA
   2719     movdqa     xmm1, xmm0
   2720     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2721     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2722     movdqa     [edx], xmm0
   2723     movdqa     [edx + 16], xmm1
   2724     lea        edx,  [edx + 32]
   2725     sub        ecx, 8
   2726     jg         convertloop
   2727 
   2728     pop        esi
   2729     ret
   2730   }
   2731 }
   2732 
   2733 // 8 pixels, dest aligned 16.
   2734 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2735 __declspec(naked) __declspec(align(16))
   2736 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
   2737                          const uint8* uv_buf,
   2738                          uint8* dst_argb,
   2739                          int width) {
   2740   __asm {
   2741     push       esi
   2742     mov        eax, [esp + 4 + 4]   // Y
   2743     mov        esi, [esp + 4 + 8]   // VU
   2744     mov        edx, [esp + 4 + 12]  // argb
   2745     mov        ecx, [esp + 4 + 16]  // width
   2746     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2747     pxor       xmm4, xmm4
   2748 
   2749     align      4
   2750  convertloop:
   2751     READNV12
   2752     YVUTORGB
   2753 
   2754     // Step 3: Weave into ARGB
   2755     punpcklbw  xmm0, xmm1           // BG
   2756     punpcklbw  xmm2, xmm5           // RA
   2757     movdqa     xmm1, xmm0
   2758     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2759     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2760     movdqa     [edx], xmm0
   2761     movdqa     [edx + 16], xmm1
   2762     lea        edx,  [edx + 32]
   2763     sub        ecx, 8
   2764     jg         convertloop
   2765 
   2766     pop        esi
   2767     ret
   2768   }
   2769 }
   2770 
   2771 // 8 pixels, unaligned.
   2772 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
   2773 __declspec(naked) __declspec(align(16))
   2774 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   2775                                    const uint8* u_buf,
   2776                                    const uint8* v_buf,
   2777                                    uint8* dst_argb,
   2778                                    int width) {
   2779   __asm {
   2780     push       esi
   2781     push       edi
   2782     mov        eax, [esp + 8 + 4]   // Y
   2783     mov        esi, [esp + 8 + 8]   // U
   2784     mov        edi, [esp + 8 + 12]  // V
   2785     mov        edx, [esp + 8 + 16]  // argb
   2786     mov        ecx, [esp + 8 + 20]  // width
   2787     sub        edi, esi
   2788     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2789     pxor       xmm4, xmm4
   2790 
   2791     align      4
   2792  convertloop:
   2793     READYUV444
   2794     YUVTORGB
   2795 
   2796     // Step 3: Weave into ARGB
   2797     punpcklbw  xmm0, xmm1           // BG
   2798     punpcklbw  xmm2, xmm5           // RA
   2799     movdqa     xmm1, xmm0
   2800     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2801     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2802     movdqu     [edx], xmm0
   2803     movdqu     [edx + 16], xmm1
   2804     lea        edx,  [edx + 32]
   2805     sub        ecx, 8
   2806     jg         convertloop
   2807 
   2808     pop        edi
   2809     pop        esi
   2810     ret
   2811   }
   2812 }
   2813 
   2814 // 8 pixels, unaligned.
   2815 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2816 __declspec(naked) __declspec(align(16))
   2817 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   2818                                    const uint8* u_buf,
   2819                                    const uint8* v_buf,
   2820                                    uint8* dst_argb,
   2821                                    int width) {
   2822   __asm {
   2823     push       esi
   2824     push       edi
   2825     mov        eax, [esp + 8 + 4]   // Y
   2826     mov        esi, [esp + 8 + 8]   // U
   2827     mov        edi, [esp + 8 + 12]  // V
   2828     mov        edx, [esp + 8 + 16]  // argb
   2829     mov        ecx, [esp + 8 + 20]  // width
   2830     sub        edi, esi
   2831     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2832     pxor       xmm4, xmm4
   2833 
   2834     align      4
   2835  convertloop:
   2836     READYUV422
   2837     YUVTORGB
   2838 
   2839     // Step 3: Weave into ARGB
   2840     punpcklbw  xmm0, xmm1           // BG
   2841     punpcklbw  xmm2, xmm5           // RA
   2842     movdqa     xmm1, xmm0
   2843     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2844     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2845     movdqu     [edx], xmm0
   2846     movdqu     [edx + 16], xmm1
   2847     lea        edx,  [edx + 32]
   2848     sub        ecx, 8
   2849     jg         convertloop
   2850 
   2851     pop        edi
   2852     pop        esi
   2853     ret
   2854   }
   2855 }
   2856 
   2857 // 8 pixels, unaligned.
   2858 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
    2859 // Similar to I420 but duplicates UV once more.
   2860 __declspec(naked) __declspec(align(16))
   2861 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   2862                                    const uint8* u_buf,
   2863                                    const uint8* v_buf,
   2864                                    uint8* dst_argb,
   2865                                    int width) {
   2866   __asm {
   2867     push       ebx
   2868     push       esi
   2869     push       edi
   2870     mov        eax, [esp + 12 + 4]   // Y
   2871     mov        esi, [esp + 12 + 8]   // U
   2872     mov        edi, [esp + 12 + 12]  // V
   2873     mov        edx, [esp + 12 + 16]  // argb
   2874     mov        ecx, [esp + 12 + 20]  // width
   2875     sub        edi, esi
   2876     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2877     pxor       xmm4, xmm4
   2878 
   2879     align      4
   2880  convertloop:
   2881     READYUV411  // modifies EBX
   2882     YUVTORGB
   2883 
   2884     // Step 3: Weave into ARGB
   2885     punpcklbw  xmm0, xmm1           // BG
   2886     punpcklbw  xmm2, xmm5           // RA
   2887     movdqa     xmm1, xmm0
   2888     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2889     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2890     movdqu     [edx], xmm0
   2891     movdqu     [edx + 16], xmm1
   2892     lea        edx,  [edx + 32]
   2893     sub        ecx, 8
   2894     jg         convertloop
   2895 
   2896     pop        edi
   2897     pop        esi
   2898     pop        ebx
   2899     ret
   2900   }
   2901 }
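
         // Note (illustrative, not part of the original): for the 4:1:1 layout handled
         // above, a plain-C version would differ from the I422 sketch earlier only in
         // the chroma indexing -- each U/V sample is shared by four pixels:
         //   u = u_buf[x / 4];  v = v_buf[x / 4];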
   2902 
    2903 // 8 pixels, unaligned.
   2904 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2905 __declspec(naked) __declspec(align(16))
   2906 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   2907                                    const uint8* uv_buf,
   2908                                    uint8* dst_argb,
   2909                                    int width) {
   2910   __asm {
   2911     push       esi
   2912     mov        eax, [esp + 4 + 4]   // Y
   2913     mov        esi, [esp + 4 + 8]   // UV
   2914     mov        edx, [esp + 4 + 12]  // argb
   2915     mov        ecx, [esp + 4 + 16]  // width
   2916     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2917     pxor       xmm4, xmm4
   2918 
   2919     align      4
   2920  convertloop:
   2921     READNV12
   2922     YUVTORGB
   2923 
   2924     // Step 3: Weave into ARGB
   2925     punpcklbw  xmm0, xmm1           // BG
   2926     punpcklbw  xmm2, xmm5           // RA
   2927     movdqa     xmm1, xmm0
   2928     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2929     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2930     movdqu     [edx], xmm0
   2931     movdqu     [edx + 16], xmm1
   2932     lea        edx,  [edx + 32]
   2933     sub        ecx, 8
   2934     jg         convertloop
   2935 
   2936     pop        esi
   2937     ret
   2938   }
   2939 }
   2940 
    2941 // 8 pixels, unaligned.
   2942 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2943 __declspec(naked) __declspec(align(16))
   2944 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   2945                                    const uint8* uv_buf,
   2946                                    uint8* dst_argb,
   2947                                    int width) {
   2948   __asm {
   2949     push       esi
   2950     mov        eax, [esp + 4 + 4]   // Y
   2951     mov        esi, [esp + 4 + 8]   // VU
   2952     mov        edx, [esp + 4 + 12]  // argb
   2953     mov        ecx, [esp + 4 + 16]  // width
   2954     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2955     pxor       xmm4, xmm4
   2956 
   2957     align      4
   2958  convertloop:
   2959     READNV12
   2960     YVUTORGB
   2961 
   2962     // Step 3: Weave into ARGB
   2963     punpcklbw  xmm0, xmm1           // BG
   2964     punpcklbw  xmm2, xmm5           // RA
   2965     movdqa     xmm1, xmm0
   2966     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2967     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2968     movdqu     [edx], xmm0
   2969     movdqu     [edx + 16], xmm1
   2970     lea        edx,  [edx + 32]
   2971     sub        ecx, 8
   2972     jg         convertloop
   2973 
   2974     pop        esi
   2975     ret
   2976   }
   2977 }
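
         // Note (illustrative, not part of the original): NV12 and NV21 carry chroma
         // in a single interleaved plane, so a plain-C version would read both samples
         // from uv_buf; only the byte order differs between the two formats:
         //   u = uv_buf[(x / 2) * 2 + 0];  v = uv_buf[(x / 2) * 2 + 1];  // NV12
         //   v = uv_buf[(x / 2) * 2 + 0];  u = uv_buf[(x / 2) * 2 + 1];  // NV21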
   2978 
   2979 __declspec(naked) __declspec(align(16))
   2980 void I422ToBGRARow_SSSE3(const uint8* y_buf,
   2981                          const uint8* u_buf,
   2982                          const uint8* v_buf,
   2983                          uint8* dst_bgra,
   2984                          int width) {
   2985   __asm {
   2986     push       esi
   2987     push       edi
   2988     mov        eax, [esp + 8 + 4]   // Y
   2989     mov        esi, [esp + 8 + 8]   // U
   2990     mov        edi, [esp + 8 + 12]  // V
   2991     mov        edx, [esp + 8 + 16]  // bgra
   2992     mov        ecx, [esp + 8 + 20]  // width
   2993     sub        edi, esi
   2994     pxor       xmm4, xmm4
   2995 
   2996     align      4
   2997  convertloop:
   2998     READYUV422
   2999     YUVTORGB
   3000 
   3001     // Step 3: Weave into BGRA
   3002     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   3003     punpcklbw  xmm1, xmm0           // GB
   3004     punpcklbw  xmm5, xmm2           // AR
   3005     movdqa     xmm0, xmm5
   3006     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
   3007     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
   3008     movdqa     [edx], xmm5
   3009     movdqa     [edx + 16], xmm0
   3010     lea        edx,  [edx + 32]
   3011     sub        ecx, 8
   3012     jg         convertloop
   3013 
   3014     pop        edi
   3015     pop        esi
   3016     ret
   3017   }
   3018 }
   3019 
   3020 __declspec(naked) __declspec(align(16))
   3021 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
   3022                                    const uint8* u_buf,
   3023                                    const uint8* v_buf,
   3024                                    uint8* dst_bgra,
   3025                                    int width) {
   3026   __asm {
   3027     push       esi
   3028     push       edi
   3029     mov        eax, [esp + 8 + 4]   // Y
   3030     mov        esi, [esp + 8 + 8]   // U
   3031     mov        edi, [esp + 8 + 12]  // V
   3032     mov        edx, [esp + 8 + 16]  // bgra
   3033     mov        ecx, [esp + 8 + 20]  // width
   3034     sub        edi, esi
   3035     pxor       xmm4, xmm4
   3036 
   3037     align      4
   3038  convertloop:
   3039     READYUV422
   3040     YUVTORGB
   3041 
   3042     // Step 3: Weave into BGRA
   3043     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   3044     punpcklbw  xmm1, xmm0           // GB
   3045     punpcklbw  xmm5, xmm2           // AR
   3046     movdqa     xmm0, xmm5
   3047     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
   3048     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
   3049     movdqu     [edx], xmm5
   3050     movdqu     [edx + 16], xmm0
   3051     lea        edx,  [edx + 32]
   3052     sub        ecx, 8
   3053     jg         convertloop
   3054 
   3055     pop        edi
   3056     pop        esi
   3057     ret
   3058   }
   3059 }
   3060 
   3061 __declspec(naked) __declspec(align(16))
   3062 void I422ToABGRRow_SSSE3(const uint8* y_buf,
   3063                          const uint8* u_buf,
   3064                          const uint8* v_buf,
   3065                          uint8* dst_abgr,
   3066                          int width) {
   3067   __asm {
   3068     push       esi
   3069     push       edi
   3070     mov        eax, [esp + 8 + 4]   // Y
   3071     mov        esi, [esp + 8 + 8]   // U
   3072     mov        edi, [esp + 8 + 12]  // V
   3073     mov        edx, [esp + 8 + 16]  // abgr
   3074     mov        ecx, [esp + 8 + 20]  // width
   3075     sub        edi, esi
   3076     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   3077     pxor       xmm4, xmm4
   3078 
   3079     align      4
   3080  convertloop:
   3081     READYUV422
   3082     YUVTORGB
   3083 
    3084     // Step 3: Weave into ABGR
   3085     punpcklbw  xmm2, xmm1           // RG
   3086     punpcklbw  xmm0, xmm5           // BA
   3087     movdqa     xmm1, xmm2
   3088     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
   3089     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
   3090     movdqa     [edx], xmm2
   3091     movdqa     [edx + 16], xmm1
   3092     lea        edx,  [edx + 32]
   3093     sub        ecx, 8
   3094     jg         convertloop
   3095 
   3096     pop        edi
   3097     pop        esi
   3098     ret
   3099   }
   3100 }
   3101 
   3102 __declspec(naked) __declspec(align(16))
   3103 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
   3104                                    const uint8* u_buf,
   3105                                    const uint8* v_buf,
   3106                                    uint8* dst_abgr,
   3107                                    int width) {
   3108   __asm {
   3109     push       esi
   3110     push       edi
   3111     mov        eax, [esp + 8 + 4]   // Y
   3112     mov        esi, [esp + 8 + 8]   // U
   3113     mov        edi, [esp + 8 + 12]  // V
   3114     mov        edx, [esp + 8 + 16]  // abgr
   3115     mov        ecx, [esp + 8 + 20]  // width
   3116     sub        edi, esi
   3117     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   3118     pxor       xmm4, xmm4
   3119 
   3120     align      4
   3121  convertloop:
   3122     READYUV422
   3123     YUVTORGB
   3124 
    3125     // Step 3: Weave into ABGR
   3126     punpcklbw  xmm2, xmm1           // RG
   3127     punpcklbw  xmm0, xmm5           // BA
   3128     movdqa     xmm1, xmm2
   3129     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
   3130     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
   3131     movdqu     [edx], xmm2
   3132     movdqu     [edx + 16], xmm1
   3133     lea        edx,  [edx + 32]
   3134     sub        ecx, 8
   3135     jg         convertloop
   3136 
   3137     pop        edi
   3138     pop        esi
   3139     ret
   3140   }
   3141 }
   3142 
   3143 __declspec(naked) __declspec(align(16))
   3144 void I422ToRGBARow_SSSE3(const uint8* y_buf,
   3145                          const uint8* u_buf,
   3146                          const uint8* v_buf,
   3147                          uint8* dst_rgba,
   3148                          int width) {
   3149   __asm {
   3150     push       esi
   3151     push       edi
   3152     mov        eax, [esp + 8 + 4]   // Y
   3153     mov        esi, [esp + 8 + 8]   // U
   3154     mov        edi, [esp + 8 + 12]  // V
   3155     mov        edx, [esp + 8 + 16]  // rgba
   3156     mov        ecx, [esp + 8 + 20]  // width
   3157     sub        edi, esi
   3158     pxor       xmm4, xmm4
   3159 
   3160     align      4
   3161  convertloop:
   3162     READYUV422
   3163     YUVTORGB
   3164 
   3165     // Step 3: Weave into RGBA
   3166     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   3167     punpcklbw  xmm1, xmm2           // GR
   3168     punpcklbw  xmm5, xmm0           // AB
   3169     movdqa     xmm0, xmm5
   3170     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
   3171     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
   3172     movdqa     [edx], xmm5
   3173     movdqa     [edx + 16], xmm0
   3174     lea        edx,  [edx + 32]
   3175     sub        ecx, 8
   3176     jg         convertloop
   3177 
   3178     pop        edi
   3179     pop        esi
   3180     ret
   3181   }
   3182 }
   3183 
   3184 __declspec(naked) __declspec(align(16))
   3185 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
   3186                                    const uint8* u_buf,
   3187                                    const uint8* v_buf,
   3188                                    uint8* dst_rgba,
   3189                                    int width) {
   3190   __asm {
   3191     push       esi
   3192     push       edi
   3193     mov        eax, [esp + 8 + 4]   // Y
   3194     mov        esi, [esp + 8 + 8]   // U
   3195     mov        edi, [esp + 8 + 12]  // V
   3196     mov        edx, [esp + 8 + 16]  // rgba
   3197     mov        ecx, [esp + 8 + 20]  // width
   3198     sub        edi, esi
   3199     pxor       xmm4, xmm4
   3200 
   3201     align      4
   3202  convertloop:
   3203     READYUV422
   3204     YUVTORGB
   3205 
   3206     // Step 3: Weave into RGBA
   3207     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   3208     punpcklbw  xmm1, xmm2           // GR
   3209     punpcklbw  xmm5, xmm0           // AB
   3210     movdqa     xmm0, xmm5
   3211     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
   3212     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
   3213     movdqu     [edx], xmm5
   3214     movdqu     [edx + 16], xmm0
   3215     lea        edx,  [edx + 32]
   3216     sub        ecx, 8
   3217     jg         convertloop
   3218 
   3219     pop        edi
   3220     pop        esi
   3221     ret
   3222   }
   3223 }
   3224 
   3225 #endif  // HAS_I422TOARGBROW_SSSE3
   3226 
   3227 #ifdef HAS_YTOARGBROW_SSE2
   3228 __declspec(naked) __declspec(align(16))
   3229 void YToARGBRow_SSE2(const uint8* y_buf,
   3230                      uint8* rgb_buf,
   3231                      int width) {
   3232   __asm {
   3233     pxor       xmm5, xmm5
   3234     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
   3235     pslld      xmm4, 24
   3236     mov        eax, 0x00100010
   3237     movd       xmm3, eax
   3238     pshufd     xmm3, xmm3, 0
   3239     mov        eax, 0x004a004a       // 74
   3240     movd       xmm2, eax
    3241     pshufd     xmm2, xmm2, 0
   3242     mov        eax, [esp + 4]       // Y
   3243     mov        edx, [esp + 8]       // rgb
   3244     mov        ecx, [esp + 12]      // width
   3245 
   3246     align      4
   3247  convertloop:
   3248     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   3249     movq       xmm0, qword ptr [eax]
   3250     lea        eax, [eax + 8]
   3251     punpcklbw  xmm0, xmm5           // 0.Y
   3252     psubusw    xmm0, xmm3
   3253     pmullw     xmm0, xmm2
   3254     psrlw      xmm0, 6
   3255     packuswb   xmm0, xmm0           // G
   3256 
   3257     // Step 2: Weave into ARGB
   3258     punpcklbw  xmm0, xmm0           // GG
   3259     movdqa     xmm1, xmm0
   3260     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
   3261     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
   3262     por        xmm0, xmm4
   3263     por        xmm1, xmm4
   3264     movdqa     [edx], xmm0
   3265     movdqa     [edx + 16], xmm1
   3266     lea        edx,  [edx + 32]
   3267     sub        ecx, 8
   3268     jg         convertloop
   3269 
   3270     ret
   3271   }
   3272 }
   3273 #endif  // HAS_YTOARGBROW_SSE2
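
         // Rough plain-C equivalent of YToARGBRow_SSE2 above (illustrative sketch, not
         // part of the original): gray ARGB from Y using the same 6-bit fixed point,
         // (y - 16) * 74 >> 6, where 74 ~= 1.164 * 64.
         static void YToARGBRow_ReferenceSketch(const uint8* y_buf, uint8* rgb_buf,
                                                int width) {
           int x;
           for (x = 0; x < width; ++x) {
             int y = y_buf[x] - 16;
             int g = (y < 0 ? 0 : y) * 74 >> 6;  // psubusw clamps at 0 in the asm.
             if (g > 255) {
               g = 255;                          // packuswb saturates in the asm.
             }
             rgb_buf[x * 4 + 0] = (uint8)g;  // B
             rgb_buf[x * 4 + 1] = (uint8)g;  // G
             rgb_buf[x * 4 + 2] = (uint8)g;  // R
             rgb_buf[x * 4 + 3] = 255u;      // A
           }
         }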
   3274 
   3275 #ifdef HAS_MIRRORROW_SSSE3
   3276 // Shuffle table for reversing the bytes.
   3277 static const uvec8 kShuffleMirror = {
   3278   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   3279 };
   3280 
   3281 __declspec(naked) __declspec(align(16))
   3282 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   3283   __asm {
   3284     mov       eax, [esp + 4]   // src
   3285     mov       edx, [esp + 8]   // dst
   3286     mov       ecx, [esp + 12]  // width
   3287     movdqa    xmm5, kShuffleMirror
   3288     lea       eax, [eax - 16]
   3289 
   3290     align      4
   3291  convertloop:
   3292     movdqa    xmm0, [eax + ecx]
   3293     pshufb    xmm0, xmm5
   3294     sub       ecx, 16
   3295     movdqa    [edx], xmm0
   3296     lea       edx, [edx + 16]
   3297     jg        convertloop
   3298     ret
   3299   }
   3300 }
   3301 #endif  // HAS_MIRRORROW_SSSE3
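
         // Rough plain-C equivalent of the MirrorRow variants in this section
         // (illustrative sketch, not part of the original): the shuffle tables simply
         // reverse byte order within each 16 byte block read from the end of the row.
         static void MirrorRow_ReferenceSketch(const uint8* src, uint8* dst, int width) {
           int x;
           for (x = 0; x < width; ++x) {
             dst[x] = src[width - 1 - x];
           }
         }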
   3302 
   3303 #ifdef HAS_MIRRORROW_AVX2
   3304 // Shuffle table for reversing the bytes.
   3305 static const ulvec8 kShuffleMirror_AVX2 = {
   3306   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
   3307   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   3308 };
   3309 
   3310 __declspec(naked) __declspec(align(16))
   3311 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   3312   __asm {
   3313     mov       eax, [esp + 4]   // src
   3314     mov       edx, [esp + 8]   // dst
   3315     mov       ecx, [esp + 12]  // width
   3316     vmovdqa   ymm5, kShuffleMirror_AVX2
   3317     lea       eax, [eax - 32]
   3318 
   3319     align      4
   3320  convertloop:
   3321     vmovdqu   ymm0, [eax + ecx]
   3322     vpshufb   ymm0, ymm0, ymm5
    3323     vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
   3324     sub       ecx, 32
   3325     vmovdqu   [edx], ymm0
   3326     lea       edx, [edx + 32]
   3327     jg        convertloop
   3328     vzeroupper
   3329     ret
   3330   }
   3331 }
   3332 #endif  // HAS_MIRRORROW_AVX2
   3333 
   3334 #ifdef HAS_MIRRORROW_SSE2
    3335 // The SSE2 version uses movdqu so it can be used on unaligned buffers when
    3336 // the SSSE3 version cannot.
   3337 __declspec(naked) __declspec(align(16))
   3338 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   3339   __asm {
   3340     mov       eax, [esp + 4]   // src
   3341     mov       edx, [esp + 8]   // dst
   3342     mov       ecx, [esp + 12]  // width
   3343     lea       eax, [eax - 16]
   3344 
   3345     align      4
   3346  convertloop:
   3347     movdqu    xmm0, [eax + ecx]
   3348     movdqa    xmm1, xmm0        // swap bytes
   3349     psllw     xmm0, 8
   3350     psrlw     xmm1, 8
   3351     por       xmm0, xmm1
   3352     pshuflw   xmm0, xmm0, 0x1b  // swap words
   3353     pshufhw   xmm0, xmm0, 0x1b
   3354     pshufd    xmm0, xmm0, 0x4e  // swap qwords
   3355     sub       ecx, 16
   3356     movdqu    [edx], xmm0
   3357     lea       edx, [edx + 16]
   3358     jg        convertloop
   3359     ret
   3360   }
   3361 }
   3362 #endif  // HAS_MIRRORROW_SSE2
   3363 
   3364 #ifdef HAS_MIRRORROW_UV_SSSE3
   3365 // Shuffle table for reversing the bytes of UV channels.
   3366 static const uvec8 kShuffleMirrorUV = {
   3367   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
   3368 };
   3369 
   3370 __declspec(naked) __declspec(align(16))
   3371 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
   3372                        int width) {
   3373   __asm {
   3374     push      edi
   3375     mov       eax, [esp + 4 + 4]   // src
   3376     mov       edx, [esp + 4 + 8]   // dst_u
   3377     mov       edi, [esp + 4 + 12]  // dst_v
   3378     mov       ecx, [esp + 4 + 16]  // width
   3379     movdqa    xmm1, kShuffleMirrorUV
   3380     lea       eax, [eax + ecx * 2 - 16]
   3381     sub       edi, edx
   3382 
   3383     align      4
   3384  convertloop:
   3385     movdqa    xmm0, [eax]
   3386     lea       eax, [eax - 16]
   3387     pshufb    xmm0, xmm1
   3388     sub       ecx, 8
   3389     movlpd    qword ptr [edx], xmm0
   3390     movhpd    qword ptr [edx + edi], xmm0
   3391     lea       edx, [edx + 8]
   3392     jg        convertloop
   3393 
   3394     pop       edi
   3395     ret
   3396   }
   3397 }
   3398 #endif  // HAS_MIRRORROW_UV_SSSE3
   3399 
   3400 #ifdef HAS_ARGBMIRRORROW_SSSE3
    3401 // Shuffle table for reversing the ARGB pixels (4 bytes each).
   3402 static const uvec8 kARGBShuffleMirror = {
   3403   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
   3404 };
   3405 
   3406 __declspec(naked) __declspec(align(16))
   3407 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   3408   __asm {
   3409     mov       eax, [esp + 4]   // src
   3410     mov       edx, [esp + 8]   // dst
   3411     mov       ecx, [esp + 12]  // width
   3412     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
   3413     movdqa    xmm5, kARGBShuffleMirror
   3414 
   3415     align      4
   3416  convertloop:
   3417     movdqa    xmm0, [eax]
   3418     lea       eax, [eax - 16]
   3419     pshufb    xmm0, xmm5
   3420     sub       ecx, 4
   3421     movdqa    [edx], xmm0
   3422     lea       edx, [edx + 16]
   3423     jg        convertloop
   3424     ret
   3425   }
   3426 }
   3427 #endif  // HAS_ARGBMIRRORROW_SSSE3
   3428 
   3429 #ifdef HAS_ARGBMIRRORROW_AVX2
    3430 // Permute table for reversing the ARGB pixels (dwords).
   3431 static const ulvec32 kARGBShuffleMirror_AVX2 = {
   3432   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   3433 };
   3434 
   3435 __declspec(naked) __declspec(align(16))
   3436 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   3437   __asm {
   3438     mov       eax, [esp + 4]   // src
   3439     mov       edx, [esp + 8]   // dst
   3440     mov       ecx, [esp + 12]  // width
   3441     lea       eax, [eax - 32]
   3442     vmovdqa   ymm5, kARGBShuffleMirror_AVX2
   3443 
   3444     align      4
   3445  convertloop:
   3446     vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
   3447     sub       ecx, 8
   3448     vmovdqu   [edx], ymm0
   3449     lea       edx, [edx + 32]
   3450     jg        convertloop
   3451     vzeroupper
   3452     ret
   3453   }
   3454 }
   3455 #endif  // HAS_ARGBMIRRORROW_AVX2
   3456 
   3457 #ifdef HAS_SPLITUVROW_SSE2
   3458 __declspec(naked) __declspec(align(16))
   3459 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   3460   __asm {
   3461     push       edi
   3462     mov        eax, [esp + 4 + 4]    // src_uv
   3463     mov        edx, [esp + 4 + 8]    // dst_u
   3464     mov        edi, [esp + 4 + 12]   // dst_v
   3465     mov        ecx, [esp + 4 + 16]   // pix
   3466     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3467     psrlw      xmm5, 8
   3468     sub        edi, edx
   3469 
   3470     align      4
   3471   convertloop:
   3472     movdqa     xmm0, [eax]
   3473     movdqa     xmm1, [eax + 16]
   3474     lea        eax,  [eax + 32]
   3475     movdqa     xmm2, xmm0
   3476     movdqa     xmm3, xmm1
   3477     pand       xmm0, xmm5   // even bytes
   3478     pand       xmm1, xmm5
   3479     packuswb   xmm0, xmm1
   3480     psrlw      xmm2, 8      // odd bytes
   3481     psrlw      xmm3, 8
   3482     packuswb   xmm2, xmm3
   3483     movdqa     [edx], xmm0
   3484     movdqa     [edx + edi], xmm2
   3485     lea        edx, [edx + 16]
   3486     sub        ecx, 16
   3487     jg         convertloop
   3488 
   3489     pop        edi
   3490     ret
   3491   }
   3492 }
   3493 
   3494 __declspec(naked) __declspec(align(16))
   3495 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
   3496                                int pix) {
   3497   __asm {
   3498     push       edi
   3499     mov        eax, [esp + 4 + 4]    // src_uv
   3500     mov        edx, [esp + 4 + 8]    // dst_u
   3501     mov        edi, [esp + 4 + 12]   // dst_v
   3502     mov        ecx, [esp + 4 + 16]   // pix
   3503     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3504     psrlw      xmm5, 8
   3505     sub        edi, edx
   3506 
   3507     align      4
   3508   convertloop:
   3509     movdqu     xmm0, [eax]
   3510     movdqu     xmm1, [eax + 16]
   3511     lea        eax,  [eax + 32]
   3512     movdqa     xmm2, xmm0
   3513     movdqa     xmm3, xmm1
   3514     pand       xmm0, xmm5   // even bytes
   3515     pand       xmm1, xmm5
   3516     packuswb   xmm0, xmm1
   3517     psrlw      xmm2, 8      // odd bytes
   3518     psrlw      xmm3, 8
   3519     packuswb   xmm2, xmm3
   3520     movdqu     [edx], xmm0
   3521     movdqu     [edx + edi], xmm2
   3522     lea        edx, [edx + 16]
   3523     sub        ecx, 16
   3524     jg         convertloop
   3525 
   3526     pop        edi
   3527     ret
   3528   }
   3529 }
   3530 #endif  // HAS_SPLITUVROW_SSE2
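
         // Rough plain-C equivalent of SplitUVRow above (illustrative sketch, not part
         // of the original): even bytes of the interleaved UV plane are U, odd bytes
         // are V; the asm gets the same effect with a 0x00ff mask and a word shift.
         static void SplitUVRow_ReferenceSketch(const uint8* src_uv, uint8* dst_u,
                                                uint8* dst_v, int pix) {
           int x;
           for (x = 0; x < pix; ++x) {
             dst_u[x] = src_uv[x * 2 + 0];
             dst_v[x] = src_uv[x * 2 + 1];
           }
         }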
   3531 
   3532 #ifdef HAS_SPLITUVROW_AVX2
   3533 __declspec(naked) __declspec(align(16))
   3534 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   3535   __asm {
   3536     push       edi
   3537     mov        eax, [esp + 4 + 4]    // src_uv
   3538     mov        edx, [esp + 4 + 8]    // dst_u
   3539     mov        edi, [esp + 4 + 12]   // dst_v
   3540     mov        ecx, [esp + 4 + 16]   // pix
   3541     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3542     vpsrlw     ymm5, ymm5, 8
   3543     sub        edi, edx
   3544 
   3545     align      4
   3546   convertloop:
   3547     vmovdqu    ymm0, [eax]
   3548     vmovdqu    ymm1, [eax + 32]
   3549     lea        eax,  [eax + 64]
   3550     vpsrlw     ymm2, ymm0, 8      // odd bytes
   3551     vpsrlw     ymm3, ymm1, 8
   3552     vpand      ymm0, ymm0, ymm5   // even bytes
   3553     vpand      ymm1, ymm1, ymm5
   3554     vpackuswb  ymm0, ymm0, ymm1
   3555     vpackuswb  ymm2, ymm2, ymm3
   3556     vpermq     ymm0, ymm0, 0xd8
   3557     vpermq     ymm2, ymm2, 0xd8
   3558     vmovdqu    [edx], ymm0
   3559     vmovdqu    [edx + edi], ymm2
   3560     lea        edx, [edx + 32]
   3561     sub        ecx, 32
   3562     jg         convertloop
   3563 
   3564     pop        edi
   3565     vzeroupper
   3566     ret
   3567   }
   3568 }
   3569 #endif  // HAS_SPLITUVROW_AVX2
   3570 
   3571 #ifdef HAS_MERGEUVROW_SSE2
   3572 __declspec(naked) __declspec(align(16))
   3573 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   3574                      int width) {
   3575   __asm {
   3576     push       edi
   3577     mov        eax, [esp + 4 + 4]    // src_u
   3578     mov        edx, [esp + 4 + 8]    // src_v
   3579     mov        edi, [esp + 4 + 12]   // dst_uv
   3580     mov        ecx, [esp + 4 + 16]   // width
   3581     sub        edx, eax
   3582 
   3583     align      4
   3584   convertloop:
   3585     movdqa     xmm0, [eax]      // read 16 U's
   3586     movdqa     xmm1, [eax + edx]  // and 16 V's
   3587     lea        eax,  [eax + 16]
   3588     movdqa     xmm2, xmm0
   3589     punpcklbw  xmm0, xmm1       // first 8 UV pairs
   3590     punpckhbw  xmm2, xmm1       // next 8 UV pairs
   3591     movdqa     [edi], xmm0
   3592     movdqa     [edi + 16], xmm2
   3593     lea        edi, [edi + 32]
   3594     sub        ecx, 16
   3595     jg         convertloop
   3596 
   3597     pop        edi
   3598     ret
   3599   }
   3600 }
   3601 
   3602 __declspec(naked) __declspec(align(16))
   3603 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
   3604                                uint8* dst_uv, int width) {
   3605   __asm {
   3606     push       edi
   3607     mov        eax, [esp + 4 + 4]    // src_u
   3608     mov        edx, [esp + 4 + 8]    // src_v
   3609     mov        edi, [esp + 4 + 12]   // dst_uv
   3610     mov        ecx, [esp + 4 + 16]   // width
   3611     sub        edx, eax
   3612 
   3613     align      4
   3614   convertloop:
   3615     movdqu     xmm0, [eax]      // read 16 U's
   3616     movdqu     xmm1, [eax + edx]  // and 16 V's
   3617     lea        eax,  [eax + 16]
   3618     movdqa     xmm2, xmm0
   3619     punpcklbw  xmm0, xmm1       // first 8 UV pairs
   3620     punpckhbw  xmm2, xmm1       // next 8 UV pairs
   3621     movdqu     [edi], xmm0
   3622     movdqu     [edi + 16], xmm2
   3623     lea        edi, [edi + 32]
   3624     sub        ecx, 16
   3625     jg         convertloop
   3626 
   3627     pop        edi
   3628     ret
   3629   }
   3630 }
   3631 #endif  //  HAS_MERGEUVROW_SSE2
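
         // Rough plain-C equivalent of MergeUVRow above (illustrative sketch, not part
         // of the original): punpcklbw/punpckhbw interleave the two planes back into
         // UVUV order.
         static void MergeUVRow_ReferenceSketch(const uint8* src_u, const uint8* src_v,
                                                uint8* dst_uv, int width) {
           int x;
           for (x = 0; x < width; ++x) {
             dst_uv[x * 2 + 0] = src_u[x];
             dst_uv[x * 2 + 1] = src_v[x];
           }
         }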
   3632 
   3633 #ifdef HAS_MERGEUVROW_AVX2
   3634 __declspec(naked) __declspec(align(16))
   3635 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   3636                      int width) {
   3637   __asm {
   3638     push       edi
   3639     mov        eax, [esp + 4 + 4]    // src_u
   3640     mov        edx, [esp + 4 + 8]    // src_v
   3641     mov        edi, [esp + 4 + 12]   // dst_uv
   3642     mov        ecx, [esp + 4 + 16]   // width
   3643     sub        edx, eax
   3644 
   3645     align      4
   3646   convertloop:
   3647     vmovdqu    ymm0, [eax]           // read 32 U's
   3648     vmovdqu    ymm1, [eax + edx]     // and 32 V's
   3649     lea        eax,  [eax + 32]
   3650     vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
   3651     vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
   3652     vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
   3653     vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
   3654     vmovdqu    [edi], ymm1
   3655     vmovdqu    [edi + 32], ymm2
   3656     lea        edi, [edi + 64]
   3657     sub        ecx, 32
   3658     jg         convertloop
   3659 
   3660     pop        edi
   3661     vzeroupper
   3662     ret
   3663   }
   3664 }
   3665 #endif  //  HAS_MERGEUVROW_AVX2
   3666 
   3667 #ifdef HAS_COPYROW_SSE2
    3668 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
   3669 __declspec(naked) __declspec(align(16))
   3670 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   3671   __asm {
   3672     mov        eax, [esp + 4]   // src
   3673     mov        edx, [esp + 8]   // dst
   3674     mov        ecx, [esp + 12]  // count
   3675 
   3676     align      4
   3677   convertloop:
   3678     movdqa     xmm0, [eax]
   3679     movdqa     xmm1, [eax + 16]
   3680     lea        eax, [eax + 32]
   3681     movdqa     [edx], xmm0
   3682     movdqa     [edx + 16], xmm1
   3683     lea        edx, [edx + 32]
   3684     sub        ecx, 32
   3685     jg         convertloop
   3686     ret
   3687   }
   3688 }
   3689 #endif  // HAS_COPYROW_SSE2
   3690 
    3691 // Unaligned copy of any byte count (multiple of 1), using rep movsb (ERMS).
   3692 __declspec(naked) __declspec(align(16))
   3693 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   3694   __asm {
   3695     mov        eax, esi
   3696     mov        edx, edi
   3697     mov        esi, [esp + 4]   // src
   3698     mov        edi, [esp + 8]   // dst
   3699     mov        ecx, [esp + 12]  // count
   3700     rep movsb
   3701     mov        edi, edx
   3702     mov        esi, eax
   3703     ret
   3704   }
   3705 }
   3706 
   3707 #ifdef HAS_COPYROW_X86
   3708 __declspec(naked) __declspec(align(16))
   3709 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
   3710   __asm {
   3711     mov        eax, esi
   3712     mov        edx, edi
   3713     mov        esi, [esp + 4]   // src
   3714     mov        edi, [esp + 8]   // dst
   3715     mov        ecx, [esp + 12]  // count
   3716     shr        ecx, 2
   3717     rep movsd
   3718     mov        edi, edx
   3719     mov        esi, eax
   3720     ret
   3721   }
   3722 }
   3723 #endif  // HAS_COPYROW_X86
   3724 
   3725 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
   3726 // width in pixels
   3727 __declspec(naked) __declspec(align(16))
   3728 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   3729   __asm {
   3730     mov        eax, [esp + 4]   // src
   3731     mov        edx, [esp + 8]   // dst
   3732     mov        ecx, [esp + 12]  // count
   3733     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
   3734     pslld      xmm0, 24
   3735     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
   3736     psrld      xmm1, 8
   3737 
   3738     align      4
   3739   convertloop:
   3740     movdqa     xmm2, [eax]
   3741     movdqa     xmm3, [eax + 16]
   3742     lea        eax, [eax + 32]
   3743     movdqa     xmm4, [edx]
   3744     movdqa     xmm5, [edx + 16]
   3745     pand       xmm2, xmm0
   3746     pand       xmm3, xmm0
   3747     pand       xmm4, xmm1
   3748     pand       xmm5, xmm1
   3749     por        xmm2, xmm4
   3750     por        xmm3, xmm5
   3751     movdqa     [edx], xmm2
   3752     movdqa     [edx + 16], xmm3
   3753     lea        edx, [edx + 32]
   3754     sub        ecx, 8
   3755     jg         convertloop
   3756 
   3757     ret
   3758   }
   3759 }
   3760 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
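
         // Rough plain-C equivalent of ARGBCopyAlphaRow above (illustrative sketch,
         // not part of the original): copy only the alpha byte of each source pixel,
         // which the asm does with the 0xff000000 / 0x00ffffff masks.
         static void ARGBCopyAlphaRow_ReferenceSketch(const uint8* src, uint8* dst,
                                                      int width) {
           int x;
           for (x = 0; x < width; ++x) {
             dst[x * 4 + 3] = src[x * 4 + 3];  // A; B, G, R in dst are left untouched.
           }
         }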
   3761 
   3762 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
   3763 // width in pixels
   3764 __declspec(naked) __declspec(align(16))
   3765 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   3766   __asm {
   3767     mov        eax, [esp + 4]   // src
   3768     mov        edx, [esp + 8]   // dst
   3769     mov        ecx, [esp + 12]  // count
   3770     vpcmpeqb   ymm0, ymm0, ymm0
   3771     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
   3772 
   3773     align      4
   3774   convertloop:
   3775     vmovdqu    ymm1, [eax]
   3776     vmovdqu    ymm2, [eax + 32]
   3777     lea        eax, [eax + 64]
   3778     vpblendvb  ymm1, ymm1, [edx], ymm0
   3779     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3780     vmovdqu    [edx], ymm1
   3781     vmovdqu    [edx + 32], ymm2
   3782     lea        edx, [edx + 64]
   3783     sub        ecx, 16
   3784     jg         convertloop
   3785 
   3786     vzeroupper
   3787     ret
   3788   }
   3789 }
   3790 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
   3791 
   3792 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
   3793 // width in pixels
   3794 __declspec(naked) __declspec(align(16))
   3795 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   3796   __asm {
   3797     mov        eax, [esp + 4]   // src
   3798     mov        edx, [esp + 8]   // dst
   3799     mov        ecx, [esp + 12]  // count
   3800     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
   3801     pslld      xmm0, 24
   3802     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
   3803     psrld      xmm1, 8
   3804 
   3805     align      4
   3806   convertloop:
   3807     movq       xmm2, qword ptr [eax]  // 8 Y's
   3808     lea        eax, [eax + 8]
   3809     punpcklbw  xmm2, xmm2
   3810     punpckhwd  xmm3, xmm2
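             // Note: the low words interleaved here come from whatever was left in
             // xmm3; they are don't-care values because the 0xff000000 mask in xmm0
             // keeps only the high byte of each dword (the duplicated Y) below.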
   3811     punpcklwd  xmm2, xmm2
   3812     movdqa     xmm4, [edx]
   3813     movdqa     xmm5, [edx + 16]
   3814     pand       xmm2, xmm0
   3815     pand       xmm3, xmm0
   3816     pand       xmm4, xmm1
   3817     pand       xmm5, xmm1
   3818     por        xmm2, xmm4
   3819     por        xmm3, xmm5
   3820     movdqa     [edx], xmm2
   3821     movdqa     [edx + 16], xmm3
   3822     lea        edx, [edx + 32]
   3823     sub        ecx, 8
   3824     jg         convertloop
   3825 
   3826     ret
   3827   }
   3828 }
   3829 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
   3830 
   3831 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
   3832 // width in pixels
   3833 __declspec(naked) __declspec(align(16))
   3834 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   3835   __asm {
   3836     mov        eax, [esp + 4]   // src
   3837     mov        edx, [esp + 8]   // dst
   3838     mov        ecx, [esp + 12]  // count
   3839     vpcmpeqb   ymm0, ymm0, ymm0
   3840     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
   3841 
   3842     align      4
   3843   convertloop:
   3844     vpmovzxbd  ymm1, qword ptr [eax]
   3845     vpmovzxbd  ymm2, qword ptr [eax + 8]
   3846     lea        eax, [eax + 16]
   3847     vpslld     ymm1, ymm1, 24
   3848     vpslld     ymm2, ymm2, 24
   3849     vpblendvb  ymm1, ymm1, [edx], ymm0
   3850     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3851     vmovdqu    [edx], ymm1
   3852     vmovdqu    [edx + 32], ymm2
   3853     lea        edx, [edx + 64]
   3854     sub        ecx, 16
   3855     jg         convertloop
   3856 
   3857     vzeroupper
   3858     ret
   3859   }
   3860 }
   3861 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
   3862 
   3863 #ifdef HAS_SETROW_X86
    3864 // SetRow writes 'count' bytes using a 32 bit value repeated (count / 4 dwords
    3865 // via rep stosd).
   3865 __declspec(naked) __declspec(align(16))
   3866 void SetRow_X86(uint8* dst, uint32 v32, int count) {
   3867   __asm {
   3868     mov        edx, edi
   3869     mov        edi, [esp + 4]   // dst
   3870     mov        eax, [esp + 8]   // v32
   3871     mov        ecx, [esp + 12]  // count
   3872     shr        ecx, 2
   3873     rep stosd
   3874     mov        edi, edx
   3875     ret
   3876   }
   3877 }
   3878 
    3879 // ARGBSetRows writes a 32 bit value to 'width' pixels per row, for 'height'
    3880 // rows, stepping by 'dst_stride' bytes between rows.
   3880 __declspec(naked) __declspec(align(16))
   3881 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
   3882                    int dst_stride, int height) {
   3883   __asm {
   3884     push       esi
   3885     push       edi
   3886     push       ebp
   3887     mov        edi, [esp + 12 + 4]   // dst
   3888     mov        eax, [esp + 12 + 8]   // v32
   3889     mov        ebp, [esp + 12 + 12]  // width
   3890     mov        edx, [esp + 12 + 16]  // dst_stride
   3891     mov        esi, [esp + 12 + 20]  // height
   3892     lea        ecx, [ebp * 4]
   3893     sub        edx, ecx             // stride - width * 4
   3894 
   3895     align      4
   3896   convertloop:
   3897     mov        ecx, ebp
   3898     rep stosd
   3899     add        edi, edx
   3900     sub        esi, 1
   3901     jg         convertloop
   3902 
   3903     pop        ebp
   3904     pop        edi
   3905     pop        esi
   3906     ret
   3907   }
   3908 }
   3909 #endif  // HAS_SETROW_X86
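
         // Rough plain-C equivalent of ARGBSetRows_X86 above (illustrative sketch, not
         // part of the original): store one 32 bit value per pixel, width pixels per
         // row, for height rows, stepping by dst_stride bytes between rows.
         static void ARGBSetRows_ReferenceSketch(uint8* dst, uint32 v32, int width,
                                                 int dst_stride, int height) {
           int x;
           int y;
           for (y = 0; y < height; ++y) {
             uint32* d = (uint32*)(dst + y * dst_stride);
             for (x = 0; x < width; ++x) {
               d[x] = v32;
             }
           }
         }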
   3910 
   3911 #ifdef HAS_YUY2TOYROW_AVX2
   3912 __declspec(naked) __declspec(align(16))
   3913 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
   3914                      uint8* dst_y, int pix) {
   3915   __asm {
   3916     mov        eax, [esp + 4]    // src_yuy2
   3917     mov        edx, [esp + 8]    // dst_y
   3918     mov        ecx, [esp + 12]   // pix
   3919     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3920     vpsrlw     ymm5, ymm5, 8
   3921 
   3922     align      4
   3923   convertloop:
   3924     vmovdqu    ymm0, [eax]
   3925     vmovdqu    ymm1, [eax + 32]
   3926     lea        eax,  [eax + 64]
   3927     vpand      ymm0, ymm0, ymm5   // even bytes are Y
   3928     vpand      ymm1, ymm1, ymm5
   3929     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3930     vpermq     ymm0, ymm0, 0xd8
   3931     sub        ecx, 32
   3932     vmovdqu    [edx], ymm0
   3933     lea        edx, [edx + 32]
   3934     jg         convertloop
   3935     vzeroupper
   3936     ret
   3937   }
   3938 }
   3939 
   3940 __declspec(naked) __declspec(align(16))
   3941 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
   3942                       uint8* dst_u, uint8* dst_v, int pix) {
   3943   __asm {
   3944     push       esi
   3945     push       edi
   3946     mov        eax, [esp + 8 + 4]    // src_yuy2
   3947     mov        esi, [esp + 8 + 8]    // stride_yuy2
   3948     mov        edx, [esp + 8 + 12]   // dst_u
   3949     mov        edi, [esp + 8 + 16]   // dst_v
   3950     mov        ecx, [esp + 8 + 20]   // pix
   3951     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3952     vpsrlw     ymm5, ymm5, 8
   3953     sub        edi, edx
   3954 
   3955     align      4
   3956   convertloop:
   3957     vmovdqu    ymm0, [eax]
   3958     vmovdqu    ymm1, [eax + 32]
   3959     vpavgb     ymm0, ymm0, [eax + esi]
   3960     vpavgb     ymm1, ymm1, [eax + esi + 32]
   3961     lea        eax,  [eax + 64]
   3962     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
   3963     vpsrlw     ymm1, ymm1, 8
   3964     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3965     vpermq     ymm0, ymm0, 0xd8
   3966     vpand      ymm1, ymm0, ymm5  // U
   3967     vpsrlw     ymm0, ymm0, 8     // V
   3968     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3969     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3970     vpermq     ymm1, ymm1, 0xd8
   3971     vpermq     ymm0, ymm0, 0xd8
   3972     vextractf128 [edx], ymm1, 0  // U
   3973     vextractf128 [edx + edi], ymm0, 0 // V
   3974     lea        edx, [edx + 16]
   3975     sub        ecx, 32
   3976     jg         convertloop
   3977 
   3978     pop        edi
   3979     pop        esi
   3980     vzeroupper
   3981     ret
   3982   }
   3983 }
   3984 
   3985 __declspec(naked) __declspec(align(16))
   3986 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
   3987                          uint8* dst_u, uint8* dst_v, int pix) {
   3988   __asm {
   3989     push       edi
   3990     mov        eax, [esp + 4 + 4]    // src_yuy2
   3991     mov        edx, [esp + 4 + 8]    // dst_u
   3992     mov        edi, [esp + 4 + 12]   // dst_v
   3993     mov        ecx, [esp + 4 + 16]   // pix
   3994     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3995     vpsrlw     ymm5, ymm5, 8
   3996     sub        edi, edx
   3997 
   3998     align      4
   3999   convertloop:
   4000     vmovdqu    ymm0, [eax]
   4001     vmovdqu    ymm1, [eax + 32]
   4002     lea        eax,  [eax + 64]
   4003     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
   4004     vpsrlw     ymm1, ymm1, 8
   4005     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   4006     vpermq     ymm0, ymm0, 0xd8
   4007     vpand      ymm1, ymm0, ymm5  // U
   4008     vpsrlw     ymm0, ymm0, 8     // V
   4009     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   4010     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   4011     vpermq     ymm1, ymm1, 0xd8
   4012     vpermq     ymm0, ymm0, 0xd8
   4013     vextractf128 [edx], ymm1, 0  // U
   4014     vextractf128 [edx + edi], ymm0, 0 // V
   4015     lea        edx, [edx + 16]
   4016     sub        ecx, 32
   4017     jg         convertloop
   4018 
   4019     pop        edi
   4020     vzeroupper
   4021     ret
   4022   }
   4023 }
   4024 
   4025 __declspec(naked) __declspec(align(16))
   4026 void UYVYToYRow_AVX2(const uint8* src_uyvy,
   4027                      uint8* dst_y, int pix) {
   4028   __asm {
   4029     mov        eax, [esp + 4]    // src_uyvy
   4030     mov        edx, [esp + 8]    // dst_y
   4031     mov        ecx, [esp + 12]   // pix
   4032 
   4033     align      4
   4034   convertloop:
   4035     vmovdqu    ymm0, [eax]
   4036     vmovdqu    ymm1, [eax + 32]
   4037     lea        eax,  [eax + 64]
   4038     vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
   4039     vpsrlw     ymm1, ymm1, 8
   4040     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   4041     vpermq     ymm0, ymm0, 0xd8
   4042     sub        ecx, 32
   4043     vmovdqu    [edx], ymm0
   4044     lea        edx, [edx + 32]
   4045     jg         convertloop
    4046     vzeroupper
    4047     ret
   4048   }
   4049 }
   4050 
   4051 __declspec(naked) __declspec(align(16))
   4052 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
   4053                       uint8* dst_u, uint8* dst_v, int pix) {
   4054   __asm {
   4055     push       esi
   4056     push       edi
    4057     mov        eax, [esp + 8 + 4]    // src_uyvy
    4058     mov        esi, [esp + 8 + 8]    // stride_uyvy
   4059     mov        edx, [esp + 8 + 12]   // dst_u
   4060     mov        edi, [esp + 8 + 16]   // dst_v
   4061     mov        ecx, [esp + 8 + 20]   // pix
   4062     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   4063     vpsrlw     ymm5, ymm5, 8
   4064     sub        edi, edx
   4065 
   4066     align      4
   4067   convertloop:
   4068     vmovdqu    ymm0, [eax]
   4069     vmovdqu    ymm1, [eax + 32]
   4070     vpavgb     ymm0, ymm0, [eax + esi]
   4071     vpavgb     ymm1, ymm1, [eax + esi + 32]
   4072     lea        eax,  [eax + 64]
   4073     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
   4074     vpand      ymm1, ymm1, ymm5
   4075     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   4076     vpermq     ymm0, ymm0, 0xd8
   4077     vpand      ymm1, ymm0, ymm5  // U
   4078     vpsrlw     ymm0, ymm0, 8     // V
   4079     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   4080     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   4081     vpermq     ymm1, ymm1, 0xd8
   4082     vpermq     ymm0, ymm0, 0xd8
   4083     vextractf128 [edx], ymm1, 0  // U
   4084     vextractf128 [edx + edi], ymm0, 0 // V
   4085     lea        edx, [edx + 16]
   4086     sub        ecx, 32
   4087     jg         convertloop
   4088 
   4089     pop        edi
   4090     pop        esi
   4091     vzeroupper
   4092     ret
   4093   }
   4094 }
   4095 
   4096 __declspec(naked) __declspec(align(16))
   4097 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
   4098                          uint8* dst_u, uint8* dst_v, int pix) {
   4099   __asm {
   4100     push       edi
    4101     mov        eax, [esp + 4 + 4]    // src_uyvy
   4102     mov        edx, [esp + 4 + 8]    // dst_u
   4103     mov        edi, [esp + 4 + 12]   // dst_v
   4104     mov        ecx, [esp + 4 + 16]   // pix
   4105     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   4106     vpsrlw     ymm5, ymm5, 8
   4107     sub        edi, edx
   4108 
   4109     align      4
   4110   convertloop:
   4111     vmovdqu    ymm0, [eax]
   4112     vmovdqu    ymm1, [eax + 32]
   4113     lea        eax,  [eax + 64]
   4114     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
   4115     vpand      ymm1, ymm1, ymm5
   4116     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   4117     vpermq     ymm0, ymm0, 0xd8
   4118     vpand      ymm1, ymm0, ymm5  // U
   4119     vpsrlw     ymm0, ymm0, 8     // V
   4120     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   4121     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   4122     vpermq     ymm1, ymm1, 0xd8
   4123     vpermq     ymm0, ymm0, 0xd8
   4124     vextractf128 [edx], ymm1, 0  // U
   4125     vextractf128 [edx + edi], ymm0, 0 // V
   4126     lea        edx, [edx + 16]
   4127     sub        ecx, 32
   4128     jg         convertloop
   4129 
   4130     pop        edi
   4131     vzeroupper
   4132     ret
   4133   }
   4134 }
   4135 #endif  // HAS_YUY2TOYROW_AVX2
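
         // Rough plain-C sketches for the YUY2/UYVY rows in this part of the file
         // (illustrative, not part of the original).  YUY2 stores bytes as Y0 U Y1 V
         // and UYVY as U Y0 V Y1; the SIMD code masks or shifts to select the wanted
         // bytes and, for the ToUVRow variants, averages two source rows with pavgb.
         static void YUY2ToYRow_ReferenceSketch(const uint8* src_yuy2, uint8* dst_y,
                                                int pix) {
           int x;
           for (x = 0; x < pix; ++x) {
             dst_y[x] = src_yuy2[x * 2];  // even bytes are Y.
           }
         }

         static void YUY2ToUVRow_ReferenceSketch(const uint8* src_yuy2, int stride_yuy2,
                                                 uint8* dst_u, uint8* dst_v, int pix) {
           const uint8* next_row = src_yuy2 + stride_yuy2;
           int x;
           for (x = 0; x < pix; x += 2) {
             // One U and one V per two pixels; average vertically with rounding,
             // matching pavgb's (a + b + 1) >> 1.
             dst_u[x / 2] = (uint8)((src_yuy2[x * 2 + 1] + next_row[x * 2 + 1] + 1) >> 1);
             dst_v[x / 2] = (uint8)((src_yuy2[x * 2 + 3] + next_row[x * 2 + 3] + 1) >> 1);
           }
         }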
   4136 
   4137 #ifdef HAS_YUY2TOYROW_SSE2
   4138 __declspec(naked) __declspec(align(16))
   4139 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   4140                      uint8* dst_y, int pix) {
   4141   __asm {
   4142     mov        eax, [esp + 4]    // src_yuy2
   4143     mov        edx, [esp + 8]    // dst_y
   4144     mov        ecx, [esp + 12]   // pix
   4145     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
   4146     psrlw      xmm5, 8
   4147 
   4148     align      4
   4149   convertloop:
   4150     movdqa     xmm0, [eax]
   4151     movdqa     xmm1, [eax + 16]
   4152     lea        eax,  [eax + 32]
   4153     pand       xmm0, xmm5   // even bytes are Y
   4154     pand       xmm1, xmm5
   4155     packuswb   xmm0, xmm1
   4156     sub        ecx, 16
   4157     movdqa     [edx], xmm0
   4158     lea        edx, [edx + 16]
   4159     jg         convertloop
   4160     ret
   4161   }
   4162 }
   4163 
   4164 __declspec(naked) __declspec(align(16))
   4165 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   4166                       uint8* dst_u, uint8* dst_v, int pix) {
   4167   __asm {
   4168     push       esi
   4169     push       edi
   4170     mov        eax, [esp + 8 + 4]    // src_yuy2
   4171     mov        esi, [esp + 8 + 8]    // stride_yuy2
   4172     mov        edx, [esp + 8 + 12]   // dst_u
   4173     mov        edi, [esp + 8 + 16]   // dst_v
   4174     mov        ecx, [esp + 8 + 20]   // pix
   4175     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4176     psrlw      xmm5, 8
   4177     sub        edi, edx
   4178 
   4179     align      4
   4180   convertloop:
   4181     movdqa     xmm0, [eax]
   4182     movdqa     xmm1, [eax + 16]
   4183     movdqa     xmm2, [eax + esi]
   4184     movdqa     xmm3, [eax + esi + 16]
   4185     lea        eax,  [eax + 32]
   4186     pavgb      xmm0, xmm2
   4187     pavgb      xmm1, xmm3
   4188     psrlw      xmm0, 8      // YUYV -> UVUV
   4189     psrlw      xmm1, 8
   4190     packuswb   xmm0, xmm1
   4191     movdqa     xmm1, xmm0
   4192     pand       xmm0, xmm5  // U
   4193     packuswb   xmm0, xmm0
   4194     psrlw      xmm1, 8     // V
   4195     packuswb   xmm1, xmm1
   4196     movq       qword ptr [edx], xmm0
   4197     movq       qword ptr [edx + edi], xmm1
   4198     lea        edx, [edx + 8]
   4199     sub        ecx, 16
   4200     jg         convertloop
   4201 
   4202     pop        edi
   4203     pop        esi
   4204     ret
   4205   }
   4206 }
   4207 
   4208 __declspec(naked) __declspec(align(16))
   4209 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   4210                          uint8* dst_u, uint8* dst_v, int pix) {
   4211   __asm {
   4212     push       edi
   4213     mov        eax, [esp + 4 + 4]    // src_yuy2
   4214     mov        edx, [esp + 4 + 8]    // dst_u
   4215     mov        edi, [esp + 4 + 12]   // dst_v
   4216     mov        ecx, [esp + 4 + 16]   // pix
   4217     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4218     psrlw      xmm5, 8
   4219     sub        edi, edx
   4220 
   4221     align      4
   4222   convertloop:
   4223     movdqa     xmm0, [eax]
   4224     movdqa     xmm1, [eax + 16]
   4225     lea        eax,  [eax + 32]
   4226     psrlw      xmm0, 8      // YUYV -> UVUV
   4227     psrlw      xmm1, 8
   4228     packuswb   xmm0, xmm1
   4229     movdqa     xmm1, xmm0
   4230     pand       xmm0, xmm5  // U
   4231     packuswb   xmm0, xmm0
   4232     psrlw      xmm1, 8     // V
   4233     packuswb   xmm1, xmm1
   4234     movq       qword ptr [edx], xmm0
   4235     movq       qword ptr [edx + edi], xmm1
   4236     lea        edx, [edx + 8]
   4237     sub        ecx, 16
   4238     jg         convertloop
   4239 
   4240     pop        edi
   4241     ret
   4242   }
   4243 }
   4244 
   4245 __declspec(naked) __declspec(align(16))
   4246 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
   4247                                uint8* dst_y, int pix) {
   4248   __asm {
   4249     mov        eax, [esp + 4]    // src_yuy2
   4250     mov        edx, [esp + 8]    // dst_y
   4251     mov        ecx, [esp + 12]   // pix
   4252     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
   4253     psrlw      xmm5, 8
   4254 
   4255     align      4
   4256   convertloop:
   4257     movdqu     xmm0, [eax]
   4258     movdqu     xmm1, [eax + 16]
   4259     lea        eax,  [eax + 32]
   4260     pand       xmm0, xmm5   // even bytes are Y
   4261     pand       xmm1, xmm5
   4262     packuswb   xmm0, xmm1
   4263     sub        ecx, 16
   4264     movdqu     [edx], xmm0
   4265     lea        edx, [edx + 16]
   4266     jg         convertloop
   4267     ret
   4268   }
   4269 }
   4270 
   4271 __declspec(naked) __declspec(align(16))
   4272 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
   4273                                 uint8* dst_u, uint8* dst_v, int pix) {
   4274   __asm {
   4275     push       esi
   4276     push       edi
   4277     mov        eax, [esp + 8 + 4]    // src_yuy2
   4278     mov        esi, [esp + 8 + 8]    // stride_yuy2
   4279     mov        edx, [esp + 8 + 12]   // dst_u
   4280     mov        edi, [esp + 8 + 16]   // dst_v
   4281     mov        ecx, [esp + 8 + 20]   // pix
   4282     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4283     psrlw      xmm5, 8
   4284     sub        edi, edx
   4285 
   4286     align      4
   4287   convertloop:
   4288     movdqu     xmm0, [eax]
   4289     movdqu     xmm1, [eax + 16]
   4290     movdqu     xmm2, [eax + esi]
   4291     movdqu     xmm3, [eax + esi + 16]
   4292     lea        eax,  [eax + 32]
   4293     pavgb      xmm0, xmm2
   4294     pavgb      xmm1, xmm3
   4295     psrlw      xmm0, 8      // YUYV -> UVUV
   4296     psrlw      xmm1, 8
   4297     packuswb   xmm0, xmm1
   4298     movdqa     xmm1, xmm0
   4299     pand       xmm0, xmm5  // U
   4300     packuswb   xmm0, xmm0
   4301     psrlw      xmm1, 8     // V
   4302     packuswb   xmm1, xmm1
   4303     movq       qword ptr [edx], xmm0
   4304     movq       qword ptr [edx + edi], xmm1
   4305     lea        edx, [edx + 8]
   4306     sub        ecx, 16
   4307     jg         convertloop
   4308 
   4309     pop        edi
   4310     pop        esi
   4311     ret
   4312   }
   4313 }
   4314 
   4315 __declspec(naked) __declspec(align(16))
   4316 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
   4317                                    uint8* dst_u, uint8* dst_v, int pix) {
   4318   __asm {
   4319     push       edi
   4320     mov        eax, [esp + 4 + 4]    // src_yuy2
   4321     mov        edx, [esp + 4 + 8]    // dst_u
   4322     mov        edi, [esp + 4 + 12]   // dst_v
   4323     mov        ecx, [esp + 4 + 16]   // pix
   4324     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4325     psrlw      xmm5, 8
   4326     sub        edi, edx
   4327 
   4328     align      4
   4329   convertloop:
   4330     movdqu     xmm0, [eax]
   4331     movdqu     xmm1, [eax + 16]
   4332     lea        eax,  [eax + 32]
   4333     psrlw      xmm0, 8      // YUYV -> UVUV
   4334     psrlw      xmm1, 8
   4335     packuswb   xmm0, xmm1
   4336     movdqa     xmm1, xmm0
   4337     pand       xmm0, xmm5  // U
   4338     packuswb   xmm0, xmm0
   4339     psrlw      xmm1, 8     // V
   4340     packuswb   xmm1, xmm1
   4341     movq       qword ptr [edx], xmm0
   4342     movq       qword ptr [edx + edi], xmm1
   4343     lea        edx, [edx + 8]
   4344     sub        ecx, 16
   4345     jg         convertloop
   4346 
   4347     pop        edi
   4348     ret
   4349   }
   4350 }
   4351 
   4352 __declspec(naked) __declspec(align(16))
   4353 void UYVYToYRow_SSE2(const uint8* src_uyvy,
   4354                      uint8* dst_y, int pix) {
   4355   __asm {
   4356     mov        eax, [esp + 4]    // src_uyvy
   4357     mov        edx, [esp + 8]    // dst_y
   4358     mov        ecx, [esp + 12]   // pix
   4359 
   4360     align      4
   4361   convertloop:
   4362     movdqa     xmm0, [eax]
   4363     movdqa     xmm1, [eax + 16]
   4364     lea        eax,  [eax + 32]
   4365     psrlw      xmm0, 8    // odd bytes are Y
   4366     psrlw      xmm1, 8
   4367     packuswb   xmm0, xmm1
   4368     sub        ecx, 16
   4369     movdqa     [edx], xmm0
   4370     lea        edx, [edx + 16]
   4371     jg         convertloop
   4372     ret
   4373   }
   4374 }
   4375 
   4376 __declspec(naked) __declspec(align(16))
   4377 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   4378                       uint8* dst_u, uint8* dst_v, int pix) {
   4379   __asm {
   4380     push       esi
   4381     push       edi
    4382     mov        eax, [esp + 8 + 4]    // src_uyvy
    4383     mov        esi, [esp + 8 + 8]    // stride_uyvy
   4384     mov        edx, [esp + 8 + 12]   // dst_u
   4385     mov        edi, [esp + 8 + 16]   // dst_v
   4386     mov        ecx, [esp + 8 + 20]   // pix
   4387     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4388     psrlw      xmm5, 8
   4389     sub        edi, edx
   4390 
   4391     align      4
   4392   convertloop:
   4393     movdqa     xmm0, [eax]
   4394     movdqa     xmm1, [eax + 16]
   4395     movdqa     xmm2, [eax + esi]
   4396     movdqa     xmm3, [eax + esi + 16]
   4397     lea        eax,  [eax + 32]
   4398     pavgb      xmm0, xmm2
   4399     pavgb      xmm1, xmm3
   4400     pand       xmm0, xmm5   // UYVY -> UVUV
   4401     pand       xmm1, xmm5
   4402     packuswb   xmm0, xmm1
   4403     movdqa     xmm1, xmm0
   4404     pand       xmm0, xmm5  // U
   4405     packuswb   xmm0, xmm0
   4406     psrlw      xmm1, 8     // V
   4407     packuswb   xmm1, xmm1
   4408     movq       qword ptr [edx], xmm0
   4409     movq       qword ptr [edx + edi], xmm1
   4410     lea        edx, [edx + 8]
   4411     sub        ecx, 16
   4412     jg         convertloop
   4413 
   4414     pop        edi
   4415     pop        esi
   4416     ret
   4417   }
   4418 }
   4419 
   4420 __declspec(naked) __declspec(align(16))
   4421 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
   4422                          uint8* dst_u, uint8* dst_v, int pix) {
   4423   __asm {
   4424     push       edi
    4425     mov        eax, [esp + 4 + 4]    // src_uyvy
   4426     mov        edx, [esp + 4 + 8]    // dst_u
   4427     mov        edi, [esp + 4 + 12]   // dst_v
   4428     mov        ecx, [esp + 4 + 16]   // pix
   4429     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4430     psrlw      xmm5, 8
   4431     sub        edi, edx
   4432 
   4433     align      4
   4434   convertloop:
   4435     movdqa     xmm0, [eax]
   4436     movdqa     xmm1, [eax + 16]
   4437     lea        eax,  [eax + 32]
   4438     pand       xmm0, xmm5   // UYVY -> UVUV
   4439     pand       xmm1, xmm5
   4440     packuswb   xmm0, xmm1
   4441     movdqa     xmm1, xmm0
   4442     pand       xmm0, xmm5  // U
   4443     packuswb   xmm0, xmm0
   4444     psrlw      xmm1, 8     // V
   4445     packuswb   xmm1, xmm1
   4446     movq       qword ptr [edx], xmm0
   4447     movq       qword ptr [edx + edi], xmm1
   4448     lea        edx, [edx + 8]
   4449     sub        ecx, 16
   4450     jg         convertloop
   4451 
   4452     pop        edi
   4453     ret
   4454   }
   4455 }
   4456 
   4457 __declspec(naked) __declspec(align(16))
   4458 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
   4459                                uint8* dst_y, int pix) {
   4460   __asm {
   4461     mov        eax, [esp + 4]    // src_uyvy
   4462     mov        edx, [esp + 8]    // dst_y
   4463     mov        ecx, [esp + 12]   // pix
   4464 
   4465     align      4
   4466   convertloop:
   4467     movdqu     xmm0, [eax]
   4468     movdqu     xmm1, [eax + 16]
   4469     lea        eax,  [eax + 32]
   4470     psrlw      xmm0, 8    // odd bytes are Y
   4471     psrlw      xmm1, 8
   4472     packuswb   xmm0, xmm1
   4473     sub        ecx, 16
   4474     movdqu     [edx], xmm0
   4475     lea        edx, [edx + 16]
   4476     jg         convertloop
   4477     ret
   4478   }
   4479 }
   4480 
   4481 __declspec(naked) __declspec(align(16))
   4482 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
   4483                                 uint8* dst_u, uint8* dst_v, int pix) {
   4484   __asm {
   4485     push       esi
   4486     push       edi
    4487     mov        eax, [esp + 8 + 4]    // src_uyvy
    4488     mov        esi, [esp + 8 + 8]    // stride_uyvy
   4489     mov        edx, [esp + 8 + 12]   // dst_u
   4490     mov        edi, [esp + 8 + 16]   // dst_v
   4491     mov        ecx, [esp + 8 + 20]   // pix
   4492     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4493     psrlw      xmm5, 8
   4494     sub        edi, edx
   4495 
   4496     align      4
   4497   convertloop:
   4498     movdqu     xmm0, [eax]
   4499     movdqu     xmm1, [eax + 16]
   4500     movdqu     xmm2, [eax + esi]
   4501     movdqu     xmm3, [eax + esi + 16]
   4502     lea        eax,  [eax + 32]
   4503     pavgb      xmm0, xmm2
   4504     pavgb      xmm1, xmm3
   4505     pand       xmm0, xmm5   // UYVY -> UVUV
   4506     pand       xmm1, xmm5
   4507     packuswb   xmm0, xmm1
   4508     movdqa     xmm1, xmm0
   4509     pand       xmm0, xmm5  // U
   4510     packuswb   xmm0, xmm0
   4511     psrlw      xmm1, 8     // V
   4512     packuswb   xmm1, xmm1
   4513     movq       qword ptr [edx], xmm0
   4514     movq       qword ptr [edx + edi], xmm1
   4515     lea        edx, [edx + 8]
   4516     sub        ecx, 16
   4517     jg         convertloop
   4518 
   4519     pop        edi
   4520     pop        esi
   4521     ret
   4522   }
   4523 }
   4524 
   4525 __declspec(naked) __declspec(align(16))
   4526 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
   4527                                    uint8* dst_u, uint8* dst_v, int pix) {
   4528   __asm {
   4529     push       edi
    4530     mov        eax, [esp + 4 + 4]    // src_uyvy
   4531     mov        edx, [esp + 4 + 8]    // dst_u
   4532     mov        edi, [esp + 4 + 12]   // dst_v
   4533     mov        ecx, [esp + 4 + 16]   // pix
   4534     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4535     psrlw      xmm5, 8
   4536     sub        edi, edx
   4537 
   4538     align      4
   4539   convertloop:
   4540     movdqu     xmm0, [eax]
   4541     movdqu     xmm1, [eax + 16]
   4542     lea        eax,  [eax + 32]
   4543     pand       xmm0, xmm5   // UYVY -> UVUV
   4544     pand       xmm1, xmm5
   4545     packuswb   xmm0, xmm1
   4546     movdqa     xmm1, xmm0
   4547     pand       xmm0, xmm5  // U
   4548     packuswb   xmm0, xmm0
   4549     psrlw      xmm1, 8     // V
   4550     packuswb   xmm1, xmm1
   4551     movq       qword ptr [edx], xmm0
   4552     movq       qword ptr [edx + edi], xmm1
   4553     lea        edx, [edx + 8]
   4554     sub        ecx, 16
   4555     jg         convertloop
   4556 
   4557     pop        edi
   4558     ret
   4559   }
   4560 }
   4561 #endif  // HAS_YUY2TOYROW_SSE2
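
// Illustrative scalar sketches of the UYVY rows above (hypothetical helpers, not
// libyuv's own C path).  UYVY stores bytes as U0 Y0 V0 Y1, so Y lives in the odd
// bytes (selected by psrlw 8) and chroma in the even bytes (selected by the
// pand with mask 0x00ff00ff).
static void UYVYToYRow_Sketch(const uint8* src_uyvy, uint8* dst_y, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_y[x] = src_uyvy[x * 2 + 1];  // odd bytes are Y.
  }
}

static void UYVYToUV422Row_Sketch(const uint8* src_uyvy,
                                  uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {
    dst_u[x / 2] = src_uyvy[x * 2 + 0];  // even bytes alternate U0, V0, U1, V1...
    dst_v[x / 2] = src_uyvy[x * 2 + 2];
  }
}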
   4562 
   4563 #ifdef HAS_ARGBBLENDROW_SSE2
    4564 // Blend 4 pixels at a time.
   4565 __declspec(naked) __declspec(align(16))
   4566 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   4567                        uint8* dst_argb, int width) {
   4568   __asm {
   4569     push       esi
   4570     mov        eax, [esp + 4 + 4]   // src_argb0
   4571     mov        esi, [esp + 4 + 8]   // src_argb1
   4572     mov        edx, [esp + 4 + 12]  // dst_argb
   4573     mov        ecx, [esp + 4 + 16]  // width
    4574     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
   4575     psrlw      xmm7, 15
   4576     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
   4577     psrlw      xmm6, 8
   4578     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   4579     psllw      xmm5, 8
   4580     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   4581     pslld      xmm4, 24
   4582 
   4583     sub        ecx, 1
   4584     je         convertloop1     // only 1 pixel?
   4585     jl         convertloop1b
   4586 
   4587     // 1 pixel loop until destination pointer is aligned.
   4588   alignloop1:
   4589     test       edx, 15          // aligned?
   4590     je         alignloop1b
   4591     movd       xmm3, [eax]
   4592     lea        eax, [eax + 4]
   4593     movdqa     xmm0, xmm3       // src argb
   4594     pxor       xmm3, xmm4       // ~alpha
   4595     movd       xmm2, [esi]      // _r_b
   4596     psrlw      xmm3, 8          // alpha
   4597     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
   4598     pshuflw    xmm3, xmm3, 0F5h
   4599     pand       xmm2, xmm6       // _r_b
   4600     paddw      xmm3, xmm7       // 256 - alpha
   4601     pmullw     xmm2, xmm3       // _r_b * alpha
   4602     movd       xmm1, [esi]      // _a_g
   4603     lea        esi, [esi + 4]
   4604     psrlw      xmm1, 8          // _a_g
   4605     por        xmm0, xmm4       // set alpha to 255
   4606     pmullw     xmm1, xmm3       // _a_g * alpha
   4607     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4608     paddusb    xmm0, xmm2       // + src argb
   4609     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4610     paddusb    xmm0, xmm1       // + src argb
   4611     sub        ecx, 1
   4612     movd       [edx], xmm0
   4613     lea        edx, [edx + 4]
   4614     jge        alignloop1
   4615 
   4616   alignloop1b:
   4617     add        ecx, 1 - 4
   4618     jl         convertloop4b
   4619 
   4620     // 4 pixel loop.
   4621   convertloop4:
   4622     movdqu     xmm3, [eax]      // src argb
   4623     lea        eax, [eax + 16]
   4624     movdqa     xmm0, xmm3       // src argb
   4625     pxor       xmm3, xmm4       // ~alpha
   4626     movdqu     xmm2, [esi]      // _r_b
   4627     psrlw      xmm3, 8          // alpha
   4628     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
   4629     pshuflw    xmm3, xmm3, 0F5h
   4630     pand       xmm2, xmm6       // _r_b
   4631     paddw      xmm3, xmm7       // 256 - alpha
   4632     pmullw     xmm2, xmm3       // _r_b * alpha
   4633     movdqu     xmm1, [esi]      // _a_g
   4634     lea        esi, [esi + 16]
   4635     psrlw      xmm1, 8          // _a_g
   4636     por        xmm0, xmm4       // set alpha to 255
   4637     pmullw     xmm1, xmm3       // _a_g * alpha
   4638     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4639     paddusb    xmm0, xmm2       // + src argb
   4640     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4641     paddusb    xmm0, xmm1       // + src argb
   4642     sub        ecx, 4
   4643     movdqa     [edx], xmm0
   4644     lea        edx, [edx + 16]
   4645     jge        convertloop4
   4646 
   4647   convertloop4b:
   4648     add        ecx, 4 - 1
   4649     jl         convertloop1b
   4650 
   4651     // 1 pixel loop.
   4652   convertloop1:
   4653     movd       xmm3, [eax]      // src argb
   4654     lea        eax, [eax + 4]
   4655     movdqa     xmm0, xmm3       // src argb
   4656     pxor       xmm3, xmm4       // ~alpha
   4657     movd       xmm2, [esi]      // _r_b
   4658     psrlw      xmm3, 8          // alpha
   4659     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
   4660     pshuflw    xmm3, xmm3, 0F5h
   4661     pand       xmm2, xmm6       // _r_b
   4662     paddw      xmm3, xmm7       // 256 - alpha
   4663     pmullw     xmm2, xmm3       // _r_b * alpha
   4664     movd       xmm1, [esi]      // _a_g
   4665     lea        esi, [esi + 4]
   4666     psrlw      xmm1, 8          // _a_g
   4667     por        xmm0, xmm4       // set alpha to 255
   4668     pmullw     xmm1, xmm3       // _a_g * alpha
   4669     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4670     paddusb    xmm0, xmm2       // + src argb
   4671     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4672     paddusb    xmm0, xmm1       // + src argb
   4673     sub        ecx, 1
   4674     movd       [edx], xmm0
   4675     lea        edx, [edx + 4]
   4676     jge        convertloop1
   4677 
   4678   convertloop1b:
   4679     pop        esi
   4680     ret
   4681   }
   4682 }
   4683 #endif  // HAS_ARGBBLENDROW_SSE2
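
// Illustrative per-pixel form of the blend above (hypothetical helper; source is
// treated as premultiplied, and the destination alpha is forced to 255 by the
// por with 0xff000000 plus the saturating paddusb):
static void ARGBBlendPixel_Sketch(const uint8* src, const uint8* dst_in,
                                  uint8* dst_out) {
  const uint32 ia = 256 - src[3];  // pxor, psrlw 8 and paddw 1 give 256 - alpha.
  int c;
  for (c = 0; c < 3; ++c) {
    const uint32 v = src[c] + ((dst_in[c] * ia) >> 8);
    dst_out[c] = (uint8)(v > 255u ? 255u : v);
  }
  dst_out[3] = 255u;
}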
   4684 
   4685 #ifdef HAS_ARGBBLENDROW_SSSE3
   4686 // Shuffle table for isolating alpha.
   4687 static const uvec8 kShuffleAlpha = {
   4688   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
   4689   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
   4690 };
   4691 // Same as SSE2, but replaces:
   4692 //    psrlw      xmm3, 8          // alpha
   4693 //    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
   4694 //    pshuflw    xmm3, xmm3, 0F5h
    4695 // with:
   4696 //    pshufb     xmm3, kShuffleAlpha // alpha
    4697 // Blend 4 pixels at a time.
   4698 
   4699 __declspec(naked) __declspec(align(16))
   4700 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
   4701                         uint8* dst_argb, int width) {
   4702   __asm {
   4703     push       esi
   4704     mov        eax, [esp + 4 + 4]   // src_argb0
   4705     mov        esi, [esp + 4 + 8]   // src_argb1
   4706     mov        edx, [esp + 4 + 12]  // dst_argb
   4707     mov        ecx, [esp + 4 + 16]  // width
   4708     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
   4709     psrlw      xmm7, 15
   4710     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
   4711     psrlw      xmm6, 8
   4712     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   4713     psllw      xmm5, 8
   4714     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   4715     pslld      xmm4, 24
   4716 
   4717     sub        ecx, 1
   4718     je         convertloop1     // only 1 pixel?
   4719     jl         convertloop1b
   4720 
   4721     // 1 pixel loop until destination pointer is aligned.
   4722   alignloop1:
   4723     test       edx, 15          // aligned?
   4724     je         alignloop1b
   4725     movd       xmm3, [eax]
   4726     lea        eax, [eax + 4]
   4727     movdqa     xmm0, xmm3       // src argb
   4728     pxor       xmm3, xmm4       // ~alpha
   4729     movd       xmm2, [esi]      // _r_b
   4730     pshufb     xmm3, kShuffleAlpha // alpha
   4731     pand       xmm2, xmm6       // _r_b
   4732     paddw      xmm3, xmm7       // 256 - alpha
   4733     pmullw     xmm2, xmm3       // _r_b * alpha
   4734     movd       xmm1, [esi]      // _a_g
   4735     lea        esi, [esi + 4]
   4736     psrlw      xmm1, 8          // _a_g
   4737     por        xmm0, xmm4       // set alpha to 255
   4738     pmullw     xmm1, xmm3       // _a_g * alpha
   4739     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4740     paddusb    xmm0, xmm2       // + src argb
   4741     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4742     paddusb    xmm0, xmm1       // + src argb
   4743     sub        ecx, 1
   4744     movd       [edx], xmm0
   4745     lea        edx, [edx + 4]
   4746     jge        alignloop1
   4747 
   4748   alignloop1b:
   4749     add        ecx, 1 - 4
   4750     jl         convertloop4b
   4751 
   4752     test       eax, 15          // unaligned?
   4753     jne        convertuloop4
   4754     test       esi, 15          // unaligned?
   4755     jne        convertuloop4
   4756 
   4757     // 4 pixel loop.
   4758   convertloop4:
   4759     movdqa     xmm3, [eax]      // src argb
   4760     lea        eax, [eax + 16]
   4761     movdqa     xmm0, xmm3       // src argb
   4762     pxor       xmm3, xmm4       // ~alpha
   4763     movdqa     xmm2, [esi]      // _r_b
   4764     pshufb     xmm3, kShuffleAlpha // alpha
   4765     pand       xmm2, xmm6       // _r_b
   4766     paddw      xmm3, xmm7       // 256 - alpha
   4767     pmullw     xmm2, xmm3       // _r_b * alpha
   4768     movdqa     xmm1, [esi]      // _a_g
   4769     lea        esi, [esi + 16]
   4770     psrlw      xmm1, 8          // _a_g
   4771     por        xmm0, xmm4       // set alpha to 255
   4772     pmullw     xmm1, xmm3       // _a_g * alpha
   4773     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4774     paddusb    xmm0, xmm2       // + src argb
   4775     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4776     paddusb    xmm0, xmm1       // + src argb
   4777     sub        ecx, 4
   4778     movdqa     [edx], xmm0
   4779     lea        edx, [edx + 16]
   4780     jge        convertloop4
   4781     jmp        convertloop4b
   4782 
   4783     // 4 pixel unaligned loop.
   4784   convertuloop4:
   4785     movdqu     xmm3, [eax]      // src argb
   4786     lea        eax, [eax + 16]
   4787     movdqa     xmm0, xmm3       // src argb
   4788     pxor       xmm3, xmm4       // ~alpha
   4789     movdqu     xmm2, [esi]      // _r_b
   4790     pshufb     xmm3, kShuffleAlpha // alpha
   4791     pand       xmm2, xmm6       // _r_b
   4792     paddw      xmm3, xmm7       // 256 - alpha
   4793     pmullw     xmm2, xmm3       // _r_b * alpha
   4794     movdqu     xmm1, [esi]      // _a_g
   4795     lea        esi, [esi + 16]
   4796     psrlw      xmm1, 8          // _a_g
   4797     por        xmm0, xmm4       // set alpha to 255
   4798     pmullw     xmm1, xmm3       // _a_g * alpha
   4799     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4800     paddusb    xmm0, xmm2       // + src argb
   4801     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4802     paddusb    xmm0, xmm1       // + src argb
   4803     sub        ecx, 4
   4804     movdqa     [edx], xmm0
   4805     lea        edx, [edx + 16]
   4806     jge        convertuloop4
   4807 
   4808   convertloop4b:
   4809     add        ecx, 4 - 1
   4810     jl         convertloop1b
   4811 
   4812     // 1 pixel loop.
   4813   convertloop1:
   4814     movd       xmm3, [eax]      // src argb
   4815     lea        eax, [eax + 4]
   4816     movdqa     xmm0, xmm3       // src argb
   4817     pxor       xmm3, xmm4       // ~alpha
   4818     movd       xmm2, [esi]      // _r_b
   4819     pshufb     xmm3, kShuffleAlpha // alpha
   4820     pand       xmm2, xmm6       // _r_b
   4821     paddw      xmm3, xmm7       // 256 - alpha
   4822     pmullw     xmm2, xmm3       // _r_b * alpha
   4823     movd       xmm1, [esi]      // _a_g
   4824     lea        esi, [esi + 4]
   4825     psrlw      xmm1, 8          // _a_g
   4826     por        xmm0, xmm4       // set alpha to 255
   4827     pmullw     xmm1, xmm3       // _a_g * alpha
   4828     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4829     paddusb    xmm0, xmm2       // + src argb
   4830     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4831     paddusb    xmm0, xmm1       // + src argb
   4832     sub        ecx, 1
   4833     movd       [edx], xmm0
   4834     lea        edx, [edx + 4]
   4835     jge        convertloop1
   4836 
   4837   convertloop1b:
   4838     pop        esi
   4839     ret
   4840   }
   4841 }
   4842 #endif  // HAS_ARGBBLENDROW_SSSE3
   4843 
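
// The pshufb substitution described above can be sketched with intrinsics as
// below (illustrative only; assumes the SSSE3 intrinsics from <tmmintrin.h>,
// which this file only includes for _M_X64 builds).
#if defined(_M_X64)
static __m128i BroadcastInvAlphaWords_Sketch(__m128i argb4) {
  const __m128i kAlphaMask = _mm_set1_epi32((int)0xff000000);
  const __m128i kShuf = _mm_setr_epi8(3, -128, 3, -128, 7, -128, 7, -128,
                                      11, -128, 11, -128, 15, -128, 15, -128);
  __m128i a = _mm_xor_si128(argb4, kAlphaMask);  // alpha byte becomes 255 - a.
  a = _mm_shuffle_epi8(a, kShuf);                // 2 words of (255 - a) per pixel.
  return _mm_add_epi16(a, _mm_set1_epi16(1));    // 256 - a, ready for pmullw.
}
#endif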
   4844 #ifdef HAS_ARGBATTENUATEROW_SSE2
   4845 // Attenuate 4 pixels at a time.
   4846 // Aligned to 16 bytes.
   4847 __declspec(naked) __declspec(align(16))
   4848 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   4849   __asm {
   4850     mov        eax, [esp + 4]   // src_argb0
   4851     mov        edx, [esp + 8]   // dst_argb
   4852     mov        ecx, [esp + 12]  // width
   4853     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   4854     pslld      xmm4, 24
   4855     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
   4856     psrld      xmm5, 8
   4857 
   4858     align      4
   4859  convertloop:
   4860     movdqa     xmm0, [eax]      // read 4 pixels
   4861     punpcklbw  xmm0, xmm0       // first 2
   4862     pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
   4863     pshuflw    xmm2, xmm2, 0FFh
   4864     pmulhuw    xmm0, xmm2       // rgb * a
   4865     movdqa     xmm1, [eax]      // read 4 pixels
   4866     punpckhbw  xmm1, xmm1       // next 2 pixels
   4867     pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
   4868     pshuflw    xmm2, xmm2, 0FFh
   4869     pmulhuw    xmm1, xmm2       // rgb * a
   4870     movdqa     xmm2, [eax]      // alphas
   4871     lea        eax, [eax + 16]
   4872     psrlw      xmm0, 8
   4873     pand       xmm2, xmm4
   4874     psrlw      xmm1, 8
   4875     packuswb   xmm0, xmm1
   4876     pand       xmm0, xmm5       // keep original alphas
   4877     por        xmm0, xmm2
   4878     sub        ecx, 4
   4879     movdqa     [edx], xmm0
   4880     lea        edx, [edx + 16]
   4881     jg         convertloop
   4882 
   4883     ret
   4884   }
   4885 }
   4886 #endif  // HAS_ARGBATTENUATEROW_SSE2
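
// Illustrative scalar form of the attenuation above (hypothetical helper):
// punpcklbw with itself turns a byte x into the word x * 257, and pmulhuw
// followed by psrlw 8 keeps bits 24..31 of the product, so each channel becomes
// (x * 257 * a * 257) >> 24, a close approximation of x * a / 255.
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    int c;
    for (c = 0; c < 3; ++c) {
      dst_argb[c] = (uint8)((src_argb[c] * 257u * (a * 257u)) >> 24);
    }
    dst_argb[3] = (uint8)a;  // original alpha is kept.
    src_argb += 4;
    dst_argb += 4;
  }
}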
   4887 
   4888 #ifdef HAS_ARGBATTENUATEROW_SSSE3
   4889 // Shuffle table duplicating alpha.
   4890 static const uvec8 kShuffleAlpha0 = {
   4891   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
   4892 };
   4893 static const uvec8 kShuffleAlpha1 = {
   4894   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   4895   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
   4896 };
   4897 __declspec(naked) __declspec(align(16))
   4898 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   4899   __asm {
   4900     mov        eax, [esp + 4]   // src_argb0
   4901     mov        edx, [esp + 8]   // dst_argb
   4902     mov        ecx, [esp + 12]  // width
   4903     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
   4904     pslld      xmm3, 24
   4905     movdqa     xmm4, kShuffleAlpha0
   4906     movdqa     xmm5, kShuffleAlpha1
   4907 
   4908     align      4
   4909  convertloop:
   4910     movdqu     xmm0, [eax]      // read 4 pixels
   4911     pshufb     xmm0, xmm4       // isolate first 2 alphas
   4912     movdqu     xmm1, [eax]      // read 4 pixels
   4913     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
   4914     pmulhuw    xmm0, xmm1       // rgb * a
   4915     movdqu     xmm1, [eax]      // read 4 pixels
   4916     pshufb     xmm1, xmm5       // isolate next 2 alphas
   4917     movdqu     xmm2, [eax]      // read 4 pixels
   4918     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
   4919     pmulhuw    xmm1, xmm2       // rgb * a
   4920     movdqu     xmm2, [eax]      // mask original alpha
   4921     lea        eax, [eax + 16]
   4922     pand       xmm2, xmm3
   4923     psrlw      xmm0, 8
   4924     psrlw      xmm1, 8
   4925     packuswb   xmm0, xmm1
   4926     por        xmm0, xmm2       // copy original alpha
   4927     sub        ecx, 4
   4928     movdqu     [edx], xmm0
   4929     lea        edx, [edx + 16]
   4930     jg         convertloop
   4931 
   4932     ret
   4933   }
   4934 }
   4935 #endif  // HAS_ARGBATTENUATEROW_SSSE3
   4936 
   4937 #ifdef HAS_ARGBATTENUATEROW_AVX2
   4938 // Shuffle table duplicating alpha.
   4939 static const ulvec8 kShuffleAlpha_AVX2 = {
   4940   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
   4941   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
   4942   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
   4943   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
   4944 };
   4945 __declspec(naked) __declspec(align(16))
   4946 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
   4947   __asm {
   4948     mov        eax, [esp + 4]   // src_argb0
   4949     mov        edx, [esp + 8]   // dst_argb
   4950     mov        ecx, [esp + 12]  // width
   4951     sub        edx, eax
   4952     vmovdqa    ymm4, kShuffleAlpha_AVX2
   4953     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
   4954     vpslld     ymm5, ymm5, 24
   4955 
   4956     align      4
   4957  convertloop:
   4958     vmovdqu    ymm6, [eax]       // read 8 pixels.
   4959     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   4960     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   4961     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
   4962     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
   4963     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
   4964     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
   4965     vpand      ymm6, ymm6, ymm5  // isolate alpha
   4966     vpsrlw     ymm0, ymm0, 8
   4967     vpsrlw     ymm1, ymm1, 8
   4968     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   4969     vpor       ymm0, ymm0, ymm6  // copy original alpha
   4970     sub        ecx, 8
   4971     vmovdqu    [eax + edx], ymm0
   4972     lea        eax, [eax + 32]
   4973     jg         convertloop
   4974 
   4975     vzeroupper
   4976     ret
   4977   }
   4978 }
   4979 #endif  // HAS_ARGBATTENUATEROW_AVX2
   4980 
   4981 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
   4982 // Unattenuate 4 pixels at a time.
   4983 // Aligned to 16 bytes.
   4984 __declspec(naked) __declspec(align(16))
   4985 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   4986                              int width) {
   4987   __asm {
   4988     push       esi
   4989     push       edi
   4990     mov        eax, [esp + 8 + 4]   // src_argb0
   4991     mov        edx, [esp + 8 + 8]   // dst_argb
   4992     mov        ecx, [esp + 8 + 12]  // width
   4993 
   4994     align      4
   4995  convertloop:
   4996     movdqu     xmm0, [eax]      // read 4 pixels
   4997     movzx      esi, byte ptr [eax + 3]  // first alpha
   4998     movzx      edi, byte ptr [eax + 7]  // second alpha
   4999     punpcklbw  xmm0, xmm0       // first 2
   5000     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
   5001     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
   5002     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
   5003     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
   5004     movlhps    xmm2, xmm3
   5005     pmulhuw    xmm0, xmm2       // rgb * a
   5006 
   5007     movdqu     xmm1, [eax]      // read 4 pixels
   5008     movzx      esi, byte ptr [eax + 11]  // third alpha
    5009     movzx      edi, byte ptr [eax + 15]  // fourth alpha
   5010     punpckhbw  xmm1, xmm1       // next 2
   5011     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
   5012     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
   5013     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
   5014     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
   5015     movlhps    xmm2, xmm3
   5016     pmulhuw    xmm1, xmm2       // rgb * a
   5017     lea        eax, [eax + 16]
   5018 
   5019     packuswb   xmm0, xmm1
   5020     sub        ecx, 4
   5021     movdqu     [edx], xmm0
   5022     lea        edx, [edx + 16]
   5023     jg         convertloop
   5024     pop        edi
   5025     pop        esi
   5026     ret
   5027   }
   5028 }
   5029 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
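
// Illustrative scalar intent of the unattenuate above (hypothetical helper):
// fixed_invtbl8 holds a fixed-point reciprocal per alpha value, so each color
// channel is scaled back up by roughly 255 / a, with packuswb supplying the
// clamp to 255.
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    int c;
    for (c = 0; c < 3; ++c) {
      const uint32 v = a ? (src_argb[c] * 255u) / a : src_argb[c];
      dst_argb[c] = (uint8)(v > 255u ? 255u : v);
    }
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}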
   5030 
   5031 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
   5032 // Shuffle table duplicating alpha.
   5033 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
    5034   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
    5035   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
   5036 };
   5037 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
   5038 // USE_GATHER is not on by default, due to being a slow instruction.
   5039 #ifdef USE_GATHER
   5040 __declspec(naked) __declspec(align(16))
   5041 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
   5042                              int width) {
   5043   __asm {
   5044     mov        eax, [esp + 4]   // src_argb0
   5045     mov        edx, [esp + 8]   // dst_argb
   5046     mov        ecx, [esp + 12]  // width
   5047     sub        edx, eax
   5048     vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
   5049 
   5050     align      4
   5051  convertloop:
   5052     vmovdqu    ymm6, [eax]       // read 8 pixels.
   5053     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
   5054     vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
   5055     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   5056     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   5057     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
   5058     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
   5059     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
   5060     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
   5061     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
   5062     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
   5063     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
   5064     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   5065     sub        ecx, 8
   5066     vmovdqu    [eax + edx], ymm0
   5067     lea        eax, [eax + 32]
   5068     jg         convertloop
   5069 
   5070     vzeroupper
   5071     ret
   5072   }
   5073 }
   5074 #else  // USE_GATHER
   5075 __declspec(naked) __declspec(align(16))
   5076 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
   5077                              int width) {
   5078   __asm {
   5079 
   5080     mov        eax, [esp + 4]   // src_argb0
   5081     mov        edx, [esp + 8]   // dst_argb
   5082     mov        ecx, [esp + 12]  // width
   5083     sub        edx, eax
   5084     vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2
   5085 
   5086     push       esi
   5087     push       edi
   5088 
   5089     align      4
   5090  convertloop:
   5091     // replace VPGATHER
   5092     movzx      esi, byte ptr [eax + 3]                 // alpha0
   5093     movzx      edi, byte ptr [eax + 7]                 // alpha1
   5094     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
   5095     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
   5096     movzx      esi, byte ptr [eax + 11]                // alpha2
   5097     movzx      edi, byte ptr [eax + 15]                // alpha3
   5098     vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
   5099     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
   5100     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
   5101     movzx      esi, byte ptr [eax + 19]                // alpha4
   5102     movzx      edi, byte ptr [eax + 23]                // alpha5
   5103     vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
   5104     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
   5105     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
   5106     movzx      esi, byte ptr [eax + 27]                // alpha6
   5107     movzx      edi, byte ptr [eax + 31]                // alpha7
   5108     vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
   5109     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
   5110     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
   5111     vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
   5112     vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
   5113     vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
   5114     vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
   5115     // end of VPGATHER
   5116 
   5117     vmovdqu    ymm6, [eax]       // read 8 pixels.
   5118     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   5119     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   5120     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
   5121     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
   5122     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
   5123     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
   5124     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
   5125     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
   5126     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   5127     sub        ecx, 8
   5128     vmovdqu    [eax + edx], ymm0
   5129     lea        eax, [eax + 32]
   5130     jg         convertloop
   5131 
   5132     pop        edi
   5133     pop        esi
   5134     vzeroupper
   5135     ret
   5136   }
   5137 }
   5138 #endif  // USE_GATHER
    5139 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
   5140 
   5141 #ifdef HAS_ARGBGRAYROW_SSSE3
    5142 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
   5143 __declspec(naked) __declspec(align(16))
   5144 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   5145   __asm {
   5146     mov        eax, [esp + 4]   /* src_argb */
   5147     mov        edx, [esp + 8]   /* dst_argb */
   5148     mov        ecx, [esp + 12]  /* width */
   5149     movdqa     xmm4, kARGBToYJ
   5150     movdqa     xmm5, kAddYJ64
   5151 
   5152     align      4
   5153  convertloop:
   5154     movdqa     xmm0, [eax]  // G
   5155     movdqa     xmm1, [eax + 16]
   5156     pmaddubsw  xmm0, xmm4
   5157     pmaddubsw  xmm1, xmm4
   5158     phaddw     xmm0, xmm1
   5159     paddw      xmm0, xmm5  // Add .5 for rounding.
   5160     psrlw      xmm0, 7
   5161     packuswb   xmm0, xmm0   // 8 G bytes
   5162     movdqa     xmm2, [eax]  // A
   5163     movdqa     xmm3, [eax + 16]
   5164     lea        eax, [eax + 32]
   5165     psrld      xmm2, 24
   5166     psrld      xmm3, 24
   5167     packuswb   xmm2, xmm3
   5168     packuswb   xmm2, xmm2   // 8 A bytes
   5169     movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
   5170     punpcklbw  xmm0, xmm0   // 8 GG words
   5171     punpcklbw  xmm3, xmm2   // 8 GA words
   5172     movdqa     xmm1, xmm0
   5173     punpcklwd  xmm0, xmm3   // GGGA first 4
   5174     punpckhwd  xmm1, xmm3   // GGGA next 4
   5175     sub        ecx, 8
   5176     movdqa     [edx], xmm0
   5177     movdqa     [edx + 16], xmm1
   5178     lea        edx, [edx + 32]
   5179     jg         convertloop
   5180     ret
   5181   }
   5182 }
   5183 #endif  // HAS_ARGBGRAYROW_SSSE3
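
// Illustrative scalar form of the gray conversion above (hypothetical helper;
// the actual weights live in kARGBToYJ and the rounding bias of 64 in kAddYJ64,
// so the coefficients are passed in symbolically rather than restated):
static void ARGBGrayPixel_Sketch(const uint8* src, uint8* dst,
                                 int bcoef, int gcoef, int rcoef) {
  const int y = (src[0] * bcoef + src[1] * gcoef + src[2] * rcoef + 64) >> 7;
  dst[0] = dst[1] = dst[2] = (uint8)y;  // gray value replicated into B, G and R.
  dst[3] = src[3];                      // alpha is preserved.
}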
   5184 
   5185 #ifdef HAS_ARGBSEPIAROW_SSSE3
   5186 //    b = (r * 35 + g * 68 + b * 17) >> 7
   5187 //    g = (r * 45 + g * 88 + b * 22) >> 7
   5188 //    r = (r * 50 + g * 98 + b * 24) >> 7
   5189 // Constant for ARGB color to sepia tone.
   5190 static const vec8 kARGBToSepiaB = {
   5191   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
   5192 };
   5193 
   5194 static const vec8 kARGBToSepiaG = {
   5195   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
   5196 };
   5197 
   5198 static const vec8 kARGBToSepiaR = {
   5199   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
   5200 };
   5201 
   5202 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
   5203 __declspec(naked) __declspec(align(16))
   5204 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   5205   __asm {
   5206     mov        eax, [esp + 4]   /* dst_argb */
   5207     mov        ecx, [esp + 8]   /* width */
   5208     movdqa     xmm2, kARGBToSepiaB
   5209     movdqa     xmm3, kARGBToSepiaG
   5210     movdqa     xmm4, kARGBToSepiaR
   5211 
   5212     align      4
   5213  convertloop:
   5214     movdqa     xmm0, [eax]  // B
   5215     movdqa     xmm6, [eax + 16]
   5216     pmaddubsw  xmm0, xmm2
   5217     pmaddubsw  xmm6, xmm2
   5218     phaddw     xmm0, xmm6
   5219     psrlw      xmm0, 7
   5220     packuswb   xmm0, xmm0   // 8 B values
   5221     movdqa     xmm5, [eax]  // G
   5222     movdqa     xmm1, [eax + 16]
   5223     pmaddubsw  xmm5, xmm3
   5224     pmaddubsw  xmm1, xmm3
   5225     phaddw     xmm5, xmm1
   5226     psrlw      xmm5, 7
   5227     packuswb   xmm5, xmm5   // 8 G values
   5228     punpcklbw  xmm0, xmm5   // 8 BG values
   5229     movdqa     xmm5, [eax]  // R
   5230     movdqa     xmm1, [eax + 16]
   5231     pmaddubsw  xmm5, xmm4
   5232     pmaddubsw  xmm1, xmm4
   5233     phaddw     xmm5, xmm1
   5234     psrlw      xmm5, 7
   5235     packuswb   xmm5, xmm5   // 8 R values
   5236     movdqa     xmm6, [eax]  // A
   5237     movdqa     xmm1, [eax + 16]
   5238     psrld      xmm6, 24
   5239     psrld      xmm1, 24
   5240     packuswb   xmm6, xmm1
   5241     packuswb   xmm6, xmm6   // 8 A values
   5242     punpcklbw  xmm5, xmm6   // 8 RA values
   5243     movdqa     xmm1, xmm0   // Weave BG, RA together
   5244     punpcklwd  xmm0, xmm5   // BGRA first 4
   5245     punpckhwd  xmm1, xmm5   // BGRA next 4
   5246     sub        ecx, 8
   5247     movdqa     [eax], xmm0
   5248     movdqa     [eax + 16], xmm1
   5249     lea        eax, [eax + 32]
   5250     jg         convertloop
   5251     ret
   5252   }
   5253 }
   5254 #endif  // HAS_ARGBSEPIAROW_SSSE3
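
// The sepia matrix above written out per pixel (illustrative scalar sketch,
// hypothetical helper; the clamp corresponds to the packuswb saturation):
static void ARGBSepiaPixel_Sketch(uint8* p) {
  const int b = p[0], g = p[1], r = p[2];
  const int sb = (r * 35 + g * 68 + b * 17) >> 7;
  const int sg = (r * 45 + g * 88 + b * 22) >> 7;
  const int sr = (r * 50 + g * 98 + b * 24) >> 7;
  p[0] = (uint8)(sb > 255 ? 255 : sb);
  p[1] = (uint8)(sg > 255 ? 255 : sg);
  p[2] = (uint8)(sr > 255 ? 255 : sr);  // alpha (p[3]) is left unchanged.
}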
   5255 
   5256 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
    5257 // Transform 8 ARGB pixels (32 bytes) with color matrix.
   5258 // Same as Sepia except matrix is provided.
    5259 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
    5260 // and B into high and low halves, then G/A, punpckl/hbw and then punpckl/hwd.
   5261 __declspec(naked) __declspec(align(16))
   5262 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
   5263                               const int8* matrix_argb, int width) {
   5264   __asm {
   5265     mov        eax, [esp + 4]   /* src_argb */
   5266     mov        edx, [esp + 8]   /* dst_argb */
   5267     mov        ecx, [esp + 12]  /* matrix_argb */
   5268     movdqu     xmm5, [ecx]
   5269     pshufd     xmm2, xmm5, 0x00
   5270     pshufd     xmm3, xmm5, 0x55
   5271     pshufd     xmm4, xmm5, 0xaa
   5272     pshufd     xmm5, xmm5, 0xff
   5273     mov        ecx, [esp + 16]  /* width */
   5274 
   5275     align      4
   5276  convertloop:
   5277     movdqa     xmm0, [eax]  // B
   5278     movdqa     xmm7, [eax + 16]
   5279     pmaddubsw  xmm0, xmm2
   5280     pmaddubsw  xmm7, xmm2
   5281     movdqa     xmm6, [eax]  // G
   5282     movdqa     xmm1, [eax + 16]
   5283     pmaddubsw  xmm6, xmm3
   5284     pmaddubsw  xmm1, xmm3
   5285     phaddsw    xmm0, xmm7   // B
   5286     phaddsw    xmm6, xmm1   // G
   5287     psraw      xmm0, 6      // B
   5288     psraw      xmm6, 6      // G
   5289     packuswb   xmm0, xmm0   // 8 B values
   5290     packuswb   xmm6, xmm6   // 8 G values
   5291     punpcklbw  xmm0, xmm6   // 8 BG values
   5292     movdqa     xmm1, [eax]  // R
   5293     movdqa     xmm7, [eax + 16]
   5294     pmaddubsw  xmm1, xmm4
   5295     pmaddubsw  xmm7, xmm4
   5296     phaddsw    xmm1, xmm7   // R
   5297     movdqa     xmm6, [eax]  // A
   5298     movdqa     xmm7, [eax + 16]
   5299     pmaddubsw  xmm6, xmm5
   5300     pmaddubsw  xmm7, xmm5
   5301     phaddsw    xmm6, xmm7   // A
   5302     psraw      xmm1, 6      // R
   5303     psraw      xmm6, 6      // A
   5304     packuswb   xmm1, xmm1   // 8 R values
   5305     packuswb   xmm6, xmm6   // 8 A values
   5306     punpcklbw  xmm1, xmm6   // 8 RA values
   5307     movdqa     xmm6, xmm0   // Weave BG, RA together
   5308     punpcklwd  xmm0, xmm1   // BGRA first 4
   5309     punpckhwd  xmm6, xmm1   // BGRA next 4
   5310     sub        ecx, 8
   5311     movdqa     [edx], xmm0
   5312     movdqa     [edx + 16], xmm6
   5313     lea        eax, [eax + 32]
   5314     lea        edx, [edx + 32]
   5315     jg         convertloop
   5316     ret
   5317   }
   5318 }
   5319 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
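
// Illustrative scalar shape of the color-matrix transform above (hypothetical
// helper): each output channel is a signed dot product of the BGRA bytes with
// one 4-entry row of matrix_argb, arithmetically shifted by 6 and saturated.
static void ARGBColorMatrixPixel_Sketch(const uint8* src, uint8* dst,
                                        const int8* matrix_argb) {
  int c;
  for (c = 0; c < 4; ++c) {
    const int8* m = matrix_argb + c * 4;
    const int v = (src[0] * m[0] + src[1] * m[1] +
                   src[2] * m[2] + src[3] * m[3]) >> 6;
    dst[c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}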
   5320 
   5321 #ifdef HAS_ARGBQUANTIZEROW_SSE2
   5322 // Quantize 4 ARGB pixels (16 bytes).
   5323 // Aligned to 16 bytes.
   5324 __declspec(naked) __declspec(align(16))
   5325 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
   5326                           int interval_offset, int width) {
   5327   __asm {
   5328     mov        eax, [esp + 4]    /* dst_argb */
   5329     movd       xmm2, [esp + 8]   /* scale */
   5330     movd       xmm3, [esp + 12]  /* interval_size */
   5331     movd       xmm4, [esp + 16]  /* interval_offset */
   5332     mov        ecx, [esp + 20]   /* width */
   5333     pshuflw    xmm2, xmm2, 040h
   5334     pshufd     xmm2, xmm2, 044h
   5335     pshuflw    xmm3, xmm3, 040h
   5336     pshufd     xmm3, xmm3, 044h
   5337     pshuflw    xmm4, xmm4, 040h
   5338     pshufd     xmm4, xmm4, 044h
   5339     pxor       xmm5, xmm5  // constant 0
   5340     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
   5341     pslld      xmm6, 24
   5342 
   5343     align      4
   5344  convertloop:
   5345     movdqa     xmm0, [eax]  // read 4 pixels
   5346     punpcklbw  xmm0, xmm5   // first 2 pixels
   5347     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
   5348     movdqa     xmm1, [eax]  // read 4 pixels
   5349     punpckhbw  xmm1, xmm5   // next 2 pixels
   5350     pmulhuw    xmm1, xmm2
   5351     pmullw     xmm0, xmm3   // * interval_size
   5352     movdqa     xmm7, [eax]  // read 4 pixels
   5353     pmullw     xmm1, xmm3
   5354     pand       xmm7, xmm6   // mask alpha
    5355     paddw      xmm0, xmm4   // + interval_offset (typically interval_size / 2)
   5356     paddw      xmm1, xmm4
   5357     packuswb   xmm0, xmm1
   5358     por        xmm0, xmm7
   5359     sub        ecx, 4
   5360     movdqa     [eax], xmm0
   5361     lea        eax, [eax + 16]
   5362     jg         convertloop
   5363     ret
   5364   }
   5365 }
   5366 #endif  // HAS_ARGBQUANTIZEROW_SSE2
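
// Illustrative scalar form of the quantization above (hypothetical helper;
// alpha is kept by the 0xff000000 mask, and packuswb supplies the clamp):
static void ARGBQuantizePixel_Sketch(uint8* p, int scale, int interval_size,
                                     int interval_offset) {
  int c;
  for (c = 0; c < 3; ++c) {
    const int v = (p[c] * scale >> 16) * interval_size + interval_offset;
    p[c] = (uint8)(v > 255 ? 255 : v);
  }
}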
   5367 
   5368 #ifdef HAS_ARGBSHADEROW_SSE2
   5369 // Shade 4 pixels at a time by specified value.
   5370 // Aligned to 16 bytes.
   5371 __declspec(naked) __declspec(align(16))
   5372 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
   5373                        uint32 value) {
   5374   __asm {
   5375     mov        eax, [esp + 4]   // src_argb
   5376     mov        edx, [esp + 8]   // dst_argb
   5377     mov        ecx, [esp + 12]  // width
   5378     movd       xmm2, [esp + 16]  // value
   5379     punpcklbw  xmm2, xmm2
   5380     punpcklqdq xmm2, xmm2
   5381 
   5382     align      4
   5383  convertloop:
   5384     movdqa     xmm0, [eax]      // read 4 pixels
   5385     lea        eax, [eax + 16]
   5386     movdqa     xmm1, xmm0
   5387     punpcklbw  xmm0, xmm0       // first 2
   5388     punpckhbw  xmm1, xmm1       // next 2
   5389     pmulhuw    xmm0, xmm2       // argb * value
   5390     pmulhuw    xmm1, xmm2       // argb * value
   5391     psrlw      xmm0, 8
   5392     psrlw      xmm1, 8
   5393     packuswb   xmm0, xmm1
   5394     sub        ecx, 4
   5395     movdqa     [edx], xmm0
   5396     lea        edx, [edx + 16]
   5397     jg         convertloop
   5398 
   5399     ret
   5400   }
   5401 }
   5402 #endif  // HAS_ARGBSHADEROW_SSE2
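
// ARGBShadeRow above uses the same duplicate-and-pmulhuw trick as the attenuate
// row, so per channel it computes roughly channel * value_channel / 255
// (illustrative scalar helper, hypothetical name):
static __inline uint8 ShadeChannel_Sketch(uint32 channel, uint32 value_channel) {
  return (uint8)((channel * 257u * (value_channel * 257u)) >> 24);
}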
   5403 
   5404 #ifdef HAS_ARGBMULTIPLYROW_SSE2
   5405 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
   5406 __declspec(naked) __declspec(align(16))
   5407 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   5408                           uint8* dst_argb, int width) {
   5409   __asm {
   5410     push       esi
   5411     mov        eax, [esp + 4 + 4]   // src_argb0
   5412     mov        esi, [esp + 4 + 8]   // src_argb1
   5413     mov        edx, [esp + 4 + 12]  // dst_argb
   5414     mov        ecx, [esp + 4 + 16]  // width
   5415     pxor       xmm5, xmm5  // constant 0
   5416 
   5417     align      4
   5418  convertloop:
   5419     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
   5420     movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
    5421     movdqa     xmm1, xmm0
    5422     movdqa     xmm3, xmm2
   5423     punpcklbw  xmm0, xmm0         // first 2
   5424     punpckhbw  xmm1, xmm1         // next 2
   5425     punpcklbw  xmm2, xmm5         // first 2
   5426     punpckhbw  xmm3, xmm5         // next 2
   5427     pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
   5428     pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
   5429     lea        eax, [eax + 16]
   5430     lea        esi, [esi + 16]
   5431     packuswb   xmm0, xmm1
   5432     sub        ecx, 4
   5433     movdqu     [edx], xmm0
   5434     lea        edx, [edx + 16]
   5435     jg         convertloop
   5436 
   5437     pop        esi
   5438     ret
   5439   }
   5440 }
   5441 #endif  // HAS_ARGBMULTIPLYROW_SSE2
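
// Illustrative scalar form of the per-channel multiply above (hypothetical
// helper): one operand is widened with duplicated bytes (x * 257), the other is
// zero-extended, so pmulhuw yields approximately src0 * src1 / 255.
static __inline uint8 MultiplyChannel_Sketch(uint32 c0, uint32 c1) {
  return (uint8)((c0 * 257u * c1) >> 16);
}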
   5442 
   5443 #ifdef HAS_ARGBADDROW_SSE2
   5444 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
   5445 // TODO(fbarchard): Port this to posix, neon and other math functions.
   5446 __declspec(naked) __declspec(align(16))
   5447 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   5448                      uint8* dst_argb, int width) {
   5449   __asm {
   5450     push       esi
   5451     mov        eax, [esp + 4 + 4]   // src_argb0
   5452     mov        esi, [esp + 4 + 8]   // src_argb1
   5453     mov        edx, [esp + 4 + 12]  // dst_argb
   5454     mov        ecx, [esp + 4 + 16]  // width
   5455 
   5456     sub        ecx, 4
   5457     jl         convertloop49
   5458 
   5459     align      4
   5460  convertloop4:
   5461     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
   5462     lea        eax, [eax + 16]
   5463     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
   5464     lea        esi, [esi + 16]
   5465     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
   5466     sub        ecx, 4
   5467     movdqu     [edx], xmm0
   5468     lea        edx, [edx + 16]
   5469     jge        convertloop4
   5470 
   5471  convertloop49:
   5472     add        ecx, 4 - 1
   5473     jl         convertloop19
   5474 
   5475  convertloop1:
    5476     movd       xmm0, [eax]        // read 1 pixel from src_argb0
   5477     lea        eax, [eax + 4]
    5478     movd       xmm1, [esi]        // read 1 pixel from src_argb1
   5479     lea        esi, [esi + 4]
   5480     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
   5481     sub        ecx, 1
   5482     movd       [edx], xmm0
   5483     lea        edx, [edx + 4]
   5484     jge        convertloop1
   5485 
   5486  convertloop19:
   5487     pop        esi
   5488     ret
   5489   }
   5490 }
   5491 #endif  // HAS_ARGBADDROW_SSE2
   5492 
   5493 #ifdef HAS_ARGBSUBTRACTROW_SSE2
    5494 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
   5495 __declspec(naked) __declspec(align(16))
   5496 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   5497                           uint8* dst_argb, int width) {
   5498   __asm {
   5499     push       esi
   5500     mov        eax, [esp + 4 + 4]   // src_argb0
   5501     mov        esi, [esp + 4 + 8]   // src_argb1
   5502     mov        edx, [esp + 4 + 12]  // dst_argb
   5503     mov        ecx, [esp + 4 + 16]  // width
   5504 
   5505     align      4
   5506  convertloop:
   5507     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
   5508     lea        eax, [eax + 16]
   5509     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
   5510     lea        esi, [esi + 16]
   5511     psubusb    xmm0, xmm1         // src_argb0 - src_argb1
   5512     sub        ecx, 4
   5513     movdqu     [edx], xmm0
   5514     lea        edx, [edx + 16]
   5515     jg         convertloop
   5516 
   5517     pop        esi
   5518     ret
   5519   }
   5520 }
   5521 #endif  // HAS_ARGBSUBTRACTROW_SSE2
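
// The add and subtract rows above are plain per-byte saturating arithmetic;
// in scalar form (illustrative, hypothetical helpers):
static __inline uint8 AddSat8_Sketch(uint32 a, uint32 b) {
  const uint32 s = a + b;
  return (uint8)(s > 255u ? 255u : s);
}
static __inline uint8 SubSat8_Sketch(uint32 a, uint32 b) {
  return (uint8)(a > b ? a - b : 0u);
}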
   5522 
   5523 #ifdef HAS_ARGBMULTIPLYROW_AVX2
   5524 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
   5525 __declspec(naked) __declspec(align(16))
   5526 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
   5527                           uint8* dst_argb, int width) {
   5528   __asm {
   5529     push       esi
   5530     mov        eax, [esp + 4 + 4]   // src_argb0
   5531     mov        esi, [esp + 4 + 8]   // src_argb1
   5532     mov        edx, [esp + 4 + 12]  // dst_argb
   5533     mov        ecx, [esp + 4 + 16]  // width
   5534     vpxor      ymm5, ymm5, ymm5     // constant 0
   5535 
   5536     align      4
   5537  convertloop:
   5538     vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
   5539     lea        eax, [eax + 32]
   5540     vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
   5541     lea        esi, [esi + 32]
   5542     vpunpcklbw ymm0, ymm1, ymm1   // low 4
   5543     vpunpckhbw ymm1, ymm1, ymm1   // high 4
   5544     vpunpcklbw ymm2, ymm3, ymm5   // low 4
   5545     vpunpckhbw ymm3, ymm3, ymm5   // high 4
   5546     vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
   5547     vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
   5548     vpackuswb  ymm0, ymm0, ymm1
   5549     vmovdqu    [edx], ymm0
   5550     lea        edx, [edx + 32]
   5551     sub        ecx, 8
   5552     jg         convertloop
   5553 
   5554     pop        esi
   5555     vzeroupper
   5556     ret
   5557   }
   5558 }
   5559 #endif  // HAS_ARGBMULTIPLYROW_AVX2
   5560 
   5561 #ifdef HAS_ARGBADDROW_AVX2
   5562 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
   5563 __declspec(naked) __declspec(align(16))
   5564 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
   5565                      uint8* dst_argb, int width) {
   5566   __asm {
   5567     push       esi
   5568     mov        eax, [esp + 4 + 4]   // src_argb0
   5569     mov        esi, [esp + 4 + 8]   // src_argb1
   5570     mov        edx, [esp + 4 + 12]  // dst_argb
   5571     mov        ecx, [esp + 4 + 16]  // width
   5572 
   5573     align      4
   5574  convertloop:
   5575     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
   5576     lea        eax, [eax + 32]
   5577     vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
   5578     lea        esi, [esi + 32]
   5579     vmovdqu    [edx], ymm0
   5580     lea        edx, [edx + 32]
   5581     sub        ecx, 8
   5582     jg         convertloop
   5583 
   5584     pop        esi
   5585     vzeroupper
   5586     ret
   5587   }
   5588 }
   5589 #endif  // HAS_ARGBADDROW_AVX2
   5590 
   5591 #ifdef HAS_ARGBSUBTRACTROW_AVX2
    5592 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
   5593 __declspec(naked) __declspec(align(16))
   5594 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
   5595                           uint8* dst_argb, int width) {
   5596   __asm {
   5597     push       esi
   5598     mov        eax, [esp + 4 + 4]   // src_argb0
   5599     mov        esi, [esp + 4 + 8]   // src_argb1
   5600     mov        edx, [esp + 4 + 12]  // dst_argb
   5601     mov        ecx, [esp + 4 + 16]  // width
   5602 
   5603     align      4
   5604  convertloop:
   5605     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
   5606     lea        eax, [eax + 32]
   5607     vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
   5608     lea        esi, [esi + 32]
   5609     vmovdqu    [edx], ymm0
   5610     lea        edx, [edx + 32]
   5611     sub        ecx, 8
   5612     jg         convertloop
   5613 
   5614     pop        esi
   5615     vzeroupper
   5616     ret
   5617   }
   5618 }
   5619 #endif  // HAS_ARGBSUBTRACTROW_AVX2
   5620 
   5621 #ifdef HAS_SOBELXROW_SSE2
   5622 // SobelX as a matrix is
   5623 // -1  0  1
   5624 // -2  0  2
   5625 // -1  0  1
   5626 __declspec(naked) __declspec(align(16))
   5627 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
   5628                     const uint8* src_y2, uint8* dst_sobelx, int width) {
   5629   __asm {
   5630     push       esi
   5631     push       edi
   5632     mov        eax, [esp + 8 + 4]   // src_y0
   5633     mov        esi, [esp + 8 + 8]   // src_y1
   5634     mov        edi, [esp + 8 + 12]  // src_y2
   5635     mov        edx, [esp + 8 + 16]  // dst_sobelx
   5636     mov        ecx, [esp + 8 + 20]  // width
   5637     sub        esi, eax
   5638     sub        edi, eax
   5639     sub        edx, eax
   5640     pxor       xmm5, xmm5  // constant 0
   5641 
   5642     align      4
   5643  convertloop:
   5644     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
   5645     movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
   5646     punpcklbw  xmm0, xmm5
   5647     punpcklbw  xmm1, xmm5
   5648     psubw      xmm0, xmm1
   5649     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
   5650     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
   5651     punpcklbw  xmm1, xmm5
   5652     punpcklbw  xmm2, xmm5
   5653     psubw      xmm1, xmm2
   5654     movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
   5655     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
   5656     punpcklbw  xmm2, xmm5
   5657     punpcklbw  xmm3, xmm5
   5658     psubw      xmm2, xmm3
   5659     paddw      xmm0, xmm2
   5660     paddw      xmm0, xmm1
   5661     paddw      xmm0, xmm1
   5662     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
   5663     psubw      xmm1, xmm0
   5664     pmaxsw     xmm0, xmm1
   5665     packuswb   xmm0, xmm0
   5666     sub        ecx, 8
   5667     movq       qword ptr [eax + edx], xmm0
   5668     lea        eax, [eax + 8]
   5669     jg         convertloop
   5670 
   5671     pop        edi
   5672     pop        esi
   5673     ret
   5674   }
   5675 }
   5676 #endif  // HAS_SOBELXROW_SSE2
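
// Illustrative scalar form of SobelXRow above (hypothetical helper; the asm
// takes the absolute value with psubw/pmaxsw and saturates with packuswb):
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int sobel = (src_y0[i] - src_y0[i + 2]) +
                2 * (src_y1[i] - src_y1[i + 2]) +
                (src_y2[i] - src_y2[i + 2]);
    if (sobel < 0) sobel = -sobel;
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}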
   5677 
   5678 #ifdef HAS_SOBELYROW_SSE2
   5679 // SobelY as a matrix is
   5680 // -1 -2 -1
   5681 //  0  0  0
   5682 //  1  2  1
   5683 __declspec(naked) __declspec(align(16))
   5684 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
   5685                     uint8* dst_sobely, int width) {
   5686   __asm {
   5687     push       esi
   5688     mov        eax, [esp + 4 + 4]   // src_y0
   5689     mov        esi, [esp + 4 + 8]   // src_y1
   5690     mov        edx, [esp + 4 + 12]  // dst_sobely
   5691     mov        ecx, [esp + 4 + 16]  // width
   5692     sub        esi, eax
   5693     sub        edx, eax
   5694     pxor       xmm5, xmm5  // constant 0
   5695 
   5696     align      4
   5697  convertloop:
   5698     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
   5699     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
   5700     punpcklbw  xmm0, xmm5
   5701     punpcklbw  xmm1, xmm5
   5702     psubw      xmm0, xmm1
   5703     movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
   5704     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
   5705     punpcklbw  xmm1, xmm5
   5706     punpcklbw  xmm2, xmm5
   5707     psubw      xmm1, xmm2
   5708     movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
   5709     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
   5710     punpcklbw  xmm2, xmm5
   5711     punpcklbw  xmm3, xmm5
   5712     psubw      xmm2, xmm3
   5713     paddw      xmm0, xmm2
   5714     paddw      xmm0, xmm1
   5715     paddw      xmm0, xmm1
   5716     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
   5717     psubw      xmm1, xmm0
   5718     pmaxsw     xmm0, xmm1
   5719     packuswb   xmm0, xmm0
   5720     sub        ecx, 8
   5721     movq       qword ptr [eax + edx], xmm0
   5722     lea        eax, [eax + 8]
   5723     jg         convertloop
   5724 
   5725     pop        esi
   5726     ret
   5727   }
   5728 }
   5729 #endif  // HAS_SOBELYROW_SSE2
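
// Illustrative scalar form of SobelYRow above (hypothetical helper; same shape
// as SobelXRow but differencing the two rows at offsets 0, 1 and 2):
static void SobelYRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             uint8* dst_sobely, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int sobel = (src_y0[i] - src_y1[i]) +
                2 * (src_y0[i + 1] - src_y1[i + 1]) +
                (src_y0[i + 2] - src_y1[i + 2]);
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}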
   5730 
   5731 #ifdef HAS_SOBELROW_SSE2
   5732 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
   5733 // A = 255
   5734 // R = Sobel
   5735 // G = Sobel
   5736 // B = Sobel
   5737 __declspec(naked) __declspec(align(16))
   5738 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
   5739                    uint8* dst_argb, int width) {
   5740   __asm {
   5741     push       esi
   5742     mov        eax, [esp + 4 + 4]   // src_sobelx
   5743     mov        esi, [esp + 4 + 8]   // src_sobely
   5744     mov        edx, [esp + 4 + 12]  // dst_argb
   5745     mov        ecx, [esp + 4 + 16]  // width
   5746     sub        esi, eax
   5747     pcmpeqb    xmm5, xmm5           // alpha 255
   5748     pslld      xmm5, 24             // 0xff000000
   5749 
   5750     align      4
   5751  convertloop:
   5752     movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
   5753     movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
   5754     lea        eax, [eax + 16]
   5755     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
   5756     movdqa     xmm2, xmm0             // GG
   5757     punpcklbw  xmm2, xmm0             // First 8
   5758     punpckhbw  xmm0, xmm0             // Next 8
   5759     movdqa     xmm1, xmm2             // GGGG
   5760     punpcklwd  xmm1, xmm2             // First 4
   5761     punpckhwd  xmm2, xmm2             // Next 4
   5762     por        xmm1, xmm5             // GGGA
   5763     por        xmm2, xmm5
   5764     movdqa     xmm3, xmm0             // GGGG
   5765     punpcklwd  xmm3, xmm0             // Next 4
   5766     punpckhwd  xmm0, xmm0             // Last 4
   5767     por        xmm3, xmm5             // GGGA
   5768     por        xmm0, xmm5
   5769     sub        ecx, 16
   5770     movdqa     [edx], xmm1
   5771     movdqa     [edx + 16], xmm2
   5772     movdqa     [edx + 32], xmm3
   5773     movdqa     [edx + 48], xmm0
   5774     lea        edx, [edx + 64]
   5775     jg         convertloop
   5776 
   5777     pop        esi
   5778     ret
   5779   }
   5780 }
   5781 #endif  // HAS_SOBELROW_SSE2
   5782 
   5783 #ifdef HAS_SOBELTOPLANEROW_SSE2
   5784 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
   5785 __declspec(naked) __declspec(align(16))
   5786 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
   5787                           uint8* dst_y, int width) {
   5788   __asm {
   5789     push       esi
   5790     mov        eax, [esp + 4 + 4]   // src_sobelx
   5791     mov        esi, [esp + 4 + 8]   // src_sobely
    5792     mov        edx, [esp + 4 + 12]  // dst_y
   5793     mov        ecx, [esp + 4 + 16]  // width
   5794     sub        esi, eax
   5795 
   5796     align      4
   5797  convertloop:
   5798     movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
   5799     movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
   5800     lea        eax, [eax + 16]
   5801     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
   5802     sub        ecx, 16
   5803     movdqa     [edx], xmm0
   5804     lea        edx, [edx + 16]
   5805     jg         convertloop
   5806 
   5807     pop        esi
   5808     ret
   5809   }
   5810 }
   5811 #endif  // HAS_SOBELTOPLANEROW_SSE2
   5812 
   5813 #ifdef HAS_SOBELXYROW_SSE2
   5814 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
   5815 // A = 255
   5816 // R = Sobel X
   5817 // G = Sobel
   5818 // B = Sobel Y
   5819 __declspec(naked) __declspec(align(16))
   5820 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
   5821                      uint8* dst_argb, int width) {
   5822   __asm {
   5823     push       esi
   5824     mov        eax, [esp + 4 + 4]   // src_sobelx
   5825     mov        esi, [esp + 4 + 8]   // src_sobely
   5826     mov        edx, [esp + 4 + 12]  // dst_argb
   5827     mov        ecx, [esp + 4 + 16]  // width
   5828     sub        esi, eax
   5829     pcmpeqb    xmm5, xmm5           // alpha 255
   5830 
   5831     align      4
   5832  convertloop:
   5833     movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
   5834     movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
   5835     lea        eax, [eax + 16]
   5836     movdqa     xmm2, xmm0
   5837     paddusb    xmm2, xmm1             // sobel = sobelx + sobely
   5838     movdqa     xmm3, xmm0             // XA
   5839     punpcklbw  xmm3, xmm5
   5840     punpckhbw  xmm0, xmm5
   5841     movdqa     xmm4, xmm1             // YS
   5842     punpcklbw  xmm4, xmm2
   5843     punpckhbw  xmm1, xmm2
   5844     movdqa     xmm6, xmm4             // YSXA
   5845     punpcklwd  xmm6, xmm3             // First 4
   5846     punpckhwd  xmm4, xmm3             // Next 4
   5847     movdqa     xmm7, xmm1             // YSXA
   5848     punpcklwd  xmm7, xmm0             // Next 4
   5849     punpckhwd  xmm1, xmm0             // Last 4
   5850     sub        ecx, 16
   5851     movdqa     [edx], xmm6
   5852     movdqa     [edx + 16], xmm4
   5853     movdqa     [edx + 32], xmm7
   5854     movdqa     [edx + 48], xmm1
   5855     lea        edx, [edx + 64]
   5856     jg         convertloop
   5857 
   5858     pop        esi
   5859     ret
   5860   }
   5861 }
   5862 #endif  // HAS_SOBELXYROW_SSE2
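
// Scalar sketch of the interleave done above: each output pixel packs Sobel Y
// into B, the saturated sum into G, Sobel X into R and 255 into A. The name
// is hypothetical and the routine is illustrative only.
static void SobelXYRow_ReferenceSketch_C(const uint8* src_sobelx,
                                         const uint8* src_sobely,
                                         uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[i * 4 + 0] = src_sobely[i];               // B = Sobel Y
    dst_argb[i * 4 + 1] = (uint8)(s > 255 ? 255 : s);  // G = Sobel
    dst_argb[i * 4 + 2] = src_sobelx[i];               // R = Sobel X
    dst_argb[i * 4 + 3] = 255;                         // A = 255
  }
}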
   5863 
   5864 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
   5865 // Consider float CumulativeSum.
    5866 // Consider calling CumulativeSum one row at a time as needed.
   5867 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
   5868 // Convert cumulative sum for an area to an average for 1 pixel.
   5869 // topleft is pointer to top left of CumulativeSum buffer for area.
   5870 // botleft is pointer to bottom left of CumulativeSum buffer.
   5871 // width is offset from left to right of area in CumulativeSum buffer measured
   5872 //   in number of ints.
   5873 // area is the number of pixels in the area being averaged.
   5874 // dst points to pixel to store result to.
   5875 // count is number of averaged pixels to produce.
   5876 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
   5877 // aligned.
   5878 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
   5879                                     int width, int area, uint8* dst,
   5880                                     int count) {
   5881   __asm {
   5882     mov        eax, topleft  // eax topleft
   5883     mov        esi, botleft  // esi botleft
   5884     mov        edx, width
   5885     movd       xmm5, area
   5886     mov        edi, dst
   5887     mov        ecx, count
   5888     cvtdq2ps   xmm5, xmm5
   5889     rcpss      xmm4, xmm5  // 1.0f / area
   5890     pshufd     xmm4, xmm4, 0
   5891     sub        ecx, 4
   5892     jl         l4b
   5893 
   5894     cmp        area, 128  // 128 pixels will not overflow 15 bits.
   5895     ja         l4
   5896 
   5897     pshufd     xmm5, xmm5, 0        // area
    5898     pcmpeqb    xmm6, xmm6           // generate constant 65535 (65536.0 - 1 after convert)
   5899     psrld      xmm6, 16
   5900     cvtdq2ps   xmm6, xmm6
   5901     addps      xmm5, xmm6           // (65536.0 + area - 1)
   5902     mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
   5903     cvtps2dq   xmm5, xmm5           // 0.16 fixed point
   5904     packssdw   xmm5, xmm5           // 16 bit shorts
   5905 
   5906     // 4 pixel loop small blocks.
   5907     align      4
   5908   s4:
   5909     // top left
   5910     movdqa     xmm0, [eax]
   5911     movdqa     xmm1, [eax + 16]
   5912     movdqa     xmm2, [eax + 32]
   5913     movdqa     xmm3, [eax + 48]
   5914 
   5915     // - top right
   5916     psubd      xmm0, [eax + edx * 4]
   5917     psubd      xmm1, [eax + edx * 4 + 16]
   5918     psubd      xmm2, [eax + edx * 4 + 32]
   5919     psubd      xmm3, [eax + edx * 4 + 48]
   5920     lea        eax, [eax + 64]
   5921 
   5922     // - bottom left
   5923     psubd      xmm0, [esi]
   5924     psubd      xmm1, [esi + 16]
   5925     psubd      xmm2, [esi + 32]
   5926     psubd      xmm3, [esi + 48]
   5927 
   5928     // + bottom right
   5929     paddd      xmm0, [esi + edx * 4]
   5930     paddd      xmm1, [esi + edx * 4 + 16]
   5931     paddd      xmm2, [esi + edx * 4 + 32]
   5932     paddd      xmm3, [esi + edx * 4 + 48]
   5933     lea        esi, [esi + 64]
   5934 
   5935     packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
   5936     packssdw   xmm2, xmm3
   5937 
   5938     pmulhuw    xmm0, xmm5
   5939     pmulhuw    xmm2, xmm5
   5940 
   5941     packuswb   xmm0, xmm2
   5942     movdqu     [edi], xmm0
   5943     lea        edi, [edi + 16]
   5944     sub        ecx, 4
   5945     jge        s4
   5946 
   5947     jmp        l4b
   5948 
   5949     // 4 pixel loop
   5950     align      4
   5951   l4:
   5952     // top left
   5953     movdqa     xmm0, [eax]
   5954     movdqa     xmm1, [eax + 16]
   5955     movdqa     xmm2, [eax + 32]
   5956     movdqa     xmm3, [eax + 48]
   5957 
   5958     // - top right
   5959     psubd      xmm0, [eax + edx * 4]
   5960     psubd      xmm1, [eax + edx * 4 + 16]
   5961     psubd      xmm2, [eax + edx * 4 + 32]
   5962     psubd      xmm3, [eax + edx * 4 + 48]
   5963     lea        eax, [eax + 64]
   5964 
   5965     // - bottom left
   5966     psubd      xmm0, [esi]
   5967     psubd      xmm1, [esi + 16]
   5968     psubd      xmm2, [esi + 32]
   5969     psubd      xmm3, [esi + 48]
   5970 
   5971     // + bottom right
   5972     paddd      xmm0, [esi + edx * 4]
   5973     paddd      xmm1, [esi + edx * 4 + 16]
   5974     paddd      xmm2, [esi + edx * 4 + 32]
   5975     paddd      xmm3, [esi + edx * 4 + 48]
   5976     lea        esi, [esi + 64]
   5977 
   5978     cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
   5979     cvtdq2ps   xmm1, xmm1
   5980     mulps      xmm0, xmm4
   5981     mulps      xmm1, xmm4
   5982     cvtdq2ps   xmm2, xmm2
   5983     cvtdq2ps   xmm3, xmm3
   5984     mulps      xmm2, xmm4
   5985     mulps      xmm3, xmm4
   5986     cvtps2dq   xmm0, xmm0
   5987     cvtps2dq   xmm1, xmm1
   5988     cvtps2dq   xmm2, xmm2
   5989     cvtps2dq   xmm3, xmm3
   5990     packssdw   xmm0, xmm1
   5991     packssdw   xmm2, xmm3
   5992     packuswb   xmm0, xmm2
   5993     movdqu     [edi], xmm0
   5994     lea        edi, [edi + 16]
   5995     sub        ecx, 4
   5996     jge        l4
   5997 
   5998   l4b:
   5999     add        ecx, 4 - 1
   6000     jl         l1b
   6001 
   6002     // 1 pixel loop
   6003     align      4
   6004   l1:
   6005     movdqa     xmm0, [eax]
   6006     psubd      xmm0, [eax + edx * 4]
   6007     lea        eax, [eax + 16]
   6008     psubd      xmm0, [esi]
   6009     paddd      xmm0, [esi + edx * 4]
   6010     lea        esi, [esi + 16]
   6011     cvtdq2ps   xmm0, xmm0
   6012     mulps      xmm0, xmm4
   6013     cvtps2dq   xmm0, xmm0
   6014     packssdw   xmm0, xmm0
   6015     packuswb   xmm0, xmm0
   6016     movd       dword ptr [edi], xmm0
   6017     lea        edi, [edi + 4]
   6018     sub        ecx, 1
   6019     jge        l1
   6020   l1b:
   6021   }
   6022 }
   6023 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
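
// Scalar sketch of the box average computed above: for each output pixel the
// area sum is recovered from four corner reads of the cumulative (integral)
// sum buffer and scaled by 1/area. The name is hypothetical and the float
// rounding is an assumption; the SIMD code uses an approximate reciprocal and
// a 0.16 fixed-point multiply for small areas.
static void CumulativeSumToAverageRow_ReferenceSketch_C(
    const int32* topleft, const int32* botleft, int width, int area,
    uint8* dst, int count) {
  const float ooa = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A sums are interleaved.
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      int v = (int)(sum * ooa);
      dst[c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}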
   6024 
   6025 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
   6026 // Creates a table of cumulative sums where each value is a sum of all values
   6027 // above and to the left of the value.
   6028 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
   6029                                   const int32* previous_cumsum, int width) {
   6030   __asm {
   6031     mov        eax, row
   6032     mov        edx, cumsum
   6033     mov        esi, previous_cumsum
   6034     mov        ecx, width
   6035     pxor       xmm0, xmm0
   6036     pxor       xmm1, xmm1
   6037 
   6038     sub        ecx, 4
   6039     jl         l4b
   6040     test       edx, 15
   6041     jne        l4b
   6042 
   6043     // 4 pixel loop
   6044     align      4
   6045   l4:
   6046     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
   6047     lea        eax, [eax + 16]
   6048     movdqa     xmm4, xmm2
   6049 
   6050     punpcklbw  xmm2, xmm1
   6051     movdqa     xmm3, xmm2
   6052     punpcklwd  xmm2, xmm1
   6053     punpckhwd  xmm3, xmm1
   6054 
   6055     punpckhbw  xmm4, xmm1
   6056     movdqa     xmm5, xmm4
   6057     punpcklwd  xmm4, xmm1
   6058     punpckhwd  xmm5, xmm1
   6059 
   6060     paddd      xmm0, xmm2
   6061     movdqa     xmm2, [esi]  // previous row above.
   6062     paddd      xmm2, xmm0
   6063 
   6064     paddd      xmm0, xmm3
   6065     movdqa     xmm3, [esi + 16]
   6066     paddd      xmm3, xmm0
   6067 
   6068     paddd      xmm0, xmm4
   6069     movdqa     xmm4, [esi + 32]
   6070     paddd      xmm4, xmm0
   6071 
   6072     paddd      xmm0, xmm5
   6073     movdqa     xmm5, [esi + 48]
   6074     lea        esi, [esi + 64]
   6075     paddd      xmm5, xmm0
   6076 
   6077     movdqa     [edx], xmm2
   6078     movdqa     [edx + 16], xmm3
   6079     movdqa     [edx + 32], xmm4
   6080     movdqa     [edx + 48], xmm5
   6081 
   6082     lea        edx, [edx + 64]
   6083     sub        ecx, 4
   6084     jge        l4
   6085 
   6086   l4b:
   6087     add        ecx, 4 - 1
   6088     jl         l1b
   6089 
   6090     // 1 pixel loop
   6091     align      4
   6092   l1:
   6093     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
   6094     lea        eax, [eax + 4]
   6095     punpcklbw  xmm2, xmm1
   6096     punpcklwd  xmm2, xmm1
   6097     paddd      xmm0, xmm2
   6098     movdqu     xmm2, [esi]
   6099     lea        esi, [esi + 16]
   6100     paddd      xmm2, xmm0
   6101     movdqu     [edx], xmm2
   6102     lea        edx, [edx + 16]
   6103     sub        ecx, 1
   6104     jge        l1
   6105 
   6106  l1b:
   6107   }
   6108 }
   6109 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
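
// Scalar sketch of the pass above: a running per-channel sum across the row
// is added to the cumulative sum of the row above to extend the integral
// image. Hypothetical name, illustrative only.
static void ComputeCumulativeSumRow_ReferenceSketch_C(
    const uint8* row, int32* cumsum, const int32* previous_cumsum, int width) {
  int32 sum[4] = {0, 0, 0, 0};
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[i * 4 + c];
      cumsum[i * 4 + c] = sum[c] + previous_cumsum[i * 4 + c];
    }
  }
}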
   6110 
   6111 #ifdef HAS_ARGBAFFINEROW_SSE2
   6112 // Copy ARGB pixels from source image with slope to a row of destination.
   6113 __declspec(naked) __declspec(align(16))
   6114 LIBYUV_API
   6115 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
   6116                         uint8* dst_argb, const float* uv_dudv, int width) {
   6117   __asm {
   6118     push       esi
   6119     push       edi
   6120     mov        eax, [esp + 12]  // src_argb
   6121     mov        esi, [esp + 16]  // stride
   6122     mov        edx, [esp + 20]  // dst_argb
   6123     mov        ecx, [esp + 24]  // pointer to uv_dudv
   6124     movq       xmm2, qword ptr [ecx]  // uv
   6125     movq       xmm7, qword ptr [ecx + 8]  // dudv
   6126     mov        ecx, [esp + 28]  // width
    6127     shl        esi, 16          // stride in high word; pixel step 4 goes in low word
   6128     add        esi, 4
   6129     movd       xmm5, esi
   6130     sub        ecx, 4
   6131     jl         l4b
   6132 
   6133     // setup for 4 pixel loop
   6134     pshufd     xmm7, xmm7, 0x44  // dup dudv
   6135     pshufd     xmm5, xmm5, 0  // dup 4, stride
   6136     movdqa     xmm0, xmm2    // x0, y0, x1, y1
   6137     addps      xmm0, xmm7
   6138     movlhps    xmm2, xmm0
   6139     movdqa     xmm4, xmm7
   6140     addps      xmm4, xmm4    // dudv *= 2
   6141     movdqa     xmm3, xmm2    // x2, y2, x3, y3
   6142     addps      xmm3, xmm4
   6143     addps      xmm4, xmm4    // dudv *= 4
   6144 
   6145     // 4 pixel loop
   6146     align      4
   6147   l4:
   6148     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
   6149     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
   6150     packssdw   xmm0, xmm1    // x, y as 8 shorts
   6151     pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
   6152     movd       esi, xmm0
   6153     pshufd     xmm0, xmm0, 0x39  // shift right
   6154     movd       edi, xmm0
   6155     pshufd     xmm0, xmm0, 0x39  // shift right
   6156     movd       xmm1, [eax + esi]  // read pixel 0
   6157     movd       xmm6, [eax + edi]  // read pixel 1
   6158     punpckldq  xmm1, xmm6     // combine pixel 0 and 1
   6159     addps      xmm2, xmm4    // x, y += dx, dy first 2
   6160     movq       qword ptr [edx], xmm1
   6161     movd       esi, xmm0
   6162     pshufd     xmm0, xmm0, 0x39  // shift right
   6163     movd       edi, xmm0
   6164     movd       xmm6, [eax + esi]  // read pixel 2
   6165     movd       xmm0, [eax + edi]  // read pixel 3
   6166     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
   6167     addps      xmm3, xmm4    // x, y += dx, dy next 2
   6168     sub        ecx, 4
    6169     movq       qword ptr [edx + 8], xmm6
   6170     lea        edx, [edx + 16]
   6171     jge        l4
   6172 
   6173   l4b:
   6174     add        ecx, 4 - 1
   6175     jl         l1b
   6176 
   6177     // 1 pixel loop
   6178     align      4
   6179   l1:
   6180     cvttps2dq  xmm0, xmm2    // x, y float to int
   6181     packssdw   xmm0, xmm0    // x, y as shorts
   6182     pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
   6183     addps      xmm2, xmm7    // x, y += dx, dy
   6184     movd       esi, xmm0
   6185     movd       xmm0, [eax + esi]  // copy a pixel
   6186     sub        ecx, 1
   6187     movd       [edx], xmm0
   6188     lea        edx, [edx + 4]
   6189     jge        l1
   6190   l1b:
   6191     pop        edi
   6192     pop        esi
   6193     ret
   6194   }
   6195 }
   6196 #endif  // HAS_ARGBAFFINEROW_SSE2
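
// Scalar sketch of the affine copy above: uv_dudv holds {u, v, du, dv}. Each
// destination pixel is fetched from the truncated (u, v) source coordinate,
// which is then stepped by (du, dv). Hypothetical name, illustrative only;
// like the assembly, no bounds checking is done.
static void ARGBAffineRow_ReferenceSketch_C(const uint8* src_argb,
                                            int src_argb_stride,
                                            uint8* dst_argb,
                                            const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // cvttps2dq truncates toward zero.
    int y = (int)v;
    const uint8* s = src_argb + y * src_argb_stride + x * 4;
    uint8* d = dst_argb + i * 4;
    d[0] = s[0];  // copy one ARGB pixel.
    d[1] = s[1];
    d[2] = s[2];
    d[3] = s[3];
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}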
   6197 
   6198 #ifdef HAS_INTERPOLATEROW_AVX2
    6199 // Bilinear filter 32x2 -> 32x1
   6200 __declspec(naked) __declspec(align(16))
   6201 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
   6202                           ptrdiff_t src_stride, int dst_width,
   6203                           int source_y_fraction) {
   6204   __asm {
   6205     push       esi
   6206     push       edi
   6207     mov        edi, [esp + 8 + 4]   // dst_ptr
   6208     mov        esi, [esp + 8 + 8]   // src_ptr
   6209     mov        edx, [esp + 8 + 12]  // src_stride
   6210     mov        ecx, [esp + 8 + 16]  // dst_width
   6211     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   6212     shr        eax, 1
   6213     // Dispatch to specialized filters if applicable.
   6214     cmp        eax, 0
   6215     je         xloop100  // 0 / 128.  Blend 100 / 0.
   6216     sub        edi, esi
   6217     cmp        eax, 32
   6218     je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
   6219     cmp        eax, 64
   6220     je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
   6221     cmp        eax, 96
   6222     je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
   6223 
   6224     vmovd      xmm0, eax  // high fraction 0..127
   6225     neg        eax
   6226     add        eax, 128
   6227     vmovd      xmm5, eax  // low fraction 128..1
   6228     vpunpcklbw xmm5, xmm5, xmm0
   6229     vpunpcklwd xmm5, xmm5, xmm5
   6230     vpxor      ymm0, ymm0, ymm0
   6231     vpermd     ymm5, ymm0, ymm5
   6232 
   6233     align      4
   6234   xloop:
   6235     vmovdqu    ymm0, [esi]
   6236     vmovdqu    ymm2, [esi + edx]
   6237     vpunpckhbw ymm1, ymm0, ymm2  // mutates
   6238     vpunpcklbw ymm0, ymm0, ymm2  // mutates
   6239     vpmaddubsw ymm0, ymm0, ymm5
   6240     vpmaddubsw ymm1, ymm1, ymm5
   6241     vpsrlw     ymm0, ymm0, 7
   6242     vpsrlw     ymm1, ymm1, 7
   6243     vpackuswb  ymm0, ymm0, ymm1  // unmutates
   6244     sub        ecx, 32
   6245     vmovdqu    [esi + edi], ymm0
   6246     lea        esi, [esi + 32]
   6247     jg         xloop
   6248     jmp        xloop99
   6249 
   6250     // Blend 25 / 75.
   6251     align      4
   6252   xloop25:
   6253     vmovdqu    ymm0, [esi]
   6254     vpavgb     ymm0, ymm0, [esi + edx]
   6255     vpavgb     ymm0, ymm0, [esi + edx]
   6256     sub        ecx, 32
   6257     vmovdqu    [esi + edi], ymm0
   6258     lea        esi, [esi + 32]
   6259     jg         xloop25
   6260     jmp        xloop99
   6261 
   6262     // Blend 50 / 50.
   6263     align      4
   6264   xloop50:
   6265     vmovdqu    ymm0, [esi]
   6266     vpavgb     ymm0, ymm0, [esi + edx]
   6267     sub        ecx, 32
   6268     vmovdqu    [esi + edi], ymm0
   6269     lea        esi, [esi + 32]
   6270     jg         xloop50
   6271     jmp        xloop99
   6272 
   6273     // Blend 75 / 25.
   6274     align      4
   6275   xloop75:
   6276     vmovdqu    ymm0, [esi + edx]
   6277     vpavgb     ymm0, ymm0, [esi]
   6278     vpavgb     ymm0, ymm0, [esi]
   6279     sub        ecx, 32
   6280     vmovdqu     [esi + edi], ymm0
   6281     lea        esi, [esi + 32]
   6282     jg         xloop75
   6283     jmp        xloop99
   6284 
   6285     // Blend 100 / 0 - Copy row unchanged.
   6286     align      4
   6287   xloop100:
   6288     rep movsb
   6289 
   6290   xloop99:
   6291     pop        edi
   6292     pop        esi
   6293     vzeroupper
   6294     ret
   6295   }
   6296 }
   6297 #endif  // HAS_INTERPOLATEROW_AVX2
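
// The InterpolateRow variants above and below all blend two rows with a fixed
// vertical fraction. Scalar sketch of that arithmetic using the full 8 bit
// fraction (the SSSE3/AVX2 paths first halve it to 7 bits for pmaddubsw).
// Hypothetical name, illustrative only.
static void InterpolateRow_ReferenceSketch_C(uint8* dst_ptr,
                                             const uint8* src_ptr,
                                             ptrdiff_t src_stride, int width,
                                             int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int y1 = source_y_fraction;  // 0..255 weight of the second row.
  int y0 = 256 - y1;           // weight of the first row.
  for (int i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * y0 + src_ptr1[i] * y1) >> 8);
  }
}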
   6298 
   6299 #ifdef HAS_INTERPOLATEROW_SSSE3
   6300 // Bilinear filter 16x2 -> 16x1
   6301 __declspec(naked) __declspec(align(16))
   6302 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   6303                           ptrdiff_t src_stride, int dst_width,
   6304                           int source_y_fraction) {
   6305   __asm {
   6306     push       esi
   6307     push       edi
   6308     mov        edi, [esp + 8 + 4]   // dst_ptr
   6309     mov        esi, [esp + 8 + 8]   // src_ptr
   6310     mov        edx, [esp + 8 + 12]  // src_stride
   6311     mov        ecx, [esp + 8 + 16]  // dst_width
   6312     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   6313     sub        edi, esi
   6314     shr        eax, 1
   6315     // Dispatch to specialized filters if applicable.
   6316     cmp        eax, 0
   6317     je         xloop100  // 0 / 128.  Blend 100 / 0.
   6318     cmp        eax, 32
   6319     je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
   6320     cmp        eax, 64
   6321     je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
   6322     cmp        eax, 96
   6323     je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
   6324 
   6325     movd       xmm0, eax  // high fraction 0..127
   6326     neg        eax
   6327     add        eax, 128
   6328     movd       xmm5, eax  // low fraction 128..1
   6329     punpcklbw  xmm5, xmm0
   6330     punpcklwd  xmm5, xmm5
   6331     pshufd     xmm5, xmm5, 0
   6332 
   6333     align      4
   6334   xloop:
   6335     movdqa     xmm0, [esi]
   6336     movdqa     xmm2, [esi + edx]
   6337     movdqa     xmm1, xmm0
   6338     punpcklbw  xmm0, xmm2
   6339     punpckhbw  xmm1, xmm2
   6340     pmaddubsw  xmm0, xmm5
   6341     pmaddubsw  xmm1, xmm5
   6342     psrlw      xmm0, 7
   6343     psrlw      xmm1, 7
   6344     packuswb   xmm0, xmm1
   6345     sub        ecx, 16
   6346     movdqa     [esi + edi], xmm0
   6347     lea        esi, [esi + 16]
   6348     jg         xloop
   6349     jmp        xloop99
   6350 
   6351     // Blend 25 / 75.
   6352     align      4
   6353   xloop25:
   6354     movdqa     xmm0, [esi]
   6355     movdqa     xmm1, [esi + edx]
   6356     pavgb      xmm0, xmm1
   6357     pavgb      xmm0, xmm1
   6358     sub        ecx, 16
   6359     movdqa     [esi + edi], xmm0
   6360     lea        esi, [esi + 16]
   6361     jg         xloop25
   6362     jmp        xloop99
   6363 
   6364     // Blend 50 / 50.
   6365     align      4
   6366   xloop50:
   6367     movdqa     xmm0, [esi]
   6368     movdqa     xmm1, [esi + edx]
   6369     pavgb      xmm0, xmm1
   6370     sub        ecx, 16
   6371     movdqa     [esi + edi], xmm0
   6372     lea        esi, [esi + 16]
   6373     jg         xloop50
   6374     jmp        xloop99
   6375 
   6376     // Blend 75 / 25.
   6377     align      4
   6378   xloop75:
   6379     movdqa     xmm1, [esi]
   6380     movdqa     xmm0, [esi + edx]
   6381     pavgb      xmm0, xmm1
   6382     pavgb      xmm0, xmm1
   6383     sub        ecx, 16
   6384     movdqa     [esi + edi], xmm0
   6385     lea        esi, [esi + 16]
   6386     jg         xloop75
   6387     jmp        xloop99
   6388 
   6389     // Blend 100 / 0 - Copy row unchanged.
   6390     align      4
   6391   xloop100:
   6392     movdqa     xmm0, [esi]
   6393     sub        ecx, 16
   6394     movdqa     [esi + edi], xmm0
   6395     lea        esi, [esi + 16]
   6396     jg         xloop100
   6397 
   6398   xloop99:
   6399     pop        edi
   6400     pop        esi
   6401     ret
   6402   }
   6403 }
   6404 #endif  // HAS_INTERPOLATEROW_SSSE3
   6405 
   6406 #ifdef HAS_INTERPOLATEROW_SSE2
   6407 // Bilinear filter 16x2 -> 16x1
   6408 __declspec(naked) __declspec(align(16))
   6409 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
   6410                          ptrdiff_t src_stride, int dst_width,
   6411                          int source_y_fraction) {
   6412   __asm {
   6413     push       esi
   6414     push       edi
   6415     mov        edi, [esp + 8 + 4]   // dst_ptr
   6416     mov        esi, [esp + 8 + 8]   // src_ptr
   6417     mov        edx, [esp + 8 + 12]  // src_stride
   6418     mov        ecx, [esp + 8 + 16]  // dst_width
   6419     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   6420     sub        edi, esi
   6421     // Dispatch to specialized filters if applicable.
   6422     cmp        eax, 0
   6423     je         xloop100  // 0 / 256.  Blend 100 / 0.
   6424     cmp        eax, 64
   6425     je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
   6426     cmp        eax, 128
   6427     je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
   6428     cmp        eax, 192
   6429     je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
   6430 
   6431     movd       xmm5, eax            // xmm5 = y fraction
   6432     punpcklbw  xmm5, xmm5
   6433     psrlw      xmm5, 1
   6434     punpcklwd  xmm5, xmm5
   6435     punpckldq  xmm5, xmm5
   6436     punpcklqdq xmm5, xmm5
   6437     pxor       xmm4, xmm4
   6438 
   6439     align      4
   6440   xloop:
   6441     movdqa     xmm0, [esi]  // row0
   6442     movdqa     xmm2, [esi + edx]  // row1
   6443     movdqa     xmm1, xmm0
   6444     movdqa     xmm3, xmm2
   6445     punpcklbw  xmm2, xmm4
   6446     punpckhbw  xmm3, xmm4
   6447     punpcklbw  xmm0, xmm4
   6448     punpckhbw  xmm1, xmm4
   6449     psubw      xmm2, xmm0  // row1 - row0
   6450     psubw      xmm3, xmm1
   6451     paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
   6452     paddw      xmm3, xmm3
   6453     pmulhw     xmm2, xmm5  // scale diff
   6454     pmulhw     xmm3, xmm5
   6455     paddw      xmm0, xmm2  // sum rows
   6456     paddw      xmm1, xmm3
   6457     packuswb   xmm0, xmm1
   6458     sub        ecx, 16
   6459     movdqa     [esi + edi], xmm0
   6460     lea        esi, [esi + 16]
   6461     jg         xloop
   6462     jmp        xloop99
   6463 
   6464     // Blend 25 / 75.
   6465     align      4
   6466   xloop25:
   6467     movdqa     xmm0, [esi]
   6468     movdqa     xmm1, [esi + edx]
   6469     pavgb      xmm0, xmm1
   6470     pavgb      xmm0, xmm1
   6471     sub        ecx, 16
   6472     movdqa     [esi + edi], xmm0
   6473     lea        esi, [esi + 16]
   6474     jg         xloop25
   6475     jmp        xloop99
   6476 
   6477     // Blend 50 / 50.
   6478     align      4
   6479   xloop50:
   6480     movdqa     xmm0, [esi]
   6481     movdqa     xmm1, [esi + edx]
   6482     pavgb      xmm0, xmm1
   6483     sub        ecx, 16
   6484     movdqa     [esi + edi], xmm0
   6485     lea        esi, [esi + 16]
   6486     jg         xloop50
   6487     jmp        xloop99
   6488 
   6489     // Blend 75 / 25.
   6490     align      4
   6491   xloop75:
   6492     movdqa     xmm1, [esi]
   6493     movdqa     xmm0, [esi + edx]
   6494     pavgb      xmm0, xmm1
   6495     pavgb      xmm0, xmm1
   6496     sub        ecx, 16
   6497     movdqa     [esi + edi], xmm0
   6498     lea        esi, [esi + 16]
   6499     jg         xloop75
   6500     jmp        xloop99
   6501 
   6502     // Blend 100 / 0 - Copy row unchanged.
   6503     align      4
   6504   xloop100:
   6505     movdqa     xmm0, [esi]
   6506     sub        ecx, 16
   6507     movdqa     [esi + edi], xmm0
   6508     lea        esi, [esi + 16]
   6509     jg         xloop100
   6510 
   6511   xloop99:
   6512     pop        edi
   6513     pop        esi
   6514     ret
   6515   }
   6516 }
   6517 #endif  // HAS_INTERPOLATEROW_SSE2
   6518 
   6519 // Bilinear filter 16x2 -> 16x1
   6520 __declspec(naked) __declspec(align(16))
   6521 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   6522                                     ptrdiff_t src_stride, int dst_width,
   6523                                     int source_y_fraction) {
   6524   __asm {
   6525     push       esi
   6526     push       edi
   6527     mov        edi, [esp + 8 + 4]   // dst_ptr
   6528     mov        esi, [esp + 8 + 8]   // src_ptr
   6529     mov        edx, [esp + 8 + 12]  // src_stride
   6530     mov        ecx, [esp + 8 + 16]  // dst_width
   6531     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   6532     sub        edi, esi
   6533     shr        eax, 1
   6534     // Dispatch to specialized filters if applicable.
   6535     cmp        eax, 0
   6536     je         xloop100  // 0 / 128.  Blend 100 / 0.
   6537     cmp        eax, 32
   6538     je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
   6539     cmp        eax, 64
   6540     je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
   6541     cmp        eax, 96
   6542     je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
   6543 
   6544     movd       xmm0, eax  // high fraction 0..127
   6545     neg        eax
   6546     add        eax, 128
   6547     movd       xmm5, eax  // low fraction 128..1
   6548     punpcklbw  xmm5, xmm0
   6549     punpcklwd  xmm5, xmm5
   6550     pshufd     xmm5, xmm5, 0
   6551 
   6552     align      4
   6553   xloop:
   6554     movdqu     xmm0, [esi]
   6555     movdqu     xmm2, [esi + edx]
    6556     movdqa     xmm1, xmm0
   6557     punpcklbw  xmm0, xmm2
   6558     punpckhbw  xmm1, xmm2
   6559     pmaddubsw  xmm0, xmm5
   6560     pmaddubsw  xmm1, xmm5
   6561     psrlw      xmm0, 7
   6562     psrlw      xmm1, 7
   6563     packuswb   xmm0, xmm1
   6564     sub        ecx, 16
   6565     movdqu     [esi + edi], xmm0
   6566     lea        esi, [esi + 16]
   6567     jg         xloop
   6568     jmp        xloop99
   6569 
   6570     // Blend 25 / 75.
   6571     align      4
   6572   xloop25:
   6573     movdqu     xmm0, [esi]
   6574     movdqu     xmm1, [esi + edx]
   6575     pavgb      xmm0, xmm1
   6576     pavgb      xmm0, xmm1
   6577     sub        ecx, 16
   6578     movdqu     [esi + edi], xmm0
   6579     lea        esi, [esi + 16]
   6580     jg         xloop25
   6581     jmp        xloop99
   6582 
   6583     // Blend 50 / 50.
   6584     align      4
   6585   xloop50:
   6586     movdqu     xmm0, [esi]
   6587     movdqu     xmm1, [esi + edx]
   6588     pavgb      xmm0, xmm1
   6589     sub        ecx, 16
   6590     movdqu     [esi + edi], xmm0
   6591     lea        esi, [esi + 16]
   6592     jg         xloop50
   6593     jmp        xloop99
   6594 
   6595     // Blend 75 / 25.
   6596     align      4
   6597   xloop75:
   6598     movdqu     xmm1, [esi]
   6599     movdqu     xmm0, [esi + edx]
   6600     pavgb      xmm0, xmm1
   6601     pavgb      xmm0, xmm1
   6602     sub        ecx, 16
   6603     movdqu     [esi + edi], xmm0
   6604     lea        esi, [esi + 16]
   6605     jg         xloop75
   6606     jmp        xloop99
   6607 
   6608     // Blend 100 / 0 - Copy row unchanged.
   6609     align      4
   6610   xloop100:
   6611     movdqu     xmm0, [esi]
   6612     sub        ecx, 16
   6613     movdqu     [esi + edi], xmm0
   6614     lea        esi, [esi + 16]
   6615     jg         xloop100
   6616 
   6617   xloop99:
   6618     pop        edi
   6619     pop        esi
   6620     ret
   6621   }
   6622 }
   6623 
   6624 #ifdef HAS_INTERPOLATEROW_SSE2
   6625 // Bilinear filter 16x2 -> 16x1
   6626 __declspec(naked) __declspec(align(16))
   6627 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
   6628                                    ptrdiff_t src_stride, int dst_width,
   6629                                    int source_y_fraction) {
   6630   __asm {
   6631     push       esi
   6632     push       edi
   6633     mov        edi, [esp + 8 + 4]   // dst_ptr
   6634     mov        esi, [esp + 8 + 8]   // src_ptr
   6635     mov        edx, [esp + 8 + 12]  // src_stride
   6636     mov        ecx, [esp + 8 + 16]  // dst_width
   6637     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   6638     sub        edi, esi
   6639     // Dispatch to specialized filters if applicable.
   6640     cmp        eax, 0
   6641     je         xloop100  // 0 / 256.  Blend 100 / 0.
   6642     cmp        eax, 64
   6643     je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
   6644     cmp        eax, 128
   6645     je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
   6646     cmp        eax, 192
   6647     je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
   6648 
   6649     movd       xmm5, eax            // xmm5 = y fraction
   6650     punpcklbw  xmm5, xmm5
   6651     psrlw      xmm5, 1
   6652     punpcklwd  xmm5, xmm5
   6653     punpckldq  xmm5, xmm5
   6654     punpcklqdq xmm5, xmm5
   6655     pxor       xmm4, xmm4
   6656 
   6657     align      4
   6658   xloop:
   6659     movdqu     xmm0, [esi]  // row0
   6660     movdqu     xmm2, [esi + edx]  // row1
    6661     movdqa     xmm1, xmm0
    6662     movdqa     xmm3, xmm2
   6663     punpcklbw  xmm2, xmm4
   6664     punpckhbw  xmm3, xmm4
   6665     punpcklbw  xmm0, xmm4
   6666     punpckhbw  xmm1, xmm4
   6667     psubw      xmm2, xmm0  // row1 - row0
   6668     psubw      xmm3, xmm1
   6669     paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
   6670     paddw      xmm3, xmm3
   6671     pmulhw     xmm2, xmm5  // scale diff
   6672     pmulhw     xmm3, xmm5
   6673     paddw      xmm0, xmm2  // sum rows
   6674     paddw      xmm1, xmm3
   6675     packuswb   xmm0, xmm1
   6676     sub        ecx, 16
   6677     movdqu     [esi + edi], xmm0
   6678     lea        esi, [esi + 16]
   6679     jg         xloop
   6680     jmp        xloop99
   6681 
   6682     // Blend 25 / 75.
   6683     align      4
   6684   xloop25:
   6685     movdqu     xmm0, [esi]
   6686     movdqu     xmm1, [esi + edx]
   6687     pavgb      xmm0, xmm1
   6688     pavgb      xmm0, xmm1
   6689     sub        ecx, 16
   6690     movdqu     [esi + edi], xmm0
   6691     lea        esi, [esi + 16]
   6692     jg         xloop25
   6693     jmp        xloop99
   6694 
   6695     // Blend 50 / 50.
   6696     align      4
   6697   xloop50:
   6698     movdqu     xmm0, [esi]
   6699     movdqu     xmm1, [esi + edx]
   6700     pavgb      xmm0, xmm1
   6701     sub        ecx, 16
   6702     movdqu     [esi + edi], xmm0
   6703     lea        esi, [esi + 16]
   6704     jg         xloop50
   6705     jmp        xloop99
   6706 
   6707     // Blend 75 / 25.
   6708     align      4
   6709   xloop75:
   6710     movdqu     xmm1, [esi]
   6711     movdqu     xmm0, [esi + edx]
   6712     pavgb      xmm0, xmm1
   6713     pavgb      xmm0, xmm1
   6714     sub        ecx, 16
   6715     movdqu     [esi + edi], xmm0
   6716     lea        esi, [esi + 16]
   6717     jg         xloop75
   6718     jmp        xloop99
   6719 
   6720     // Blend 100 / 0 - Copy row unchanged.
   6721     align      4
   6722   xloop100:
   6723     movdqu     xmm0, [esi]
   6724     sub        ecx, 16
   6725     movdqu     [esi + edi], xmm0
   6726     lea        esi, [esi + 16]
   6727     jg         xloop100
   6728 
   6729   xloop99:
   6730     pop        edi
   6731     pop        esi
   6732     ret
   6733   }
   6734 }
   6735 #endif  // HAS_INTERPOLATEROW_SSE2
   6736 
// HalfRow: vertically averages two rows into one, dst = (row0 + row1 + 1) >> 1 per byte (pavgb).
    6737 __declspec(naked) __declspec(align(16))
   6738 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
   6739                   uint8* dst_uv, int pix) {
   6740   __asm {
   6741     push       edi
   6742     mov        eax, [esp + 4 + 4]    // src_uv
   6743     mov        edx, [esp + 4 + 8]    // src_uv_stride
    6744     mov        edi, [esp + 4 + 12]   // dst_uv
   6745     mov        ecx, [esp + 4 + 16]   // pix
   6746     sub        edi, eax
   6747 
   6748     align      4
   6749   convertloop:
   6750     movdqa     xmm0, [eax]
   6751     pavgb      xmm0, [eax + edx]
   6752     sub        ecx, 16
   6753     movdqa     [eax + edi], xmm0
   6754     lea        eax,  [eax + 16]
   6755     jg         convertloop
   6756     pop        edi
   6757     ret
   6758   }
   6759 }
   6760 
   6761 #ifdef HAS_HALFROW_AVX2
   6762 __declspec(naked) __declspec(align(16))
   6763 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
   6764                   uint8* dst_uv, int pix) {
   6765   __asm {
   6766     push       edi
   6767     mov        eax, [esp + 4 + 4]    // src_uv
   6768     mov        edx, [esp + 4 + 8]    // src_uv_stride
    6769     mov        edi, [esp + 4 + 12]   // dst_uv
   6770     mov        ecx, [esp + 4 + 16]   // pix
   6771     sub        edi, eax
   6772 
   6773     align      4
   6774   convertloop:
   6775     vmovdqu    ymm0, [eax]
   6776     vpavgb     ymm0, ymm0, [eax + edx]
   6777     sub        ecx, 32
   6778     vmovdqu    [eax + edi], ymm0
   6779     lea        eax,  [eax + 32]
   6780     jg         convertloop
   6781 
   6782     pop        edi
   6783     vzeroupper
   6784     ret
   6785   }
   6786 }
   6787 #endif  // HAS_HALFROW_AVX2
   6788 
// Selects 4 bytes from every 4 ARGB pixels (16 bytes) per the byte indices in selector.
    6789 __declspec(naked) __declspec(align(16))
   6790 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
   6791                           uint32 selector, int pix) {
   6792   __asm {
   6793     mov        eax, [esp + 4]    // src_argb
   6794     mov        edx, [esp + 8]    // dst_bayer
   6795     movd       xmm5, [esp + 12]  // selector
   6796     mov        ecx, [esp + 16]   // pix
   6797     pshufd     xmm5, xmm5, 0
   6798 
   6799     align      4
   6800   wloop:
   6801     movdqa     xmm0, [eax]
   6802     movdqa     xmm1, [eax + 16]
   6803     lea        eax, [eax + 32]
   6804     pshufb     xmm0, xmm5
   6805     pshufb     xmm1, xmm5
   6806     punpckldq  xmm0, xmm1
   6807     sub        ecx, 8
   6808     movq       qword ptr [edx], xmm0
   6809     lea        edx, [edx + 8]
   6810     jg         wloop
   6811     ret
   6812   }
   6813 }
   6814 
   6815 // Specialized ARGB to Bayer that just isolates G channel.
   6816 __declspec(naked) __declspec(align(16))
   6817 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
   6818                            uint32 selector, int pix) {
   6819   __asm {
   6820     mov        eax, [esp + 4]    // src_argb
   6821     mov        edx, [esp + 8]    // dst_bayer
   6822                                  // selector
   6823     mov        ecx, [esp + 16]   // pix
   6824     pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
   6825     psrld      xmm5, 24
   6826 
   6827     align      4
   6828   wloop:
   6829     movdqa     xmm0, [eax]
   6830     movdqa     xmm1, [eax + 16]
   6831     lea        eax, [eax + 32]
   6832     psrld      xmm0, 8  // Move green to bottom.
   6833     psrld      xmm1, 8
   6834     pand       xmm0, xmm5
   6835     pand       xmm1, xmm5
   6836     packssdw   xmm0, xmm1
   6837     packuswb   xmm0, xmm1
   6838     sub        ecx, 8
   6839     movq       qword ptr [edx], xmm0
   6840     lea        edx, [edx + 8]
   6841     jg         wloop
   6842     ret
   6843   }
   6844 }
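
// Scalar equivalent of the G-channel extraction above. Hypothetical name,
// illustrative only.
static void ARGBToBayerGGRow_ReferenceSketch_C(const uint8* src_argb,
                                               uint8* dst_bayer, int width) {
  for (int i = 0; i < width; ++i) {
    dst_bayer[i] = src_argb[i * 4 + 1];  // G is the second byte of B,G,R,A.
  }
}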
   6845 
   6846 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
   6847 __declspec(naked) __declspec(align(16))
   6848 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
   6849                           const uint8* shuffler, int pix) {
   6850   __asm {
   6851     mov        eax, [esp + 4]    // src_argb
   6852     mov        edx, [esp + 8]    // dst_argb
   6853     mov        ecx, [esp + 12]   // shuffler
   6854     movdqa     xmm5, [ecx]
   6855     mov        ecx, [esp + 16]   // pix
   6856 
   6857     align      4
   6858   wloop:
   6859     movdqa     xmm0, [eax]
   6860     movdqa     xmm1, [eax + 16]
   6861     lea        eax, [eax + 32]
   6862     pshufb     xmm0, xmm5
   6863     pshufb     xmm1, xmm5
   6864     sub        ecx, 8
   6865     movdqa     [edx], xmm0
   6866     movdqa     [edx + 16], xmm1
   6867     lea        edx, [edx + 32]
   6868     jg         wloop
   6869     ret
   6870   }
   6871 }
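
// Scalar sketch of the channel shuffle: for the channel-swap masks the
// library uses, each of the 4 bytes of an output pixel is the source byte
// selected by the low 2 bits of the corresponding shuffler entry.
// Hypothetical name, illustrative only.
static void ARGBShuffleRow_ReferenceSketch_C(const uint8* src_argb,
                                             uint8* dst_argb,
                                             const uint8* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) {
      dst_argb[i * 4 + j] = src_argb[i * 4 + (shuffler[j] & 3)];
    }
  }
}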
   6872 
   6873 __declspec(naked) __declspec(align(16))
   6874 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
   6875                                     const uint8* shuffler, int pix) {
   6876   __asm {
   6877     mov        eax, [esp + 4]    // src_argb
   6878     mov        edx, [esp + 8]    // dst_argb
   6879     mov        ecx, [esp + 12]   // shuffler
   6880     movdqa     xmm5, [ecx]
   6881     mov        ecx, [esp + 16]   // pix
   6882 
   6883     align      4
   6884   wloop:
   6885     movdqu     xmm0, [eax]
   6886     movdqu     xmm1, [eax + 16]
   6887     lea        eax, [eax + 32]
   6888     pshufb     xmm0, xmm5
   6889     pshufb     xmm1, xmm5
   6890     sub        ecx, 8
   6891     movdqu     [edx], xmm0
   6892     movdqu     [edx + 16], xmm1
   6893     lea        edx, [edx + 32]
   6894     jg         wloop
   6895     ret
   6896   }
   6897 }
   6898 
   6899 #ifdef HAS_ARGBSHUFFLEROW_AVX2
   6900 __declspec(naked) __declspec(align(16))
   6901 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
   6902                          const uint8* shuffler, int pix) {
   6903   __asm {
   6904     mov        eax, [esp + 4]     // src_argb
   6905     mov        edx, [esp + 8]     // dst_argb
   6906     mov        ecx, [esp + 12]    // shuffler
   6907     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
   6908     mov        ecx, [esp + 16]    // pix
   6909 
   6910     align      4
   6911   wloop:
   6912     vmovdqu    ymm0, [eax]
   6913     vmovdqu    ymm1, [eax + 32]
   6914     lea        eax, [eax + 64]
   6915     vpshufb    ymm0, ymm0, ymm5
   6916     vpshufb    ymm1, ymm1, ymm5
   6917     sub        ecx, 16
   6918     vmovdqu    [edx], ymm0
   6919     vmovdqu    [edx + 32], ymm1
   6920     lea        edx, [edx + 64]
   6921     jg         wloop
   6922 
   6923     vzeroupper
   6924     ret
   6925   }
   6926 }
   6927 #endif  // HAS_ARGBSHUFFLEROW_AVX2
   6928 
   6929 __declspec(naked) __declspec(align(16))
   6930 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   6931                          const uint8* shuffler, int pix) {
   6932   __asm {
   6933     push       ebx
   6934     push       esi
   6935     mov        eax, [esp + 8 + 4]    // src_argb
   6936     mov        edx, [esp + 8 + 8]    // dst_argb
   6937     mov        esi, [esp + 8 + 12]   // shuffler
   6938     mov        ecx, [esp + 8 + 16]   // pix
   6939     pxor       xmm5, xmm5
   6940 
   6941     mov        ebx, [esi]   // shuffler
   6942     cmp        ebx, 0x03000102
   6943     je         shuf_3012
   6944     cmp        ebx, 0x00010203
   6945     je         shuf_0123
   6946     cmp        ebx, 0x00030201
   6947     je         shuf_0321
   6948     cmp        ebx, 0x02010003
   6949     je         shuf_2103
   6950 
   6951   // TODO(fbarchard): Use one source pointer and 3 offsets.
   6952   shuf_any1:
   6953     movzx      ebx, byte ptr [esi]
   6954     movzx      ebx, byte ptr [eax + ebx]
   6955     mov        [edx], bl
   6956     movzx      ebx, byte ptr [esi + 1]
   6957     movzx      ebx, byte ptr [eax + ebx]
   6958     mov        [edx + 1], bl
   6959     movzx      ebx, byte ptr [esi + 2]
   6960     movzx      ebx, byte ptr [eax + ebx]
   6961     mov        [edx + 2], bl
   6962     movzx      ebx, byte ptr [esi + 3]
   6963     movzx      ebx, byte ptr [eax + ebx]
   6964     mov        [edx + 3], bl
   6965     lea        eax, [eax + 4]
   6966     lea        edx, [edx + 4]
   6967     sub        ecx, 1
   6968     jg         shuf_any1
   6969     jmp        shuf99
   6970 
   6971     align      4
   6972   shuf_0123:
   6973     movdqu     xmm0, [eax]
   6974     lea        eax, [eax + 16]
   6975     movdqa     xmm1, xmm0
   6976     punpcklbw  xmm0, xmm5
   6977     punpckhbw  xmm1, xmm5
   6978     pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
   6979     pshuflw    xmm0, xmm0, 01Bh
   6980     pshufhw    xmm1, xmm1, 01Bh
   6981     pshuflw    xmm1, xmm1, 01Bh
   6982     packuswb   xmm0, xmm1
   6983     sub        ecx, 4
   6984     movdqu     [edx], xmm0
   6985     lea        edx, [edx + 16]
   6986     jg         shuf_0123
   6987     jmp        shuf99
   6988 
   6989     align      4
   6990   shuf_0321:
   6991     movdqu     xmm0, [eax]
   6992     lea        eax, [eax + 16]
   6993     movdqa     xmm1, xmm0
   6994     punpcklbw  xmm0, xmm5
   6995     punpckhbw  xmm1, xmm5
   6996     pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
   6997     pshuflw    xmm0, xmm0, 039h
   6998     pshufhw    xmm1, xmm1, 039h
   6999     pshuflw    xmm1, xmm1, 039h
   7000     packuswb   xmm0, xmm1
   7001     sub        ecx, 4
   7002     movdqu     [edx], xmm0
   7003     lea        edx, [edx + 16]
   7004     jg         shuf_0321
   7005     jmp        shuf99
   7006 
   7007     align      4
   7008   shuf_2103:
   7009     movdqu     xmm0, [eax]
   7010     lea        eax, [eax + 16]
   7011     movdqa     xmm1, xmm0
   7012     punpcklbw  xmm0, xmm5
   7013     punpckhbw  xmm1, xmm5
   7014     pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
   7015     pshuflw    xmm0, xmm0, 093h
   7016     pshufhw    xmm1, xmm1, 093h
   7017     pshuflw    xmm1, xmm1, 093h
   7018     packuswb   xmm0, xmm1
   7019     sub        ecx, 4
   7020     movdqu     [edx], xmm0
   7021     lea        edx, [edx + 16]
   7022     jg         shuf_2103
   7023     jmp        shuf99
   7024 
   7025     align      4
   7026   shuf_3012:
   7027     movdqu     xmm0, [eax]
   7028     lea        eax, [eax + 16]
   7029     movdqa     xmm1, xmm0
   7030     punpcklbw  xmm0, xmm5
   7031     punpckhbw  xmm1, xmm5
   7032     pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
   7033     pshuflw    xmm0, xmm0, 0C6h
   7034     pshufhw    xmm1, xmm1, 0C6h
   7035     pshuflw    xmm1, xmm1, 0C6h
   7036     packuswb   xmm0, xmm1
   7037     sub        ecx, 4
   7038     movdqu     [edx], xmm0
   7039     lea        edx, [edx + 16]
   7040     jg         shuf_3012
   7041 
   7042   shuf99:
   7043     pop        esi
   7044     pop        ebx
   7045     ret
   7046   }
   7047 }
   7048 
   7049 // YUY2 - Macro-pixel = 2 image pixels
   7050 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
   7051 
   7052 // UYVY - Macro-pixel = 2 image pixels
   7053 // U0Y0V0Y1
   7054 
   7055 __declspec(naked) __declspec(align(16))
   7056 void I422ToYUY2Row_SSE2(const uint8* src_y,
   7057                         const uint8* src_u,
   7058                         const uint8* src_v,
   7059                         uint8* dst_frame, int width) {
   7060   __asm {
   7061     push       esi
   7062     push       edi
   7063     mov        eax, [esp + 8 + 4]    // src_y
   7064     mov        esi, [esp + 8 + 8]    // src_u
   7065     mov        edx, [esp + 8 + 12]   // src_v
   7066     mov        edi, [esp + 8 + 16]   // dst_frame
   7067     mov        ecx, [esp + 8 + 20]   // width
   7068     sub        edx, esi
   7069 
   7070     align      4
   7071   convertloop:
   7072     movq       xmm2, qword ptr [esi] // U
   7073     movq       xmm3, qword ptr [esi + edx] // V
   7074     lea        esi, [esi + 8]
   7075     punpcklbw  xmm2, xmm3 // UV
   7076     movdqu     xmm0, [eax] // Y
   7077     lea        eax, [eax + 16]
   7078     movdqa     xmm1, xmm0
   7079     punpcklbw  xmm0, xmm2 // YUYV
   7080     punpckhbw  xmm1, xmm2
   7081     movdqu     [edi], xmm0
   7082     movdqu     [edi + 16], xmm1
   7083     lea        edi, [edi + 32]
   7084     sub        ecx, 16
   7085     jg         convertloop
   7086 
   7087     pop        edi
   7088     pop        esi
   7089     ret
   7090   }
   7091 }
   7092 
   7093 __declspec(naked) __declspec(align(16))
   7094 void I422ToUYVYRow_SSE2(const uint8* src_y,
   7095                         const uint8* src_u,
   7096                         const uint8* src_v,
   7097                         uint8* dst_frame, int width) {
   7098   __asm {
   7099     push       esi
   7100     push       edi
   7101     mov        eax, [esp + 8 + 4]    // src_y
   7102     mov        esi, [esp + 8 + 8]    // src_u
   7103     mov        edx, [esp + 8 + 12]   // src_v
   7104     mov        edi, [esp + 8 + 16]   // dst_frame
   7105     mov        ecx, [esp + 8 + 20]   // width
   7106     sub        edx, esi
   7107 
   7108     align      4
   7109   convertloop:
   7110     movq       xmm2, qword ptr [esi] // U
   7111     movq       xmm3, qword ptr [esi + edx] // V
   7112     lea        esi, [esi + 8]
   7113     punpcklbw  xmm2, xmm3 // UV
   7114     movdqu     xmm0, [eax] // Y
   7115     movdqa     xmm1, xmm2
   7116     lea        eax, [eax + 16]
   7117     punpcklbw  xmm1, xmm0 // UYVY
   7118     punpckhbw  xmm2, xmm0
   7119     movdqu     [edi], xmm1
   7120     movdqu     [edi + 16], xmm2
   7121     lea        edi, [edi + 32]
   7122     sub        ecx, 16
   7123     jg         convertloop
   7124 
   7125     pop        edi
   7126     pop        esi
   7127     ret
   7128   }
   7129 }
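
// Scalar sketch of the 4:2:2 packing above: each macro-pixel interleaves two
// luma samples with one shared U and one shared V (YUY2 order shown; UYVY
// swaps bytes 0/1 and 2/3). Hypothetical name, illustrative only; width is
// assumed even.
static void I422ToYUY2Row_ReferenceSketch_C(const uint8* src_y,
                                            const uint8* src_u,
                                            const uint8* src_v,
                                            uint8* dst_frame, int width) {
  for (int i = 0; i < width; i += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = *src_u;    // U
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = *src_v;    // V
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_frame += 4;
  }
}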
   7130 
   7131 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
   7132 __declspec(naked) __declspec(align(16))
   7133 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
   7134                             uint8* dst_argb, const float* poly,
   7135                             int width) {
   7136   __asm {
   7137     push       esi
   7138     mov        eax, [esp + 4 + 4]   /* src_argb */
   7139     mov        edx, [esp + 4 + 8]   /* dst_argb */
   7140     mov        esi, [esp + 4 + 12]  /* poly */
   7141     mov        ecx, [esp + 4 + 16]  /* width */
   7142     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
   7143 
   7144     // 2 pixel loop.
   7145     align      4
   7146  convertloop:
   7147 //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
   7148 //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
   7149     movq       xmm0, qword ptr [eax]  // BGRABGRA
   7150     lea        eax, [eax + 8]
   7151     punpcklbw  xmm0, xmm3
   7152     movdqa     xmm4, xmm0
   7153     punpcklwd  xmm0, xmm3  // pixel 0
   7154     punpckhwd  xmm4, xmm3  // pixel 1
   7155     cvtdq2ps   xmm0, xmm0  // 4 floats
   7156     cvtdq2ps   xmm4, xmm4
   7157     movdqa     xmm1, xmm0  // X
   7158     movdqa     xmm5, xmm4
   7159     mulps      xmm0, [esi + 16]  // C1 * X
   7160     mulps      xmm4, [esi + 16]
   7161     addps      xmm0, [esi]  // result = C0 + C1 * X
   7162     addps      xmm4, [esi]
   7163     movdqa     xmm2, xmm1
   7164     movdqa     xmm6, xmm5
   7165     mulps      xmm2, xmm1  // X * X
   7166     mulps      xmm6, xmm5
   7167     mulps      xmm1, xmm2  // X * X * X
   7168     mulps      xmm5, xmm6
   7169     mulps      xmm2, [esi + 32]  // C2 * X * X
   7170     mulps      xmm6, [esi + 32]
   7171     mulps      xmm1, [esi + 48]  // C3 * X * X * X
   7172     mulps      xmm5, [esi + 48]
   7173     addps      xmm0, xmm2  // result += C2 * X * X
   7174     addps      xmm4, xmm6
   7175     addps      xmm0, xmm1  // result += C3 * X * X * X
   7176     addps      xmm4, xmm5
   7177     cvttps2dq  xmm0, xmm0
   7178     cvttps2dq  xmm4, xmm4
   7179     packuswb   xmm0, xmm4
   7180     packuswb   xmm0, xmm0
   7181     sub        ecx, 2
   7182     movq       qword ptr [edx], xmm0
   7183     lea        edx, [edx + 8]
   7184     jg         convertloop
   7185     pop        esi
   7186     ret
   7187   }
   7188 }
   7189 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
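
// Scalar sketch of the cubic evaluated above: each channel value X maps to
// C0 + C1*X + C2*X*X + C3*X*X*X, with one coefficient per channel in each
// group of 4 floats of poly. Hypothetical name, illustrative only.
static void ARGBPolynomialRow_ReferenceSketch_C(const uint8* src_argb,
                                                uint8* dst_argb,
                                                const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float r = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      int v = (int)r;  // cvttps2dq truncates; packuswb then clamps to 0..255.
      dst_argb[i * 4 + c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}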
   7190 
   7191 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
   7192 __declspec(naked) __declspec(align(16))
   7193 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
   7194                             uint8* dst_argb, const float* poly,
   7195                             int width) {
   7196   __asm {
   7197     mov        eax, [esp + 4]   /* src_argb */
   7198     mov        edx, [esp + 8]   /* dst_argb */
   7199     mov        ecx, [esp + 12]   /* poly */
   7200     vbroadcastf128 ymm4, [ecx]       // C0
   7201     vbroadcastf128 ymm5, [ecx + 16]  // C1
   7202     vbroadcastf128 ymm6, [ecx + 32]  // C2
   7203     vbroadcastf128 ymm7, [ecx + 48]  // C3
   7204     mov        ecx, [esp + 16]  /* width */
   7205 
   7206     // 2 pixel loop.
   7207     align      4
   7208  convertloop:
   7209     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
   7210     lea         eax, [eax + 8]
   7211     vcvtdq2ps   ymm0, ymm0        // X 8 floats
   7212     vmulps      ymm2, ymm0, ymm0  // X * X
   7213     vmulps      ymm3, ymm0, ymm7  // C3 * X
   7214     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
   7215     vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
   7216     vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
   7217     vcvttps2dq  ymm0, ymm0
   7218     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
   7219     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
   7220     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
   7221     sub         ecx, 2
   7222     vmovq       qword ptr [edx], xmm0
   7223     lea         edx, [edx + 8]
   7224     jg          convertloop
   7225     vzeroupper
   7226     ret
   7227   }
   7228 }
   7229 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
   7230 
   7231 #ifdef HAS_ARGBCOLORTABLEROW_X86
    7232 // Transform ARGB pixels with color table.
   7233 __declspec(naked) __declspec(align(16))
   7234 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
   7235                            int width) {
   7236   __asm {
   7237     push       esi
   7238     mov        eax, [esp + 4 + 4]   /* dst_argb */
   7239     mov        esi, [esp + 4 + 8]   /* table_argb */
   7240     mov        ecx, [esp + 4 + 12]  /* width */
   7241 
   7242     // 1 pixel loop.
   7243     align      4
   7244   convertloop:
   7245     movzx      edx, byte ptr [eax]
   7246     lea        eax, [eax + 4]
   7247     movzx      edx, byte ptr [esi + edx * 4]
   7248     mov        byte ptr [eax - 4], dl
   7249     movzx      edx, byte ptr [eax - 4 + 1]
   7250     movzx      edx, byte ptr [esi + edx * 4 + 1]
   7251     mov        byte ptr [eax - 4 + 1], dl
   7252     movzx      edx, byte ptr [eax - 4 + 2]
   7253     movzx      edx, byte ptr [esi + edx * 4 + 2]
   7254     mov        byte ptr [eax - 4 + 2], dl
   7255     movzx      edx, byte ptr [eax - 4 + 3]
   7256     movzx      edx, byte ptr [esi + edx * 4 + 3]
   7257     mov        byte ptr [eax - 4 + 3], dl
   7258     dec        ecx
   7259     jg         convertloop
   7260     pop        esi
   7261     ret
   7262   }
   7263 }
   7264 #endif  // HAS_ARGBCOLORTABLEROW_X86
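
// Scalar sketch of the in-place lookup above: each channel indexes its own
// 256-entry table; the four tables are interleaved in table_argb with a
// stride of 4 bytes. Hypothetical name, illustrative only.
static void ARGBColorTableRow_ReferenceSketch_C(uint8* dst_argb,
                                                const uint8* table_argb,
                                                int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst_argb[i * 4 + c] = table_argb[dst_argb[i * 4 + c] * 4 + c];
    }
  }
}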
   7265 
   7266 #ifdef HAS_RGBCOLORTABLEROW_X86
    7267 // Transform the RGB channels of ARGB pixels with a color table; alpha is unchanged.
   7268 __declspec(naked) __declspec(align(16))
   7269 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
   7270   __asm {
   7271     push       esi
   7272     mov        eax, [esp + 4 + 4]   /* dst_argb */
   7273     mov        esi, [esp + 4 + 8]   /* table_argb */
   7274     mov        ecx, [esp + 4 + 12]  /* width */
   7275 
   7276     // 1 pixel loop.
   7277     align      4
   7278   convertloop:
   7279     movzx      edx, byte ptr [eax]
   7280     lea        eax, [eax + 4]
   7281     movzx      edx, byte ptr [esi + edx * 4]
   7282     mov        byte ptr [eax - 4], dl
   7283     movzx      edx, byte ptr [eax - 4 + 1]
   7284     movzx      edx, byte ptr [esi + edx * 4 + 1]
   7285     mov        byte ptr [eax - 4 + 1], dl
   7286     movzx      edx, byte ptr [eax - 4 + 2]
   7287     movzx      edx, byte ptr [esi + edx * 4 + 2]
   7288     mov        byte ptr [eax - 4 + 2], dl
   7289     dec        ecx
   7290     jg         convertloop
   7291 
   7292     pop        esi
   7293     ret
   7294   }
   7295 }
   7296 #endif  // HAS_RGBCOLORTABLEROW_X86
   7297 
   7298 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
    7299 // Transform the RGB channels of ARGB pixels with a luma-selected table; alpha is copied.
   7300 __declspec(naked) __declspec(align(16))
   7301 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
   7302                                  int width,
   7303                                  const uint8* luma, uint32 lumacoeff) {
   7304   __asm {
   7305     push       esi
   7306     push       edi
   7307     mov        eax, [esp + 8 + 4]   /* src_argb */
   7308     mov        edi, [esp + 8 + 8]   /* dst_argb */
   7309     mov        ecx, [esp + 8 + 12]  /* width */
   7310     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
   7311     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
   7312     pshufd     xmm2, xmm2, 0
   7313     pshufd     xmm3, xmm3, 0
   7314     pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
   7315     psllw      xmm4, 8
   7316     pxor       xmm5, xmm5
   7317 
   7318     // 4 pixel loop.
   7319     align      4
   7320   convertloop:
    7321     movdqu     xmm0, [eax]      // 4 ARGB pixels, used to compute luma table ptrs
   7322     pmaddubsw  xmm0, xmm3
   7323     phaddw     xmm0, xmm0
   7324     pand       xmm0, xmm4  // mask out low bits
   7325     punpcklwd  xmm0, xmm5
   7326     paddd      xmm0, xmm2  // add table base
   7327     movd       esi, xmm0
   7328     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   7329 
   7330     movzx      edx, byte ptr [eax]
   7331     movzx      edx, byte ptr [esi + edx]
   7332     mov        byte ptr [edi], dl
   7333     movzx      edx, byte ptr [eax + 1]
   7334     movzx      edx, byte ptr [esi + edx]
   7335     mov        byte ptr [edi + 1], dl
   7336     movzx      edx, byte ptr [eax + 2]
   7337     movzx      edx, byte ptr [esi + edx]
   7338     mov        byte ptr [edi + 2], dl
   7339     movzx      edx, byte ptr [eax + 3]  // copy alpha.
   7340     mov        byte ptr [edi + 3], dl
   7341 
   7342     movd       esi, xmm0
   7343     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   7344 
   7345     movzx      edx, byte ptr [eax + 4]
   7346     movzx      edx, byte ptr [esi + edx]
   7347     mov        byte ptr [edi + 4], dl
   7348     movzx      edx, byte ptr [eax + 5]
   7349     movzx      edx, byte ptr [esi + edx]
   7350     mov        byte ptr [edi + 5], dl
   7351     movzx      edx, byte ptr [eax + 6]
   7352     movzx      edx, byte ptr [esi + edx]
   7353     mov        byte ptr [edi + 6], dl
   7354     movzx      edx, byte ptr [eax + 7]  // copy alpha.
   7355     mov        byte ptr [edi + 7], dl
   7356 
   7357     movd       esi, xmm0
   7358     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   7359 
   7360     movzx      edx, byte ptr [eax + 8]
   7361     movzx      edx, byte ptr [esi + edx]
   7362     mov        byte ptr [edi + 8], dl
   7363     movzx      edx, byte ptr [eax + 9]
   7364     movzx      edx, byte ptr [esi + edx]
   7365     mov        byte ptr [edi + 9], dl
   7366     movzx      edx, byte ptr [eax + 10]
   7367     movzx      edx, byte ptr [esi + edx]
   7368     mov        byte ptr [edi + 10], dl
   7369     movzx      edx, byte ptr [eax + 11]  // copy alpha.
   7370     mov        byte ptr [edi + 11], dl
   7371 
   7372     movd       esi, xmm0
   7373 
   7374     movzx      edx, byte ptr [eax + 12]
   7375     movzx      edx, byte ptr [esi + edx]
   7376     mov        byte ptr [edi + 12], dl
   7377     movzx      edx, byte ptr [eax + 13]
   7378     movzx      edx, byte ptr [esi + edx]
   7379     mov        byte ptr [edi + 13], dl
   7380     movzx      edx, byte ptr [eax + 14]
   7381     movzx      edx, byte ptr [esi + edx]
   7382     mov        byte ptr [edi + 14], dl
   7383     movzx      edx, byte ptr [eax + 15]  // copy alpha.
   7384     mov        byte ptr [edi + 15], dl
   7385 
   7386     sub        ecx, 4
   7387     lea        eax, [eax + 16]
   7388     lea        edi, [edi + 16]
   7389     jg         convertloop
   7390 
   7391     pop        edi
   7392     pop        esi
   7393     ret
   7394   }
   7395 }
   7396 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
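
// Scalar sketch of the luma-directed lookup above: a weighted luma is formed
// from the byte coefficients packed in lumacoeff (B, G, R weights in the low
// three bytes; the alpha weight is assumed zero), its high byte selects one
// 256-byte sub-table inside luma, and B, G and R are remapped through that
// table while A is copied. Hypothetical name, illustrative only.
static void ARGBLumaColorTableRow_ReferenceSketch_C(const uint8* src_argb,
                                                    uint8* dst_argb, int width,
                                                    const uint8* luma,
                                                    uint32 lumacoeff) {
  const int bc = (int)(lumacoeff & 0xff);
  const int gc = (int)((lumacoeff >> 8) & 0xff);
  const int rc = (int)((lumacoeff >> 16) & 0xff);
  for (int i = 0; i < width; ++i) {
    int l = src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc;
    const uint8* table = luma + (l & 0xff00);  // high byte picks the sub-table.
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // copy alpha unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}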
   7397 
   7398 #endif  // defined(_M_X64)
   7399 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
   7400 
   7401 #ifdef __cplusplus
   7402 }  // extern "C"
   7403 }  // namespace libyuv
   7404 #endif
   7405