/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422                                      \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
  u_buf += 4;                                           \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
  y_buf += 8;
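
// A scalar sketch of what READYUV422 leaves in the registers (illustrative
// only): the 4 U and 4 V bytes interleave to U0V0 U1V1 U2V2 U3V3, and each
// pair is then doubled so both pixels of a 422 pair share one UV sample:
//   for (int i = 0; i < 8; ++i) {
//     uv[i].u = u_buf[i / 2];
//     uv[i].v = (u_buf + offset)[i / 2];  // offset aliases v_buf
//   }
// Y is unpacked against itself, so each 16 bit lane holds y * 0x0101, ready
// for the pmulhuw gain multiply in YUVTORGB.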

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422                                     \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
  u_buf += 4;                                           \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
  y_buf += 8;                                           \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);              \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants)                                     \
  xmm1 = _mm_loadu_si128(&xmm0);                                   \
  xmm2 = _mm_loadu_si128(&xmm0);                                   \
  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
  xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
  xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
  xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
  xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
  xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
  xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
  xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
  xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);
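
// Per pixel the macro above computes, roughly (a scalar sketch; the gains
// UB..VR and biases BB..BR stand for the kUVTo* and kUVBias* table values,
// and several of the cross terms are zero in practice):
//   y1 = ((y * 0x0101) * YG) >> 16;                 // kYToRgb via pmulhuw
//   b = Clamp((BB - (u * UB + v * VB) + y1) >> 6);
//   g = Clamp((BG - (u * UG + v * VG) + y1) >> 6);
//   r = Clamp((BR - (u * UR + v * VR) + y1) >> 6);
// The subtract-from-bias order matches the _mm_sub_epi16 calls, and the
// saturating adds plus packus provide the clamp.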

// Store 8 ARGB values.
#define STOREARGB                                    \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
  xmm1 = _mm_loadu_si128(&xmm0);                     \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  dst_argb += 32;

#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
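  // v_buf is addressed as u_buf + offset, so READYUV422 advances one pointer.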
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              const uint8* a_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};
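
// These luma weights are 7 bit fixed point BT.601 coefficients in B,G,R,A
// memory order; the Y row functions compute roughly
// ((13 * B + 65 * G + 33 * R) >> 7) and then add 16 (kAddY16).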

// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};

// Constants for BGRA.
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR.
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
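
// In these shuffle tables an index of 128 has its high bit set, which
// pshufb treats as "write a zero byte" rather than as a source index.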

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov        eax, [esp + 4]  // src_y
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
    pslld      xmm5, 24

  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
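
// Scalar equivalent of the expansion above (illustrative):
//   uint32 p = 0xff000000u | (src_y[i] * 0x010101u);  // A = 255, R = G = B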

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov         eax, [esp + 4]  // src_y
    mov         edx, [esp + 8]  // dst_argb
    mov         ecx, [esp + 12]  // width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld      ymm5, ymm5, 24

  convertloop:
    vmovdqu     xmm0, [eax]
    lea         eax,  [eax + 16]
    vpermq      ymm0, ymm0, 0xd8
    vpunpcklbw  ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhwd  ymm1, ymm0, ymm0
    vpunpcklwd  ymm0, ymm0, ymm0
    vpor        ymm0, ymm0, ymm5
    vpor        ymm1, ymm1, ymm5
    vmovdqu     [edx], ymm0
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_rgb24
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm1[8:15] xmm3[0:7] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm0[12:15] xmm1[0:11] }
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm1[8:15] xmm3[0:7] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm0[12:15] xmm1[0:11] }
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
                                           uint8* dst_rgb24,
                                           int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_rgb24
    mov       ecx, [esp + 12]  // width
    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 8]  // overlapped loads; masks index from each base
    movdqu    xmm2, [eax + 16]
    lea       eax, [eax + 24]
    pshufb    xmm0, xmm3
    pshufb    xmm1, xmm4
    pshufb    xmm2, xmm5
    movq      qword ptr [edx], xmm0
    movq      qword ptr [edx + 8], xmm1
    movq      qword ptr [edx + 16], xmm2
    lea       edx, [edx + 24]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_rgb565
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    pand      xmm0, xmm4  // G in middle 6 bits
    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
    por       xmm0, xmm7  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
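
// Scalar sketch of the pmulhuw trick above (illustrative): with a 5 bit
// field moved to the top of a 16 bit lane, the high half of a multiply by
// 0x0108 (256 + 8) is (v << 3) | (v >> 2), i.e. the 5 bits replicated to 8:
//   uint16 b_top = (uint16)(pix << 11);            // B in bits 15..11
//   uint8 b8 = (uint8)((b_top * 0x0108u) >> 16);
// Green multiplies by 0x2080, which folds in the << 5 and the 6 bit repeat.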

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw     ymm4, ymm4, 10
    vpsrlw     ymm4, ymm4, 5
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax, [esp + 4]  // src_rgb565
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    sub        edx, eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2  // RB
    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor       ymm0, ymm0, ymm7  // AG
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax,  [esp + 4]  // src_argb1555
    mov        edx,  [esp + 8]  // dst_argb
    mov        ecx,  [esp + 12]  // width
    sub        edx,  eax
    sub        edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
    vpand      ymm1, ymm1, ymm3
    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2  // RB
    vpsraw     ymm2, ymm0, 8  // A
    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand      ymm2, ymm2, ymm7
    vpor       ymm0, ymm0, ymm2  // AG
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd     xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov       eax,  [esp + 4]  // src_argb4444
    mov       edx,  [esp + 8]  // dst_argb
    mov       ecx,  [esp + 12]  // width
    sub       edx,  eax
    sub       edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
    vpand      ymm2, ymm0, ymm5  // mask high nibbles
    vpand      ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw     ymm3, ymm2, 4
    vpsllw     ymm1, ymm0, 4
    vpor       ymm2, ymm2, ymm3
    vpor       ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_argb1555
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // G in middle 5 bits
    psraw     xmm2, 8  // A
    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]  // src_argb4444
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // mask low nibbles
    pand      xmm2, xmm5  // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

 convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
                                          uint8* dst_rgb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW

 convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
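
// Scalar equivalent of the packing above (illustrative):
//   uint16 rgb565 = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);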

__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
                                                  uint8* dst_rgb,
                                                  const uint32 dither4,
                                                  int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    movd      xmm6, [esp + 12]  // dither4
    mov       ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa    xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    paddusb   xmm0, xmm6  // add dither
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
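
// dither4 holds one dither byte per output column (mod 4); each pixel's four
// channels get its column's byte added with unsigned saturation before the
// 565 truncation above, a simple ordered dither.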

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
                                                  uint8* dst_rgb,
                                                  const uint32 dither4,
                                                  int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov        ecx, [esp + 16]  // width
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq     ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb   ymm0, ymm0, ymm6  // add dither
    vpsrld     ymm2, ymm0, 5  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrld     ymm0, ymm0, 8  // R
    vpand      ymm2, ymm2, ymm4  // G
    vpand      ymm1, ymm1, ymm3  // B
    vpand      ymm0, ymm0, ymm5  // R
    vpor       ymm1, ymm1, ymm2  // BG
    vpor       ymm0, ymm0, ymm1  // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4  // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4  // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
    pslld     xmm7, 15

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    movdqa    xmm3, xmm0  // R
    psrad     xmm0, 16  // A
    psrld     xmm1, 3  // B
    psrld     xmm2, 6  // G
    psrld     xmm3, 9  // R
    pand      xmm0, xmm7  // A
    pand      xmm1, xmm4  // B
    pand      xmm2, xmm5  // G
    pand      xmm3, xmm6  // R
    por       xmm0, xmm1  // BA
    por       xmm2, xmm3  // GR
    por       xmm0, xmm2  // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
    psrlw     xmm3, 8

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3  // low nibble
    pand      xmm1, xmm4  // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrld     ymm0, ymm0, 8  // R
    vpand      ymm2, ymm2, ymm4  // G
    vpand      ymm1, ymm1, ymm3  // B
    vpand      ymm0, ymm0, ymm5  // R
    vpor       ymm1, ymm1, ymm2  // BG
    vpor       ymm0, ymm0, ymm1  // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm4, ymm4, ymm4
    vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
    vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
    vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld     ymm7, ymm7, 15

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld     ymm3, ymm0, 9  // R
    vpsrld     ymm2, ymm0, 6  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrad     ymm0, ymm0, 16  // A
    vpand      ymm3, ymm3, ymm6  // R
    vpand      ymm2, ymm2, ymm5  // G
    vpand      ymm1, ymm1, ymm4  // B
    vpand      ymm0, ymm0, ymm7  // A
    vpor       ymm0, ymm0, ymm1  // BA
    vpor       ymm2, ymm2, ymm3  // GR
    vpor       ymm0, ymm0, ymm2  // BGRA
    vpackssdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw     ymm4, ymm4, 12
    vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpand      ymm1, ymm0, ymm4  // high nibble
    vpand      ymm0, ymm0, ymm3  // low nibble
    vpsrld     ymm1, ymm1, 8
    vpsrld     ymm0, ymm0, 4
    vpor       ymm0, ymm0, ymm1
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
                                        uint8* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
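
// Per pixel the loop above computes, in effect (illustrative):
//   dst_y[i] = (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);  // truncating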

// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
                                         uint8* dst_y,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5  // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
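
// The JPeg variant per pixel (illustrative): full range, rounded, no +16:
//   dst_y[i] = (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);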
   1182 
   1183 #ifdef HAS_ARGBTOYROW_AVX2
   1184 // vpermd for vphaddw + vpackuswb vpermd.
   1185 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
   1186 
   1187 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
   1188 __declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
   1189                                        uint8* dst_y,
   1190                                        int width) {
   1191   __asm {
   1192     mov        eax, [esp + 4] /* src_argb */
   1193     mov        edx, [esp + 8] /* dst_y */
   1194     mov        ecx, [esp + 12] /* width */
   1195     vbroadcastf128 ymm4, xmmword ptr kARGBToY
   1196     vbroadcastf128 ymm5, xmmword ptr kAddY16
   1197     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
   1198 
   1199  convertloop:
   1200     vmovdqu    ymm0, [eax]
   1201     vmovdqu    ymm1, [eax + 32]
   1202     vmovdqu    ymm2, [eax + 64]
   1203     vmovdqu    ymm3, [eax + 96]
   1204     vpmaddubsw ymm0, ymm0, ymm4
   1205     vpmaddubsw ymm1, ymm1, ymm4
   1206     vpmaddubsw ymm2, ymm2, ymm4
   1207     vpmaddubsw ymm3, ymm3, ymm4
   1208     lea        eax, [eax + 128]
   1209     vphaddw    ymm0, ymm0, ymm1  // mutates.
   1210     vphaddw    ymm2, ymm2, ymm3
   1211     vpsrlw     ymm0, ymm0, 7
   1212     vpsrlw     ymm2, ymm2, 7
   1213     vpackuswb  ymm0, ymm0, ymm2  // mutates.
   1214     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
   1215     vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
   1216     vmovdqu    [edx], ymm0
   1217     lea        edx, [edx + 32]
   1218     sub        ecx, 32
   1219     jg         convertloop
   1220     vzeroupper
   1221     ret
   1222   }
   1223 }
   1224 #endif  //  HAS_ARGBTOYROW_AVX2
   1225 
   1226 #ifdef HAS_ARGBTOYJROW_AVX2
   1227 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
   1228 __declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
   1229                                         uint8* dst_y,
   1230                                         int width) {
   1231   __asm {
   1232     mov        eax, [esp + 4] /* src_argb */
   1233     mov        edx, [esp + 8] /* dst_y */
   1234     mov        ecx, [esp + 12] /* width */
   1235     vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
   1236     vbroadcastf128 ymm5, xmmword ptr kAddYJ64
   1237     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
   1238 
   1239  convertloop:
   1240     vmovdqu    ymm0, [eax]
   1241     vmovdqu    ymm1, [eax + 32]
   1242     vmovdqu    ymm2, [eax + 64]
   1243     vmovdqu    ymm3, [eax + 96]
   1244     vpmaddubsw ymm0, ymm0, ymm4
   1245     vpmaddubsw ymm1, ymm1, ymm4
   1246     vpmaddubsw ymm2, ymm2, ymm4
   1247     vpmaddubsw ymm3, ymm3, ymm4
   1248     lea        eax, [eax + 128]
   1249     vphaddw    ymm0, ymm0, ymm1  // mutates.
   1250     vphaddw    ymm2, ymm2, ymm3
   1251     vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
   1252     vpaddw     ymm2, ymm2, ymm5
   1253     vpsrlw     ymm0, ymm0, 7
   1254     vpsrlw     ymm2, ymm2, 7
   1255     vpackuswb  ymm0, ymm0, ymm2  // mutates.
   1256     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
   1257     vmovdqu    [edx], ymm0
   1258     lea        edx, [edx + 32]
   1259     sub        ecx, 32
   1260     jg         convertloop
   1261 
   1262     vzeroupper
   1263     ret
   1264   }
   1265 }
   1266 #endif  //  HAS_ARGBTOYJROW_AVX2
   1267 
   1268 __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
   1269                                         uint8* dst_y,
   1270                                         int width) {
   1271   __asm {
   1272     mov        eax, [esp + 4] /* src_argb */
   1273     mov        edx, [esp + 8] /* dst_y */
   1274     mov        ecx, [esp + 12] /* width */
   1275     movdqa     xmm4, xmmword ptr kBGRAToY
   1276     movdqa     xmm5, xmmword ptr kAddY16
   1277 
   1278  convertloop:
   1279     movdqu     xmm0, [eax]
   1280     movdqu     xmm1, [eax + 16]
   1281     movdqu     xmm2, [eax + 32]
   1282     movdqu     xmm3, [eax + 48]
   1283     pmaddubsw  xmm0, xmm4
   1284     pmaddubsw  xmm1, xmm4
   1285     pmaddubsw  xmm2, xmm4
   1286     pmaddubsw  xmm3, xmm4
   1287     lea        eax, [eax + 64]
   1288     phaddw     xmm0, xmm1
   1289     phaddw     xmm2, xmm3
   1290     psrlw      xmm0, 7
   1291     psrlw      xmm2, 7
   1292     packuswb   xmm0, xmm2
   1293     paddb      xmm0, xmm5
   1294     movdqu     [edx], xmm0
   1295     lea        edx, [edx + 16]
   1296     sub        ecx, 16
   1297     jg         convertloop
   1298     ret
   1299   }
   1300 }
   1301 
   1302 __declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
   1303                                         uint8* dst_y,
   1304                                         int width) {
   1305   __asm {
   1306     mov        eax, [esp + 4] /* src_argb */
   1307     mov        edx, [esp + 8] /* dst_y */
   1308     mov        ecx, [esp + 12] /* width */
   1309     movdqa     xmm4, xmmword ptr kABGRToY
   1310     movdqa     xmm5, xmmword ptr kAddY16
   1311 
   1312  convertloop:
   1313     movdqu     xmm0, [eax]
   1314     movdqu     xmm1, [eax + 16]
   1315     movdqu     xmm2, [eax + 32]
   1316     movdqu     xmm3, [eax + 48]
   1317     pmaddubsw  xmm0, xmm4
   1318     pmaddubsw  xmm1, xmm4
   1319     pmaddubsw  xmm2, xmm4
   1320     pmaddubsw  xmm3, xmm4
   1321     lea        eax, [eax + 64]
   1322     phaddw     xmm0, xmm1
   1323     phaddw     xmm2, xmm3
   1324     psrlw      xmm0, 7
   1325     psrlw      xmm2, 7
   1326     packuswb   xmm0, xmm2
   1327     paddb      xmm0, xmm5
   1328     movdqu     [edx], xmm0
   1329     lea        edx, [edx + 16]
   1330     sub        ecx, 16
   1331     jg         convertloop
   1332     ret
   1333   }
   1334 }
   1335 
   1336 __declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
   1337                                         uint8* dst_y,
   1338                                         int width) {
   1339   __asm {
   1340     mov        eax, [esp + 4] /* src_argb */
   1341     mov        edx, [esp + 8] /* dst_y */
   1342     mov        ecx, [esp + 12] /* width */
   1343     movdqa     xmm4, xmmword ptr kRGBAToY
   1344     movdqa     xmm5, xmmword ptr kAddY16
   1345 
   1346  convertloop:
   1347     movdqu     xmm0, [eax]
   1348     movdqu     xmm1, [eax + 16]
   1349     movdqu     xmm2, [eax + 32]
   1350     movdqu     xmm3, [eax + 48]
   1351     pmaddubsw  xmm0, xmm4
   1352     pmaddubsw  xmm1, xmm4
   1353     pmaddubsw  xmm2, xmm4
   1354     pmaddubsw  xmm3, xmm4
   1355     lea        eax, [eax + 64]
   1356     phaddw     xmm0, xmm1
   1357     phaddw     xmm2, xmm3
   1358     psrlw      xmm0, 7
   1359     psrlw      xmm2, 7
   1360     packuswb   xmm0, xmm2
   1361     paddb      xmm0, xmm5
   1362     movdqu     [edx], xmm0
   1363     lea        edx, [edx + 16]
   1364     sub        ecx, 16
   1365     jg         convertloop
   1366     ret
   1367   }
   1368 }
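// Scalar sketch of the studio-range luma computed by the three row functions
// above, written for libyuv ARGB byte order (B, G, R, A in memory); the
// BGRA/ABGR/RGBA variants only permute the coefficient vector to match their
// layouts.  The 13/65/33 coefficients and +16 bias are assumed from libyuv's
// usual BT.601 tables, not defined in this excerpt.  Disabled; illustration
// only.
#if 0
static void ARGBToYRow_C_Sketch(const uint8* src_argb, uint8* dst_y,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint8 b = src_argb[0];
    const uint8 g = src_argb[1];
    const uint8 r = src_argb[2];
    // pmaddubsw + phaddw form 13*B + 65*G + 33*R; psrlw shifts by 7 and
    // paddb adds the 16 offset (no rounding term in this path).
    dst_y[x] = (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    src_argb += 4;
  }
}
#endif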
   1369 
   1370 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
   1371                                          int src_stride_argb,
   1372                                          uint8* dst_u,
   1373                                          uint8* dst_v,
   1374                                          int width) {
   1375   __asm {
   1376     push       esi
   1377     push       edi
   1378     mov        eax, [esp + 8 + 4]  // src_argb
   1379     mov        esi, [esp + 8 + 8]  // src_stride_argb
   1380     mov        edx, [esp + 8 + 12]  // dst_u
   1381     mov        edi, [esp + 8 + 16]  // dst_v
   1382     mov        ecx, [esp + 8 + 20]  // width
   1383     movdqa     xmm5, xmmword ptr kAddUV128
   1384     movdqa     xmm6, xmmword ptr kARGBToV
   1385     movdqa     xmm7, xmmword ptr kARGBToU
   1386     sub        edi, edx  // stride from u to v
   1387 
   1388  convertloop:
   1389          /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1390     movdqu     xmm0, [eax]
   1391     movdqu     xmm4, [eax + esi]
   1392     pavgb      xmm0, xmm4
   1393     movdqu     xmm1, [eax + 16]
   1394     movdqu     xmm4, [eax + esi + 16]
   1395     pavgb      xmm1, xmm4
   1396     movdqu     xmm2, [eax + 32]
   1397     movdqu     xmm4, [eax + esi + 32]
   1398     pavgb      xmm2, xmm4
   1399     movdqu     xmm3, [eax + 48]
   1400     movdqu     xmm4, [eax + esi + 48]
   1401     pavgb      xmm3, xmm4
   1402 
   1403     lea        eax,  [eax + 64]
   1404     movdqa     xmm4, xmm0
   1405     shufps     xmm0, xmm1, 0x88
   1406     shufps     xmm4, xmm1, 0xdd
   1407     pavgb      xmm0, xmm4
   1408     movdqa     xmm4, xmm2
   1409     shufps     xmm2, xmm3, 0x88
   1410     shufps     xmm4, xmm3, 0xdd
   1411     pavgb      xmm2, xmm4
   1412 
   1413     // step 2 - convert to U and V
   1414     // from here down is very similar to Y code except
    1415     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1416     movdqa     xmm1, xmm0
   1417     movdqa     xmm3, xmm2
   1418     pmaddubsw  xmm0, xmm7  // U
   1419     pmaddubsw  xmm2, xmm7
   1420     pmaddubsw  xmm1, xmm6  // V
   1421     pmaddubsw  xmm3, xmm6
   1422     phaddw     xmm0, xmm2
   1423     phaddw     xmm1, xmm3
   1424     psraw      xmm0, 8
   1425     psraw      xmm1, 8
   1426     packsswb   xmm0, xmm1
   1427     paddb      xmm0, xmm5  // -> unsigned
   1428 
   1429     // step 3 - store 8 U and 8 V values
   1430     movlps     qword ptr [edx], xmm0  // U
   1431     movhps     qword ptr [edx + edi], xmm0  // V
   1432     lea        edx, [edx + 8]
   1433     sub        ecx, 16
   1434     jg         convertloop
   1435 
   1436     pop        edi
   1437     pop        esi
   1438     ret
   1439   }
   1440 }
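// The UV loop above averages a 2x2 block (pavgb vertically, then
// shufps + pavgb horizontally; pavgb rounds each average), applies the U and
// V coefficient vectors with pmaddubsw/phaddw, arithmetic-shifts by 8 (psraw)
// and recenters on 128.  Scalar sketch -- the 112/-74/-38 (U) and -18/-94/112
// (V) coefficients are assumed from libyuv's usual BT.601 tables, not defined
// in this excerpt.  Disabled; illustration only.
#if 0
static void ARGBToUVRow_C_Sketch(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;
  int x;
  for (x = 0; x < width; x += 2) {
    // Plain mean of the 2x2 block; the SIMD path uses rounding averages.
    const int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
    const int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
    const int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
    *dst_u++ = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    src_argb0 += 8;
    src_argb1 += 8;
  }
}
#endif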
   1441 
   1442 __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
   1443                                           int src_stride_argb,
   1444                                           uint8* dst_u,
   1445                                           uint8* dst_v,
   1446                                           int width) {
   1447   __asm {
   1448     push       esi
   1449     push       edi
   1450     mov        eax, [esp + 8 + 4]  // src_argb
   1451     mov        esi, [esp + 8 + 8]  // src_stride_argb
   1452     mov        edx, [esp + 8 + 12]  // dst_u
   1453     mov        edi, [esp + 8 + 16]  // dst_v
   1454     mov        ecx, [esp + 8 + 20]  // width
   1455     movdqa     xmm5, xmmword ptr kAddUVJ128
   1456     movdqa     xmm6, xmmword ptr kARGBToVJ
   1457     movdqa     xmm7, xmmword ptr kARGBToUJ
   1458     sub        edi, edx  // stride from u to v
   1459 
   1460  convertloop:
   1461          /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1462     movdqu     xmm0, [eax]
   1463     movdqu     xmm4, [eax + esi]
   1464     pavgb      xmm0, xmm4
   1465     movdqu     xmm1, [eax + 16]
   1466     movdqu     xmm4, [eax + esi + 16]
   1467     pavgb      xmm1, xmm4
   1468     movdqu     xmm2, [eax + 32]
   1469     movdqu     xmm4, [eax + esi + 32]
   1470     pavgb      xmm2, xmm4
   1471     movdqu     xmm3, [eax + 48]
   1472     movdqu     xmm4, [eax + esi + 48]
   1473     pavgb      xmm3, xmm4
   1474 
   1475     lea        eax,  [eax + 64]
   1476     movdqa     xmm4, xmm0
   1477     shufps     xmm0, xmm1, 0x88
   1478     shufps     xmm4, xmm1, 0xdd
   1479     pavgb      xmm0, xmm4
   1480     movdqa     xmm4, xmm2
   1481     shufps     xmm2, xmm3, 0x88
   1482     shufps     xmm4, xmm3, 0xdd
   1483     pavgb      xmm2, xmm4
   1484 
   1485     // step 2 - convert to U and V
   1486     // from here down is very similar to Y code except
    1487     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1488     movdqa     xmm1, xmm0
   1489     movdqa     xmm3, xmm2
   1490     pmaddubsw  xmm0, xmm7  // U
   1491     pmaddubsw  xmm2, xmm7
   1492     pmaddubsw  xmm1, xmm6  // V
   1493     pmaddubsw  xmm3, xmm6
   1494     phaddw     xmm0, xmm2
   1495     phaddw     xmm1, xmm3
   1496     paddw      xmm0, xmm5  // +.5 rounding -> unsigned
   1497     paddw      xmm1, xmm5
   1498     psraw      xmm0, 8
   1499     psraw      xmm1, 8
   1500     packsswb   xmm0, xmm1
   1501 
   1502     // step 3 - store 8 U and 8 V values
   1503     movlps     qword ptr [edx], xmm0  // U
   1504     movhps     qword ptr [edx + edi], xmm0  // V
   1505     lea        edx, [edx + 8]
   1506     sub        ecx, 16
   1507     jg         convertloop
   1508 
   1509     pop        edi
   1510     pop        esi
   1511     ret
   1512   }
   1513 }
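// The J variant above differs from ARGBToUVRow_SSSE3 in two ways: it uses
// full-range coefficient tables, and it adds the 0x8080 bias *before* the
// arithmetic shift, so the +128 recentering also rounds the quotient.
// Scalar sketch; the 127/-84/-43 (U) and -20/-107/127 (V) coefficients are
// assumed from libyuv's usual full-range tables, not defined in this
// excerpt.  Disabled; illustration only.
#if 0
static __inline uint8 RGBToUJ_Sketch(int b, int g, int r) {
  return (uint8)((127 * b - 84 * g - 43 * r + 0x8080) >> 8);
}
static __inline uint8 RGBToVJ_Sketch(int b, int g, int r) {
  return (uint8)((-20 * b - 107 * g + 127 * r + 0x8080) >> 8);
}
#endif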
   1514 
   1515 #ifdef HAS_ARGBTOUVROW_AVX2
   1516 __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
   1517                                         int src_stride_argb,
   1518                                         uint8* dst_u,
   1519                                         uint8* dst_v,
   1520                                         int width) {
   1521   __asm {
   1522     push       esi
   1523     push       edi
   1524     mov        eax, [esp + 8 + 4]  // src_argb
   1525     mov        esi, [esp + 8 + 8]  // src_stride_argb
   1526     mov        edx, [esp + 8 + 12]  // dst_u
   1527     mov        edi, [esp + 8 + 16]  // dst_v
   1528     mov        ecx, [esp + 8 + 20]  // width
   1529     vbroadcastf128 ymm5, xmmword ptr kAddUV128
   1530     vbroadcastf128 ymm6, xmmword ptr kARGBToV
   1531     vbroadcastf128 ymm7, xmmword ptr kARGBToU
   1532     sub        edi, edx   // stride from u to v
   1533 
   1534  convertloop:
   1535         /* step 1 - subsample 32x2 argb pixels to 16x1 */
   1536     vmovdqu    ymm0, [eax]
   1537     vmovdqu    ymm1, [eax + 32]
   1538     vmovdqu    ymm2, [eax + 64]
   1539     vmovdqu    ymm3, [eax + 96]
   1540     vpavgb     ymm0, ymm0, [eax + esi]
   1541     vpavgb     ymm1, ymm1, [eax + esi + 32]
   1542     vpavgb     ymm2, ymm2, [eax + esi + 64]
   1543     vpavgb     ymm3, ymm3, [eax + esi + 96]
   1544     lea        eax,  [eax + 128]
   1545     vshufps    ymm4, ymm0, ymm1, 0x88
   1546     vshufps    ymm0, ymm0, ymm1, 0xdd
   1547     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
   1548     vshufps    ymm4, ymm2, ymm3, 0x88
   1549     vshufps    ymm2, ymm2, ymm3, 0xdd
   1550     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
   1551 
   1552     // step 2 - convert to U and V
   1553     // from here down is very similar to Y code except
    1554     // instead of 32 different pixels, it's 16 pixels of U and 16 of V
   1555     vpmaddubsw ymm1, ymm0, ymm7  // U
   1556     vpmaddubsw ymm3, ymm2, ymm7
   1557     vpmaddubsw ymm0, ymm0, ymm6  // V
   1558     vpmaddubsw ymm2, ymm2, ymm6
   1559     vphaddw    ymm1, ymm1, ymm3  // mutates
   1560     vphaddw    ymm0, ymm0, ymm2
   1561     vpsraw     ymm1, ymm1, 8
   1562     vpsraw     ymm0, ymm0, 8
   1563     vpacksswb  ymm0, ymm1, ymm0  // mutates
   1564     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
   1565     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
   1566     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
   1567 
   1568     // step 3 - store 16 U and 16 V values
   1569     vextractf128 [edx], ymm0, 0  // U
   1570     vextractf128 [edx + edi], ymm0, 1  // V
   1571     lea        edx, [edx + 16]
   1572     sub        ecx, 32
   1573     jg         convertloop
   1574 
   1575     pop        edi
   1576     pop        esi
   1577     vzeroupper
   1578     ret
   1579   }
   1580 }
   1581 #endif  // HAS_ARGBTOUVROW_AVX2
   1582 
   1583 #ifdef HAS_ARGBTOUVJROW_AVX2
   1584 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
   1585                                          int src_stride_argb,
   1586                                          uint8* dst_u,
   1587                                          uint8* dst_v,
   1588                                          int width) {
   1589   __asm {
   1590     push       esi
   1591     push       edi
   1592     mov        eax, [esp + 8 + 4]  // src_argb
   1593     mov        esi, [esp + 8 + 8]  // src_stride_argb
   1594     mov        edx, [esp + 8 + 12]  // dst_u
   1595     mov        edi, [esp + 8 + 16]  // dst_v
   1596     mov        ecx, [esp + 8 + 20]  // width
    1597     vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
    1598     vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
    1599     vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
   1600     sub        edi, edx   // stride from u to v
   1601 
   1602  convertloop:
   1603         /* step 1 - subsample 32x2 argb pixels to 16x1 */
   1604     vmovdqu    ymm0, [eax]
   1605     vmovdqu    ymm1, [eax + 32]
   1606     vmovdqu    ymm2, [eax + 64]
   1607     vmovdqu    ymm3, [eax + 96]
   1608     vpavgb     ymm0, ymm0, [eax + esi]
   1609     vpavgb     ymm1, ymm1, [eax + esi + 32]
   1610     vpavgb     ymm2, ymm2, [eax + esi + 64]
   1611     vpavgb     ymm3, ymm3, [eax + esi + 96]
   1612     lea        eax,  [eax + 128]
   1613     vshufps    ymm4, ymm0, ymm1, 0x88
   1614     vshufps    ymm0, ymm0, ymm1, 0xdd
   1615     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
   1616     vshufps    ymm4, ymm2, ymm3, 0x88
   1617     vshufps    ymm2, ymm2, ymm3, 0xdd
   1618     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
   1619 
   1620     // step 2 - convert to U and V
   1621     // from here down is very similar to Y code except
    1622     // instead of 32 different pixels, it's 16 pixels of U and 16 of V
   1623     vpmaddubsw ymm1, ymm0, ymm7  // U
   1624     vpmaddubsw ymm3, ymm2, ymm7
   1625     vpmaddubsw ymm0, ymm0, ymm6  // V
   1626     vpmaddubsw ymm2, ymm2, ymm6
   1627     vphaddw    ymm1, ymm1, ymm3  // mutates
   1628     vphaddw    ymm0, ymm0, ymm2
   1629     vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
   1630     vpaddw     ymm0, ymm0, ymm5
   1631     vpsraw     ymm1, ymm1, 8
   1632     vpsraw     ymm0, ymm0, 8
   1633     vpacksswb  ymm0, ymm1, ymm0  // mutates
   1634     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
   1635     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
   1636 
   1637     // step 3 - store 16 U and 16 V values
   1638     vextractf128 [edx], ymm0, 0  // U
   1639     vextractf128 [edx + edi], ymm0, 1  // V
   1640     lea        edx, [edx + 16]
   1641     sub        ecx, 32
   1642     jg         convertloop
   1643 
   1644     pop        edi
   1645     pop        esi
   1646     vzeroupper
   1647     ret
   1648   }
   1649 }
   1650 #endif  // HAS_ARGBTOUVJROW_AVX2
   1651 
   1652 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
   1653                                             uint8* dst_u,
   1654                                             uint8* dst_v,
   1655                                             int width) {
   1656   __asm {
   1657     push       edi
   1658     mov        eax, [esp + 4 + 4]  // src_argb
   1659     mov        edx, [esp + 4 + 8]  // dst_u
   1660     mov        edi, [esp + 4 + 12]  // dst_v
   1661     mov        ecx, [esp + 4 + 16]  // width
   1662     movdqa     xmm5, xmmword ptr kAddUV128
   1663     movdqa     xmm6, xmmword ptr kARGBToV
   1664     movdqa     xmm7, xmmword ptr kARGBToU
   1665     sub        edi, edx    // stride from u to v
   1666 
   1667  convertloop:
   1668         /* convert to U and V */
   1669     movdqu     xmm0, [eax]  // U
   1670     movdqu     xmm1, [eax + 16]
   1671     movdqu     xmm2, [eax + 32]
   1672     movdqu     xmm3, [eax + 48]
   1673     pmaddubsw  xmm0, xmm7
   1674     pmaddubsw  xmm1, xmm7
   1675     pmaddubsw  xmm2, xmm7
   1676     pmaddubsw  xmm3, xmm7
   1677     phaddw     xmm0, xmm1
   1678     phaddw     xmm2, xmm3
   1679     psraw      xmm0, 8
   1680     psraw      xmm2, 8
   1681     packsswb   xmm0, xmm2
   1682     paddb      xmm0, xmm5
   1683     movdqu     [edx], xmm0
   1684 
   1685     movdqu     xmm0, [eax]  // V
   1686     movdqu     xmm1, [eax + 16]
   1687     movdqu     xmm2, [eax + 32]
   1688     movdqu     xmm3, [eax + 48]
   1689     pmaddubsw  xmm0, xmm6
   1690     pmaddubsw  xmm1, xmm6
   1691     pmaddubsw  xmm2, xmm6
   1692     pmaddubsw  xmm3, xmm6
   1693     phaddw     xmm0, xmm1
   1694     phaddw     xmm2, xmm3
   1695     psraw      xmm0, 8
   1696     psraw      xmm2, 8
   1697     packsswb   xmm0, xmm2
   1698     paddb      xmm0, xmm5
   1699     lea        eax,  [eax + 64]
   1700     movdqu     [edx + edi], xmm0
   1701     lea        edx,  [edx + 16]
   1702     sub        ecx,  16
   1703     jg         convertloop
   1704 
   1705     pop        edi
   1706     ret
   1707   }
   1708 }
   1709 
   1710 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
   1711                                          int src_stride_argb,
   1712                                          uint8* dst_u,
   1713                                          uint8* dst_v,
   1714                                          int width) {
   1715   __asm {
   1716     push       esi
   1717     push       edi
   1718     mov        eax, [esp + 8 + 4]  // src_argb
   1719     mov        esi, [esp + 8 + 8]  // src_stride_argb
   1720     mov        edx, [esp + 8 + 12]  // dst_u
   1721     mov        edi, [esp + 8 + 16]  // dst_v
   1722     mov        ecx, [esp + 8 + 20]  // width
   1723     movdqa     xmm5, xmmword ptr kAddUV128
   1724     movdqa     xmm6, xmmword ptr kBGRAToV
   1725     movdqa     xmm7, xmmword ptr kBGRAToU
   1726     sub        edi, edx  // stride from u to v
   1727 
   1728  convertloop:
   1729          /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1730     movdqu     xmm0, [eax]
   1731     movdqu     xmm4, [eax + esi]
   1732     pavgb      xmm0, xmm4
   1733     movdqu     xmm1, [eax + 16]
   1734     movdqu     xmm4, [eax + esi + 16]
   1735     pavgb      xmm1, xmm4
   1736     movdqu     xmm2, [eax + 32]
   1737     movdqu     xmm4, [eax + esi + 32]
   1738     pavgb      xmm2, xmm4
   1739     movdqu     xmm3, [eax + 48]
   1740     movdqu     xmm4, [eax + esi + 48]
   1741     pavgb      xmm3, xmm4
   1742 
   1743     lea        eax,  [eax + 64]
   1744     movdqa     xmm4, xmm0
   1745     shufps     xmm0, xmm1, 0x88
   1746     shufps     xmm4, xmm1, 0xdd
   1747     pavgb      xmm0, xmm4
   1748     movdqa     xmm4, xmm2
   1749     shufps     xmm2, xmm3, 0x88
   1750     shufps     xmm4, xmm3, 0xdd
   1751     pavgb      xmm2, xmm4
   1752 
   1753     // step 2 - convert to U and V
   1754     // from here down is very similar to Y code except
    1755     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1756     movdqa     xmm1, xmm0
   1757     movdqa     xmm3, xmm2
   1758     pmaddubsw  xmm0, xmm7  // U
   1759     pmaddubsw  xmm2, xmm7
   1760     pmaddubsw  xmm1, xmm6  // V
   1761     pmaddubsw  xmm3, xmm6
   1762     phaddw     xmm0, xmm2
   1763     phaddw     xmm1, xmm3
   1764     psraw      xmm0, 8
   1765     psraw      xmm1, 8
   1766     packsswb   xmm0, xmm1
   1767     paddb      xmm0, xmm5  // -> unsigned
   1768 
   1769     // step 3 - store 8 U and 8 V values
   1770     movlps     qword ptr [edx], xmm0  // U
   1771     movhps     qword ptr [edx + edi], xmm0  // V
   1772     lea        edx, [edx + 8]
   1773     sub        ecx, 16
   1774     jg         convertloop
   1775 
   1776     pop        edi
   1777     pop        esi
   1778     ret
   1779   }
   1780 }
   1781 
   1782 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
   1783                                          int src_stride_argb,
   1784                                          uint8* dst_u,
   1785                                          uint8* dst_v,
   1786                                          int width) {
   1787   __asm {
   1788     push       esi
   1789     push       edi
   1790     mov        eax, [esp + 8 + 4]  // src_argb
   1791     mov        esi, [esp + 8 + 8]  // src_stride_argb
   1792     mov        edx, [esp + 8 + 12]  // dst_u
   1793     mov        edi, [esp + 8 + 16]  // dst_v
   1794     mov        ecx, [esp + 8 + 20]  // width
   1795     movdqa     xmm5, xmmword ptr kAddUV128
   1796     movdqa     xmm6, xmmword ptr kABGRToV
   1797     movdqa     xmm7, xmmword ptr kABGRToU
   1798     sub        edi, edx  // stride from u to v
   1799 
   1800  convertloop:
   1801          /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1802     movdqu     xmm0, [eax]
   1803     movdqu     xmm4, [eax + esi]
   1804     pavgb      xmm0, xmm4
   1805     movdqu     xmm1, [eax + 16]
   1806     movdqu     xmm4, [eax + esi + 16]
   1807     pavgb      xmm1, xmm4
   1808     movdqu     xmm2, [eax + 32]
   1809     movdqu     xmm4, [eax + esi + 32]
   1810     pavgb      xmm2, xmm4
   1811     movdqu     xmm3, [eax + 48]
   1812     movdqu     xmm4, [eax + esi + 48]
   1813     pavgb      xmm3, xmm4
   1814 
   1815     lea        eax,  [eax + 64]
   1816     movdqa     xmm4, xmm0
   1817     shufps     xmm0, xmm1, 0x88
   1818     shufps     xmm4, xmm1, 0xdd
   1819     pavgb      xmm0, xmm4
   1820     movdqa     xmm4, xmm2
   1821     shufps     xmm2, xmm3, 0x88
   1822     shufps     xmm4, xmm3, 0xdd
   1823     pavgb      xmm2, xmm4
   1824 
   1825     // step 2 - convert to U and V
   1826     // from here down is very similar to Y code except
    1827     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1828     movdqa     xmm1, xmm0
   1829     movdqa     xmm3, xmm2
   1830     pmaddubsw  xmm0, xmm7  // U
   1831     pmaddubsw  xmm2, xmm7
   1832     pmaddubsw  xmm1, xmm6  // V
   1833     pmaddubsw  xmm3, xmm6
   1834     phaddw     xmm0, xmm2
   1835     phaddw     xmm1, xmm3
   1836     psraw      xmm0, 8
   1837     psraw      xmm1, 8
   1838     packsswb   xmm0, xmm1
   1839     paddb      xmm0, xmm5  // -> unsigned
   1840 
   1841     // step 3 - store 8 U and 8 V values
   1842     movlps     qword ptr [edx], xmm0  // U
   1843     movhps     qword ptr [edx + edi], xmm0  // V
   1844     lea        edx, [edx + 8]
   1845     sub        ecx, 16
   1846     jg         convertloop
   1847 
   1848     pop        edi
   1849     pop        esi
   1850     ret
   1851   }
   1852 }
   1853 
   1854 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
   1855                                          int src_stride_argb,
   1856                                          uint8* dst_u,
   1857                                          uint8* dst_v,
   1858                                          int width) {
   1859   __asm {
   1860     push       esi
   1861     push       edi
   1862     mov        eax, [esp + 8 + 4]  // src_argb
   1863     mov        esi, [esp + 8 + 8]  // src_stride_argb
   1864     mov        edx, [esp + 8 + 12]  // dst_u
   1865     mov        edi, [esp + 8 + 16]  // dst_v
   1866     mov        ecx, [esp + 8 + 20]  // width
   1867     movdqa     xmm5, xmmword ptr kAddUV128
   1868     movdqa     xmm6, xmmword ptr kRGBAToV
   1869     movdqa     xmm7, xmmword ptr kRGBAToU
   1870     sub        edi, edx  // stride from u to v
   1871 
   1872  convertloop:
   1873          /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1874     movdqu     xmm0, [eax]
   1875     movdqu     xmm4, [eax + esi]
   1876     pavgb      xmm0, xmm4
   1877     movdqu     xmm1, [eax + 16]
   1878     movdqu     xmm4, [eax + esi + 16]
   1879     pavgb      xmm1, xmm4
   1880     movdqu     xmm2, [eax + 32]
   1881     movdqu     xmm4, [eax + esi + 32]
   1882     pavgb      xmm2, xmm4
   1883     movdqu     xmm3, [eax + 48]
   1884     movdqu     xmm4, [eax + esi + 48]
   1885     pavgb      xmm3, xmm4
   1886 
   1887     lea        eax,  [eax + 64]
   1888     movdqa     xmm4, xmm0
   1889     shufps     xmm0, xmm1, 0x88
   1890     shufps     xmm4, xmm1, 0xdd
   1891     pavgb      xmm0, xmm4
   1892     movdqa     xmm4, xmm2
   1893     shufps     xmm2, xmm3, 0x88
   1894     shufps     xmm4, xmm3, 0xdd
   1895     pavgb      xmm2, xmm4
   1896 
   1897     // step 2 - convert to U and V
   1898     // from here down is very similar to Y code except
    1899     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1900     movdqa     xmm1, xmm0
   1901     movdqa     xmm3, xmm2
   1902     pmaddubsw  xmm0, xmm7  // U
   1903     pmaddubsw  xmm2, xmm7
   1904     pmaddubsw  xmm1, xmm6  // V
   1905     pmaddubsw  xmm3, xmm6
   1906     phaddw     xmm0, xmm2
   1907     phaddw     xmm1, xmm3
   1908     psraw      xmm0, 8
   1909     psraw      xmm1, 8
   1910     packsswb   xmm0, xmm1
   1911     paddb      xmm0, xmm5  // -> unsigned
   1912 
   1913     // step 3 - store 8 U and 8 V values
   1914     movlps     qword ptr [edx], xmm0  // U
   1915     movhps     qword ptr [edx + edi], xmm0  // V
   1916     lea        edx, [edx + 8]
   1917     sub        ecx, 16
   1918     jg         convertloop
   1919 
   1920     pop        edi
   1921     pop        esi
   1922     ret
   1923   }
   1924 }
   1925 #endif  // HAS_ARGBTOYROW_SSSE3
   1926 
   1927 // Read 16 UV from 444
   1928 #define READYUV444_AVX2 \
   1929   __asm {                                                \
   1930     __asm vmovdqu    xmm0, [esi] /* U */                      \
   1931     __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
   1932     __asm lea        esi,  [esi + 16]                                          \
   1933     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1934     __asm vpermq     ymm1, ymm1, 0xd8                                          \
   1935     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
   1936     __asm vmovdqu    xmm4, [eax] /* Y */                      \
   1937     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1938     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1939     __asm lea        eax, [eax + 16]}
   1940 
   1941 // Read 8 UV from 422, upsample to 16 UV.
   1942 #define READYUV422_AVX2 \
   1943   __asm {                                                \
   1944     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
   1945     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
   1946     __asm lea        esi,  [esi + 8]                                           \
   1947     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
   1948     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1949     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
   1950     __asm vmovdqu    xmm4, [eax] /* Y */                      \
   1951     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1952     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1953     __asm lea        eax, [eax + 16]}
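// The 422 readers above duplicate each UV pair across the two pixels it
// covers (nearest-neighbor chroma upsampling): punpcklbw interleaves U with
// V, then punpcklwd doubles each 16-bit UV pair.  Scalar sketch of the
// resulting layout (hypothetical helper; illustration only, disabled):
#if 0
static void UpsampleUV422_Sketch(const uint8* src_u, const uint8* src_v,
                                 uint8* dst_uvuv, int pairs) {
  int i;
  for (i = 0; i < pairs; ++i) {
    dst_uvuv[i * 4 + 0] = dst_uvuv[i * 4 + 2] = src_u[i];  // U, both pixels.
    dst_uvuv[i * 4 + 1] = dst_uvuv[i * 4 + 3] = src_v[i];  // V, both pixels.
  }
}
#endif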
   1954 
   1955 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
   1956 #define READYUVA422_AVX2 \
   1957   __asm {                                               \
   1958     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
   1959     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
   1960     __asm lea        esi,  [esi + 8]                                           \
   1961     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
   1962     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1963     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
   1964     __asm vmovdqu    xmm4, [eax] /* Y */                      \
   1965     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1966     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1967     __asm lea        eax, [eax + 16]                                           \
   1968     __asm vmovdqu    xmm5, [ebp] /* A */                      \
   1969     __asm vpermq     ymm5, ymm5, 0xd8                                          \
   1970     __asm lea        ebp, [ebp + 16]}
   1971 
   1972 // Read 8 UV from NV12, upsample to 16 UV.
   1973 #define READNV12_AVX2 \
   1974   __asm {                                                  \
   1975     __asm vmovdqu    xmm0, [esi] /* UV */                     \
   1976     __asm lea        esi,  [esi + 16]                                          \
   1977     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1978     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
   1979     __asm vmovdqu    xmm4, [eax] /* Y */                      \
   1980     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1981     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1982     __asm lea        eax, [eax + 16]}
   1983 
   1984 // Read 8 UV from NV21, upsample to 16 UV.
   1985 #define READNV21_AVX2 \
   1986   __asm {                                                  \
   1987     __asm vmovdqu    xmm0, [esi] /* UV */                     \
   1988     __asm lea        esi,  [esi + 16]                                          \
   1989     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1990     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
   1991     __asm vmovdqu    xmm4, [eax] /* Y */                      \
   1992     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1993     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1994     __asm lea        eax, [eax + 16]}
   1995 
   1996 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
   1997 #define READYUY2_AVX2 \
   1998   __asm {                                                  \
   1999     __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
   2000     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
   2001     __asm vmovdqu    ymm0, [eax] /* UV */                             \
   2002     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
   2003     __asm lea        eax, [eax + 32]}
   2004 
   2005 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
   2006 #define READUYVY_AVX2 \
   2007   __asm {                                                  \
   2008     __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
   2009     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
   2010     __asm vmovdqu    ymm0, [eax] /* UV */                             \
   2011     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
   2012     __asm lea        eax, [eax + 32]}
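// The YUY2/UYVY readers above use pshufb to build, in one pass, the same
// register layouts the planar readers construct: each Y duplicated into both
// bytes of its 16-bit lane, and each UV pair repeated for both pixels it
// covers.  Scalar sketch of the YUY2 case (Y0 U0 Y1 V0 per pair;
// hypothetical helper, illustration only, disabled):
#if 0
static void ExpandYUY2_Sketch(const uint8* src_yuy2, uint8* dst_yy,
                              uint8* dst_uvuv, int pairs) {
  int i;
  for (i = 0; i < pairs; ++i) {
    dst_yy[i * 4 + 0] = dst_yy[i * 4 + 1] = src_yuy2[i * 4 + 0];      // Y0 Y0
    dst_yy[i * 4 + 2] = dst_yy[i * 4 + 3] = src_yuy2[i * 4 + 2];      // Y1 Y1
    dst_uvuv[i * 4 + 0] = dst_uvuv[i * 4 + 2] = src_yuy2[i * 4 + 1];  // U  U
    dst_uvuv[i * 4 + 1] = dst_uvuv[i * 4 + 3] = src_yuy2[i * 4 + 3];  // V  V
  }
}
#endif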
   2013 
   2014 // Convert 16 pixels: 16 UV and 16 Y.
   2015 #define YUVTORGB_AVX2(YuvConstants) \
   2016   __asm {                                    \
   2017     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
   2018     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
   2019     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
   2020     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
   2021     __asm vpsubw     ymm2, ymm3, ymm2                                          \
   2022     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
   2023     __asm vpsubw     ymm1, ymm3, ymm1                                          \
   2024     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
    2025     __asm vpsubw     ymm0, ymm3, ymm0                                          \
             /* Step 2: Find Y contribution to 16 R,G,B values */                       \
   2026     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
   2027     __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
   2028     __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
   2029     __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
   2030     __asm vpsraw     ymm0, ymm0, 6                                             \
   2031     __asm vpsraw     ymm1, ymm1, 6                                             \
   2032     __asm vpsraw     ymm2, ymm2, 6                                             \
   2033     __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
   2034     __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
   2035     __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
   2036   }
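// Scalar sketch of the arithmetic YUVTORGB_AVX2 performs (the SSSE3 YUVTORGB
// later in this file computes the same thing on 8 pixels), modeled on
// libyuv's C reference.  The bias tables fold the -128 chroma recentering
// and the Y offset into one constant per channel, so the kernel can do
// "bias - maddubs(uv)" and then add the scaled Y term.  The BT.601 constants
// below are assumptions for illustration; the real kernel reads whatever
// tables the yuvconstants argument points at.  Disabled.
#if 0
static __inline uint8 Clamp0To255_Sketch(int32 v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void YuvPixel_Sketch(uint8 y, uint8 u, uint8 v,
                                     uint8* b, uint8* g, uint8* r) {
  const int32 UB = -128, UG = 25, VG = 52, VR = -102;  // chroma, 6-bit fixed.
  const int32 YG = 18997;                  // Y scale: ~1.164 in 16-bit fixed.
  const int32 YGB = -1160;                 // Y bias: folds in the -16 offset.
  const int32 BB = UB * 128 + YGB;         // kUVBiasB equivalent.
  const int32 BG = UG * 128 + VG * 128 + YGB;
  const int32 BR = VR * 128 + YGB;
  const int32 y1 = (int32)(((uint32)(y * 0x0101) * YG) >> 16);  // vpmulhuw.
  *b = Clamp0To255_Sketch((y1 - u * UB + BB) >> 6);
  *g = Clamp0To255_Sketch((y1 - (u * UG + v * VG) + BG) >> 6);
  *r = Clamp0To255_Sketch((y1 - v * VR + BR) >> 6);
}
#endif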
   2037 
   2038 // Store 16 ARGB values.
   2039 #define STOREARGB_AVX2 \
   2040   __asm {                                                 \
   2041     __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
   2042     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   2043     __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
   2044     __asm vpermq     ymm2, ymm2, 0xd8                                          \
   2045     __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
   2046     __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
   2047     __asm vmovdqu    0[edx], ymm1                                              \
   2048     __asm vmovdqu    32[edx], ymm0                                             \
   2049     __asm lea        edx,  [edx + 64]}
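// STOREARGB_AVX2 weaves the packed B, G and R registers with alpha into
// libyuv ARGB memory order; the vpermq steps undo the lane split that
// 256-bit vpunpcklbw leaves behind.  Per-pixel scalar sketch (hypothetical
// helper, illustration only, disabled):
#if 0
static __inline void StoreARGBPixel_Sketch(uint8 b, uint8 g, uint8 r, uint8 a,
                                           uint8* dst_argb) {
  dst_argb[0] = b;  // libyuv ARGB is B, G, R, A in memory (little endian).
  dst_argb[1] = g;
  dst_argb[2] = r;
  dst_argb[3] = a;
}
#endif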
   2050 
   2051 // Store 16 RGBA values.
   2052 #define STORERGBA_AVX2 \
   2053   __asm {                                                 \
   2054     __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
   2055     __asm vpermq     ymm1, ymm1, 0xd8                                          \
   2056     __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
   2057     __asm vpermq     ymm2, ymm2, 0xd8                                          \
   2058     __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
   2059     __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
   2060     __asm vmovdqu    [edx], ymm0                                               \
   2061     __asm vmovdqu    [edx + 32], ymm1                                          \
   2062     __asm lea        edx,  [edx + 64]}
   2063 
   2064 #ifdef HAS_I422TOARGBROW_AVX2
   2065 // 16 pixels
   2066 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2067 __declspec(naked) void I422ToARGBRow_AVX2(
   2068     const uint8* y_buf,
   2069     const uint8* u_buf,
   2070     const uint8* v_buf,
   2071     uint8* dst_argb,
   2072     const struct YuvConstants* yuvconstants,
   2073     int width) {
   2074   __asm {
   2075     push       esi
   2076     push       edi
   2077     push       ebx
   2078     mov        eax, [esp + 12 + 4]  // Y
   2079     mov        esi, [esp + 12 + 8]  // U
   2080     mov        edi, [esp + 12 + 12]  // V
   2081     mov        edx, [esp + 12 + 16]  // argb
   2082     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2083     mov        ecx, [esp + 12 + 24]  // width
   2084     sub        edi, esi
   2085     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
   2086 
   2087  convertloop:
   2088     READYUV422_AVX2
   2089     YUVTORGB_AVX2(ebx)
   2090     STOREARGB_AVX2
   2091 
   2092     sub        ecx, 16
   2093     jg         convertloop
   2094 
   2095     pop        ebx
   2096     pop        edi
   2097     pop        esi
   2098     vzeroupper
   2099     ret
   2100   }
   2101 }
   2102 #endif  // HAS_I422TOARGBROW_AVX2
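// Row functions convert one row per call; a caller walks the image and
// advances each plane by its stride.  Hypothetical driver sketch (the real
// dispatch lives elsewhere in libyuv and also handles widths that are not a
// multiple of 16).  Disabled; illustration only.
#if 0
static void I422ToARGBFrame_Sketch(const uint8* src_y, int stride_y,
                                   const uint8* src_u, int stride_u,
                                   const uint8* src_v, int stride_v,
                                   uint8* dst_argb, int dst_stride,
                                   const struct YuvConstants* yuvconstants,
                                   int width, int height) {
  int y;
  for (y = 0; y < height; ++y) {  // Assumes width is a multiple of 16.
    I422ToARGBRow_AVX2(src_y, src_u, src_v, dst_argb, yuvconstants, width);
    src_y += stride_y;
    src_u += stride_u;  // 4:2:2 chroma planes are half width.
    src_v += stride_v;
    dst_argb += dst_stride;
  }
}
#endif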
   2103 
   2104 #ifdef HAS_I422ALPHATOARGBROW_AVX2
   2105 // 16 pixels
   2106 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
   2107 __declspec(naked) void I422AlphaToARGBRow_AVX2(
   2108     const uint8* y_buf,
   2109     const uint8* u_buf,
   2110     const uint8* v_buf,
   2111     const uint8* a_buf,
   2112     uint8* dst_argb,
   2113     const struct YuvConstants* yuvconstants,
   2114     int width) {
   2115   __asm {
   2116     push       esi
   2117     push       edi
   2118     push       ebx
   2119     push       ebp
   2120     mov        eax, [esp + 16 + 4]  // Y
   2121     mov        esi, [esp + 16 + 8]  // U
   2122     mov        edi, [esp + 16 + 12]  // V
   2123     mov        ebp, [esp + 16 + 16]  // A
   2124     mov        edx, [esp + 16 + 20]  // argb
   2125     mov        ebx, [esp + 16 + 24]  // yuvconstants
   2126     mov        ecx, [esp + 16 + 28]  // width
   2127     sub        edi, esi
   2128 
   2129  convertloop:
   2130     READYUVA422_AVX2
   2131     YUVTORGB_AVX2(ebx)
   2132     STOREARGB_AVX2
   2133 
   2134     sub        ecx, 16
   2135     jg         convertloop
   2136 
   2137     pop        ebp
   2138     pop        ebx
   2139     pop        edi
   2140     pop        esi
   2141     vzeroupper
   2142     ret
   2143   }
   2144 }
   2145 #endif  // HAS_I422ALPHATOARGBROW_AVX2
   2146 
   2147 #ifdef HAS_I444TOARGBROW_AVX2
   2148 // 16 pixels
   2149 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
   2150 __declspec(naked) void I444ToARGBRow_AVX2(
   2151     const uint8* y_buf,
   2152     const uint8* u_buf,
   2153     const uint8* v_buf,
   2154     uint8* dst_argb,
   2155     const struct YuvConstants* yuvconstants,
   2156     int width) {
   2157   __asm {
   2158     push       esi
   2159     push       edi
   2160     push       ebx
   2161     mov        eax, [esp + 12 + 4]  // Y
   2162     mov        esi, [esp + 12 + 8]  // U
   2163     mov        edi, [esp + 12 + 12]  // V
   2164     mov        edx, [esp + 12 + 16]  // argb
   2165     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2166     mov        ecx, [esp + 12 + 24]  // width
   2167     sub        edi, esi
   2168     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
   2169  convertloop:
   2170     READYUV444_AVX2
   2171     YUVTORGB_AVX2(ebx)
   2172     STOREARGB_AVX2
   2173 
   2174     sub        ecx, 16
   2175     jg         convertloop
   2176 
   2177     pop        ebx
   2178     pop        edi
   2179     pop        esi
   2180     vzeroupper
   2181     ret
   2182   }
   2183 }
   2184 #endif  // HAS_I444TOARGBROW_AVX2
   2185 
   2186 #ifdef HAS_NV12TOARGBROW_AVX2
   2187 // 16 pixels.
   2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2189 __declspec(naked) void NV12ToARGBRow_AVX2(
   2190     const uint8* y_buf,
   2191     const uint8* uv_buf,
   2192     uint8* dst_argb,
   2193     const struct YuvConstants* yuvconstants,
   2194     int width) {
   2195   __asm {
   2196     push       esi
   2197     push       ebx
   2198     mov        eax, [esp + 8 + 4]  // Y
   2199     mov        esi, [esp + 8 + 8]  // UV
   2200     mov        edx, [esp + 8 + 12]  // argb
   2201     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2202     mov        ecx, [esp + 8 + 20]  // width
   2203     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
   2204 
   2205  convertloop:
   2206     READNV12_AVX2
   2207     YUVTORGB_AVX2(ebx)
   2208     STOREARGB_AVX2
   2209 
   2210     sub        ecx, 16
   2211     jg         convertloop
   2212 
   2213     pop        ebx
   2214     pop        esi
   2215     vzeroupper
   2216     ret
   2217   }
   2218 }
   2219 #endif  // HAS_NV12TOARGBROW_AVX2
   2220 
   2221 #ifdef HAS_NV21TOARGBROW_AVX2
   2222 // 16 pixels.
   2223 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2224 __declspec(naked) void NV21ToARGBRow_AVX2(
   2225     const uint8* y_buf,
   2226     const uint8* vu_buf,
   2227     uint8* dst_argb,
   2228     const struct YuvConstants* yuvconstants,
   2229     int width) {
   2230   __asm {
   2231     push       esi
   2232     push       ebx
   2233     mov        eax, [esp + 8 + 4]  // Y
   2234     mov        esi, [esp + 8 + 8]  // VU
   2235     mov        edx, [esp + 8 + 12]  // argb
   2236     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2237     mov        ecx, [esp + 8 + 20]  // width
   2238     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
   2239 
   2240  convertloop:
   2241     READNV21_AVX2
   2242     YUVTORGB_AVX2(ebx)
   2243     STOREARGB_AVX2
   2244 
   2245     sub        ecx, 16
   2246     jg         convertloop
   2247 
   2248     pop        ebx
   2249     pop        esi
   2250     vzeroupper
   2251     ret
   2252   }
   2253 }
   2254 #endif  // HAS_NV21TOARGBROW_AVX2
   2255 
   2256 #ifdef HAS_YUY2TOARGBROW_AVX2
   2257 // 16 pixels.
   2258 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
   2259 __declspec(naked) void YUY2ToARGBRow_AVX2(
   2260     const uint8* src_yuy2,
   2261     uint8* dst_argb,
   2262     const struct YuvConstants* yuvconstants,
   2263     int width) {
   2264   __asm {
   2265     push       ebx
   2266     mov        eax, [esp + 4 + 4]  // yuy2
   2267     mov        edx, [esp + 4 + 8]  // argb
   2268     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2269     mov        ecx, [esp + 4 + 16]  // width
   2270     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
   2271 
   2272  convertloop:
   2273     READYUY2_AVX2
   2274     YUVTORGB_AVX2(ebx)
   2275     STOREARGB_AVX2
   2276 
   2277     sub        ecx, 16
   2278     jg         convertloop
   2279 
   2280     pop        ebx
   2281     vzeroupper
   2282     ret
   2283   }
   2284 }
   2285 #endif  // HAS_YUY2TOARGBROW_AVX2
   2286 
   2287 #ifdef HAS_UYVYTOARGBROW_AVX2
   2288 // 16 pixels.
   2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
   2290 __declspec(naked) void UYVYToARGBRow_AVX2(
   2291     const uint8* src_uyvy,
   2292     uint8* dst_argb,
   2293     const struct YuvConstants* yuvconstants,
   2294     int width) {
   2295   __asm {
   2296     push       ebx
   2297     mov        eax, [esp + 4 + 4]  // uyvy
   2298     mov        edx, [esp + 4 + 8]  // argb
   2299     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2300     mov        ecx, [esp + 4 + 16]  // width
   2301     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
   2302 
   2303  convertloop:
   2304     READUYVY_AVX2
   2305     YUVTORGB_AVX2(ebx)
   2306     STOREARGB_AVX2
   2307 
   2308     sub        ecx, 16
   2309     jg         convertloop
   2310 
   2311     pop        ebx
   2312     vzeroupper
   2313     ret
   2314   }
   2315 }
   2316 #endif  // HAS_UYVYTOARGBROW_AVX2
   2317 
   2318 #ifdef HAS_I422TORGBAROW_AVX2
   2319 // 16 pixels
   2320 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
   2321 __declspec(naked) void I422ToRGBARow_AVX2(
   2322     const uint8* y_buf,
   2323     const uint8* u_buf,
   2324     const uint8* v_buf,
   2325     uint8* dst_argb,
   2326     const struct YuvConstants* yuvconstants,
   2327     int width) {
   2328   __asm {
   2329     push       esi
   2330     push       edi
   2331     push       ebx
   2332     mov        eax, [esp + 12 + 4]  // Y
   2333     mov        esi, [esp + 12 + 8]  // U
   2334     mov        edi, [esp + 12 + 12]  // V
    2335     mov        edx, [esp + 12 + 16]  // rgba
   2336     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2337     mov        ecx, [esp + 12 + 24]  // width
   2338     sub        edi, esi
   2339     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
   2340 
   2341  convertloop:
   2342     READYUV422_AVX2
   2343     YUVTORGB_AVX2(ebx)
   2344     STORERGBA_AVX2
   2345 
   2346     sub        ecx, 16
   2347     jg         convertloop
   2348 
   2349     pop        ebx
   2350     pop        edi
   2351     pop        esi
   2352     vzeroupper
   2353     ret
   2354   }
   2355 }
   2356 #endif  // HAS_I422TORGBAROW_AVX2
   2357 
   2358 #if defined(HAS_I422TOARGBROW_SSSE3)
    2359 // TODO(fbarchard): Add a read that does half size on Y and treats 420 as 444.
    2360 // This would allow a conversion with half-size scaling.
   2361 
   2362 // Read 8 UV from 444.
   2363 #define READYUV444 \
   2364   __asm {                                                     \
   2365     __asm movq       xmm0, qword ptr [esi] /* U */                             \
   2366     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
   2367     __asm lea        esi,  [esi + 8]                                           \
   2368     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
   2369     __asm movq       xmm4, qword ptr [eax]                                     \
   2370     __asm punpcklbw  xmm4, xmm4                                                \
   2371     __asm lea        eax, [eax + 8]}
   2372 
   2373 // Read 4 UV from 422, upsample to 8 UV.
   2374 #define READYUV422 \
   2375   __asm {                                                     \
   2376     __asm movd       xmm0, [esi] /* U */                              \
   2377     __asm movd       xmm1, [esi + edi] /* V */                              \
   2378     __asm lea        esi,  [esi + 4]                                           \
   2379     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
   2380     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
   2381     __asm movq       xmm4, qword ptr [eax]                                     \
   2382     __asm punpcklbw  xmm4, xmm4                                                \
   2383     __asm lea        eax, [eax + 8]}
   2384 
   2385 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
   2386 #define READYUVA422 \
   2387   __asm {                                                    \
   2388     __asm movd       xmm0, [esi] /* U */                              \
   2389     __asm movd       xmm1, [esi + edi] /* V */                              \
   2390     __asm lea        esi,  [esi + 4]                                           \
   2391     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
   2392     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
   2393     __asm movq       xmm4, qword ptr [eax] /* Y */                           \
   2394     __asm punpcklbw  xmm4, xmm4                                                \
   2395     __asm lea        eax, [eax + 8]                                            \
   2396     __asm movq       xmm5, qword ptr [ebp] /* A */                           \
   2397     __asm lea        ebp, [ebp + 8]}
   2398 
   2399 // Read 4 UV from NV12, upsample to 8 UV.
   2400 #define READNV12 \
   2401   __asm {                                                       \
   2402     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
   2403     __asm lea        esi,  [esi + 8]                                           \
   2404     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
   2405     __asm movq       xmm4, qword ptr [eax]                                     \
   2406     __asm punpcklbw  xmm4, xmm4                                                \
   2407     __asm lea        eax, [eax + 8]}
   2408 
   2409 // Read 4 VU from NV21, upsample to 8 UV.
   2410 #define READNV21 \
   2411   __asm {                                                       \
   2412     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
   2413     __asm lea        esi,  [esi + 8]                                           \
   2414     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
   2415     __asm movq       xmm4, qword ptr [eax]                                     \
   2416     __asm punpcklbw  xmm4, xmm4                                                \
   2417     __asm lea        eax, [eax + 8]}
   2418 
   2419 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
   2420 #define READYUY2 \
   2421   __asm {                                                       \
   2422     __asm movdqu     xmm4, [eax] /* YUY2 */                           \
   2423     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
   2424     __asm movdqu     xmm0, [eax] /* UV */                             \
   2425     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
   2426     __asm lea        eax, [eax + 16]}
   2427 
   2428 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
   2429 #define READUYVY \
   2430   __asm {                                                       \
   2431     __asm movdqu     xmm4, [eax] /* UYVY */                           \
   2432     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
   2433     __asm movdqu     xmm0, [eax] /* UV */                             \
   2434     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
   2435     __asm lea        eax, [eax + 16]}
   2436 
   2437 // Convert 8 pixels: 8 UV and 8 Y.
   2438 #define YUVTORGB(YuvConstants) \
   2439   __asm {                                         \
   2440     __asm movdqa     xmm1, xmm0                                                \
   2441     __asm movdqa     xmm2, xmm0                                                \
   2442     __asm movdqa     xmm3, xmm0                                                \
   2443     __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
   2444     __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
   2445     __asm psubw      xmm0, xmm1                                                \
   2446     __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
   2447     __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
   2448     __asm psubw      xmm1, xmm2                                                \
   2449     __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
   2450     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
   2451     __asm psubw      xmm2, xmm3                                                \
   2452     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
   2453     __asm paddsw     xmm0, xmm4 /* B += Y */                         \
   2454     __asm paddsw     xmm1, xmm4 /* G += Y */                         \
   2455     __asm paddsw     xmm2, xmm4 /* R += Y */                         \
   2456     __asm psraw      xmm0, 6                                                   \
   2457     __asm psraw      xmm1, 6                                                   \
   2458     __asm psraw      xmm2, 6                                                   \
   2459     __asm packuswb   xmm0, xmm0 /* B */                              \
   2460     __asm packuswb   xmm1, xmm1 /* G */                              \
   2461     __asm packuswb   xmm2, xmm2 /* R */             \
   2462   }
   2463 
   2464 // Store 8 ARGB values.
   2465 #define STOREARGB \
   2466   __asm {                                                      \
   2467     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
   2468     __asm punpcklbw  xmm2, xmm5 /* RA */                             \
   2469     __asm movdqa     xmm1, xmm0                                                \
   2470     __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
   2471     __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
   2472     __asm movdqu     0[edx], xmm0                                              \
   2473     __asm movdqu     16[edx], xmm1                                             \
   2474     __asm lea        edx,  [edx + 32]}
   2475 
   2476 // Store 8 BGRA values.
   2477 #define STOREBGRA \
   2478   __asm {                                                      \
   2479     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
   2480     __asm punpcklbw  xmm1, xmm0 /* GB */                             \
   2481     __asm punpcklbw  xmm5, xmm2 /* AR */                             \
   2482     __asm movdqa     xmm0, xmm5                                                \
   2483     __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
   2484     __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
   2485     __asm movdqu     0[edx], xmm5                                              \
   2486     __asm movdqu     16[edx], xmm0                                             \
   2487     __asm lea        edx,  [edx + 32]}
   2488 
   2489 // Store 8 RGBA values.
   2490 #define STORERGBA \
   2491   __asm {                                                      \
   2492     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
   2493     __asm punpcklbw  xmm1, xmm2 /* GR */                             \
   2494     __asm punpcklbw  xmm5, xmm0 /* AB */                             \
   2495     __asm movdqa     xmm0, xmm5                                                \
   2496     __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
   2497     __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
   2498     __asm movdqu     0[edx], xmm5                                              \
   2499     __asm movdqu     16[edx], xmm0                                             \
   2500     __asm lea        edx,  [edx + 32]}
   2501 
   2502 // Store 8 RGB24 values.
   2503 #define STORERGB24 \
   2504   __asm {/* Weave into RRGB */                                                      \
   2505     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
   2506     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
   2507     __asm movdqa     xmm1, xmm0                                                \
   2508     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
    2509     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */             \
             /* RRGB -> RGB24 */                                              \
   2510     __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
   2511     __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
    2512     __asm palignr    xmm1, xmm0, 12 /* last 4 of xmm0 + first 12 of xmm1 */ \
   2513     __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
   2514     __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
   2515     __asm lea        edx,  [edx + 24]}
   2516 
   2517 // Store 8 RGB565 values.
   2518 #define STORERGB565 \
   2519   __asm {/* Weave into RRGB */                                                      \
   2520     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
   2521     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
   2522     __asm movdqa     xmm1, xmm0                                                \
   2523     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
    2524     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */             \
             /* RRGB -> RGB565 */                                             \
   2525     __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
   2526     __asm movdqa     xmm2, xmm0 /* G */                                     \
   2527     __asm pslld      xmm0, 8 /* R */                                     \
   2528     __asm psrld      xmm3, 3 /* B */                                     \
   2529     __asm psrld      xmm2, 5 /* G */                                     \
   2530     __asm psrad      xmm0, 16 /* R */                                     \
   2531     __asm pand       xmm3, xmm5 /* B */                                     \
   2532     __asm pand       xmm2, xmm6 /* G */                                     \
   2533     __asm pand       xmm0, xmm7 /* R */                                     \
   2534     __asm por        xmm3, xmm2 /* BG */                                    \
   2535     __asm por        xmm0, xmm3 /* BGR */                                   \
   2536     __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
   2537     __asm movdqa     xmm2, xmm1 /* G */                                     \
   2538     __asm pslld      xmm1, 8 /* R */                                     \
   2539     __asm psrld      xmm3, 3 /* B */                                     \
   2540     __asm psrld      xmm2, 5 /* G */                                     \
   2541     __asm psrad      xmm1, 16 /* R */                                     \
   2542     __asm pand       xmm3, xmm5 /* B */                                     \
   2543     __asm pand       xmm2, xmm6 /* G */                                     \
   2544     __asm pand       xmm1, xmm7 /* R */                                     \
   2545     __asm por        xmm3, xmm2 /* BG */                                    \
   2546     __asm por        xmm1, xmm3 /* BGR */                                   \
   2547     __asm packssdw   xmm0, xmm1                                                \
   2548     __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
   2549     __asm lea        edx, [edx + 16]}
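// STORERGB565 builds each 16-bit pixel by masking the top 5/6/5 bits of
// B/G/R into place.  Scalar sketch of the packing (hypothetical helper,
// illustration only, disabled):
#if 0
static __inline uint16 PackRGB565_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
#endif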
   2550 
   2551 // 8 pixels.
   2552 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
   2553 __declspec(naked) void I444ToARGBRow_SSSE3(
   2554     const uint8* y_buf,
   2555     const uint8* u_buf,
   2556     const uint8* v_buf,
   2557     uint8* dst_argb,
   2558     const struct YuvConstants* yuvconstants,
   2559     int width) {
   2560   __asm {
   2561     push       esi
   2562     push       edi
   2563     push       ebx
   2564     mov        eax, [esp + 12 + 4]  // Y
   2565     mov        esi, [esp + 12 + 8]  // U
   2566     mov        edi, [esp + 12 + 12]  // V
   2567     mov        edx, [esp + 12 + 16]  // argb
   2568     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2569     mov        ecx, [esp + 12 + 24]  // width
   2570     sub        edi, esi
   2571     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
   2572 
   2573  convertloop:
   2574     READYUV444
   2575     YUVTORGB(ebx)
   2576     STOREARGB
   2577 
   2578     sub        ecx, 8
   2579     jg         convertloop
   2580 
   2581     pop        ebx
   2582     pop        edi
   2583     pop        esi
   2584     ret
   2585   }
   2586 }
   2587 
   2588 // 8 pixels.
   2589 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
   2590 __declspec(naked) void I422ToRGB24Row_SSSE3(
   2591     const uint8* y_buf,
   2592     const uint8* u_buf,
   2593     const uint8* v_buf,
   2594     uint8* dst_rgb24,
   2595     const struct YuvConstants* yuvconstants,
   2596     int width) {
   2597   __asm {
   2598     push       esi
   2599     push       edi
   2600     push       ebx
   2601     mov        eax, [esp + 12 + 4]  // Y
   2602     mov        esi, [esp + 12 + 8]  // U
   2603     mov        edi, [esp + 12 + 12]  // V
    2604     mov        edx, [esp + 12 + 16]  // rgb24
   2605     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2606     mov        ecx, [esp + 12 + 24]  // width
   2607     sub        edi, esi
   2608     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
   2609     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
   2610 
   2611  convertloop:
   2612     READYUV422
   2613     YUVTORGB(ebx)
   2614     STORERGB24
   2615 
   2616     sub        ecx, 8
   2617     jg         convertloop
   2618 
   2619     pop        ebx
   2620     pop        edi
   2621     pop        esi
   2622     ret
   2623   }
   2624 }
   2625 
   2626 // 8 pixels
   2627 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
   2628 __declspec(naked) void I422ToRGB565Row_SSSE3(
   2629     const uint8* y_buf,
   2630     const uint8* u_buf,
   2631     const uint8* v_buf,
   2632     uint8* rgb565_buf,
   2633     const struct YuvConstants* yuvconstants,
   2634     int width) {
   2635   __asm {
   2636     push       esi
   2637     push       edi
   2638     push       ebx
   2639     mov        eax, [esp + 12 + 4]  // Y
   2640     mov        esi, [esp + 12 + 8]  // U
   2641     mov        edi, [esp + 12 + 12]  // V
    2642     mov        edx, [esp + 12 + 16]  // rgb565
   2643     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2644     mov        ecx, [esp + 12 + 24]  // width
   2645     sub        edi, esi
   2646     pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
   2647     psrld      xmm5, 27
   2648     pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
   2649     psrld      xmm6, 26
   2650     pslld      xmm6, 5
   2651     pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
   2652     pslld      xmm7, 11
   2653 
   2654  convertloop:
   2655     READYUV422
   2656     YUVTORGB(ebx)
   2657     STORERGB565
   2658 
   2659     sub        ecx, 8
   2660     jg         convertloop
   2661 
   2662     pop        ebx
   2663     pop        edi
   2664     pop        esi
   2665     ret
   2666   }
   2667 }
   2668 
   2669 // 8 pixels.
   2670 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2671 __declspec(naked) void I422ToARGBRow_SSSE3(
   2672     const uint8* y_buf,
   2673     const uint8* u_buf,
   2674     const uint8* v_buf,
   2675     uint8* dst_argb,
   2676     const struct YuvConstants* yuvconstants,
   2677     int width) {
   2678   __asm {
   2679     push       esi
   2680     push       edi
   2681     push       ebx
   2682     mov        eax, [esp + 12 + 4]  // Y
   2683     mov        esi, [esp + 12 + 8]  // U
   2684     mov        edi, [esp + 12 + 12]  // V
   2685     mov        edx, [esp + 12 + 16]  // argb
   2686     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2687     mov        ecx, [esp + 12 + 24]  // width
   2688     sub        edi, esi
   2689     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
   2690 
   2691  convertloop:
   2692     READYUV422
   2693     YUVTORGB(ebx)
   2694     STOREARGB
   2695 
   2696     sub        ecx, 8
   2697     jg         convertloop
   2698 
   2699     pop        ebx
   2700     pop        edi
   2701     pop        esi
   2702     ret
   2703   }
   2704 }
   2705 
   2706 // 8 pixels.
   2707 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
   2708 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
   2709     const uint8* y_buf,
   2710     const uint8* u_buf,
   2711     const uint8* v_buf,
   2712     const uint8* a_buf,
   2713     uint8* dst_argb,
   2714     const struct YuvConstants* yuvconstants,
   2715     int width) {
   2716   __asm {
   2717     push       esi
   2718     push       edi
   2719     push       ebx
   2720     push       ebp
   2721     mov        eax, [esp + 16 + 4]  // Y
   2722     mov        esi, [esp + 16 + 8]  // U
   2723     mov        edi, [esp + 16 + 12]  // V
   2724     mov        ebp, [esp + 16 + 16]  // A
   2725     mov        edx, [esp + 16 + 20]  // argb
   2726     mov        ebx, [esp + 16 + 24]  // yuvconstants
   2727     mov        ecx, [esp + 16 + 28]  // width
   2728     sub        edi, esi
   2729 
   2730  convertloop:
   2731     READYUVA422
   2732     YUVTORGB(ebx)
   2733     STOREARGB
   2734 
   2735     sub        ecx, 8
   2736     jg         convertloop
   2737 
   2738     pop        ebp
   2739     pop        ebx
   2740     pop        edi
   2741     pop        esi
   2742     ret
   2743   }
   2744 }
   2745 
   2746 // 8 pixels.
   2747 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2748 __declspec(naked) void NV12ToARGBRow_SSSE3(
   2749     const uint8* y_buf,
   2750     const uint8* uv_buf,
   2751     uint8* dst_argb,
   2752     const struct YuvConstants* yuvconstants,
   2753     int width) {
   2754   __asm {
   2755     push       esi
   2756     push       ebx
   2757     mov        eax, [esp + 8 + 4]  // Y
   2758     mov        esi, [esp + 8 + 8]  // UV
   2759     mov        edx, [esp + 8 + 12]  // argb
   2760     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2761     mov        ecx, [esp + 8 + 20]  // width
   2762     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
   2763 
   2764  convertloop:
   2765     READNV12
   2766     YUVTORGB(ebx)
   2767     STOREARGB
   2768 
   2769     sub        ecx, 8
   2770     jg         convertloop
   2771 
   2772     pop        ebx
   2773     pop        esi
   2774     ret
   2775   }
   2776 }
   2777 
   2778 // 8 pixels.
   2779 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2780 __declspec(naked) void NV21ToARGBRow_SSSE3(
   2781     const uint8* y_buf,
   2782     const uint8* vu_buf,
   2783     uint8* dst_argb,
   2784     const struct YuvConstants* yuvconstants,
   2785     int width) {
   2786   __asm {
   2787     push       esi
   2788     push       ebx
   2789     mov        eax, [esp + 8 + 4]  // Y
   2790     mov        esi, [esp + 8 + 8]  // VU
   2791     mov        edx, [esp + 8 + 12]  // argb
   2792     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2793     mov        ecx, [esp + 8 + 20]  // width
   2794     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
   2795 
   2796  convertloop:
   2797     READNV21
   2798     YUVTORGB(ebx)
   2799     STOREARGB
   2800 
   2801     sub        ecx, 8
   2802     jg         convertloop
   2803 
   2804     pop        ebx
   2805     pop        esi
   2806     ret
   2807   }
   2808 }
   2809 
   2810 // 8 pixels.
   2811 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
   2812 __declspec(naked) void YUY2ToARGBRow_SSSE3(
   2813     const uint8* src_yuy2,
   2814     uint8* dst_argb,
   2815     const struct YuvConstants* yuvconstants,
   2816     int width) {
   2817   __asm {
   2818     push       ebx
   2819     mov        eax, [esp + 4 + 4]  // yuy2
   2820     mov        edx, [esp + 4 + 8]  // argb
   2821     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2822     mov        ecx, [esp + 4 + 16]  // width
   2823     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
   2824 
   2825  convertloop:
   2826     READYUY2
   2827     YUVTORGB(ebx)
   2828     STOREARGB
   2829 
   2830     sub        ecx, 8
   2831     jg         convertloop
   2832 
   2833     pop        ebx
   2834     ret
   2835   }
   2836 }
   2837 
   2838 // 8 pixels.
   2839 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
   2840 __declspec(naked) void UYVYToARGBRow_SSSE3(
   2841     const uint8* src_uyvy,
   2842     uint8* dst_argb,
   2843     const struct YuvConstants* yuvconstants,
   2844     int width) {
   2845   __asm {
   2846     push       ebx
   2847     mov        eax, [esp + 4 + 4]  // uyvy
   2848     mov        edx, [esp + 4 + 8]  // argb
   2849     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2850     mov        ecx, [esp + 4 + 16]  // width
   2851     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
   2852 
   2853  convertloop:
   2854     READUYVY
   2855     YUVTORGB(ebx)
   2856     STOREARGB
   2857 
   2858     sub        ecx, 8
   2859     jg         convertloop
   2860 
   2861     pop        ebx
   2862     ret
   2863   }
   2864 }
   2865 
   2866 __declspec(naked) void I422ToRGBARow_SSSE3(
   2867     const uint8* y_buf,
   2868     const uint8* u_buf,
   2869     const uint8* v_buf,
   2870     uint8* dst_rgba,
   2871     const struct YuvConstants* yuvconstants,
   2872     int width) {
   2873   __asm {
   2874     push       esi
   2875     push       edi
   2876     push       ebx
   2877     mov        eax, [esp + 12 + 4]  // Y
   2878     mov        esi, [esp + 12 + 8]  // U
   2879     mov        edi, [esp + 12 + 12]  // V
    2880     mov        edx, [esp + 12 + 16]  // rgba
   2881     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2882     mov        ecx, [esp + 12 + 24]  // width
   2883     sub        edi, esi
   2884 
   2885  convertloop:
   2886     READYUV422
   2887     YUVTORGB(ebx)
   2888     STORERGBA
   2889 
   2890     sub        ecx, 8
   2891     jg         convertloop
   2892 
   2893     pop        ebx
   2894     pop        edi
   2895     pop        esi
   2896     ret
   2897   }
   2898 }
   2899 #endif  // HAS_I422TOARGBROW_SSSE3
   2900 
   2901 #ifdef HAS_I400TOARGBROW_SSE2
   2902 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
   2903 __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
   2904                                           uint8* rgb_buf,
   2905                                           int width) {
   2906   __asm {
    2907     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 65536 / 257)
    2908     movd       xmm2, eax
    2909     pshufd     xmm2, xmm2, 0
    2910     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16) - 32 for rounding
   2911     movd       xmm3, eax
   2912     pshufd     xmm3, xmm3, 0
   2913     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
   2914     pslld      xmm4, 24
   2915 
   2916     mov        eax, [esp + 4]  // Y
   2917     mov        edx, [esp + 8]  // rgb
   2918     mov        ecx, [esp + 12]  // width
   2919 
   2920  convertloop:
    2921     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   2922     movq       xmm0, qword ptr [eax]
   2923     lea        eax, [eax + 8]
   2924     punpcklbw  xmm0, xmm0  // Y.Y
   2925     pmulhuw    xmm0, xmm2
   2926     psubusw    xmm0, xmm3
   2927     psrlw      xmm0, 6
   2928     packuswb   xmm0, xmm0        // G
   2929 
   2930     // Step 2: Weave into ARGB
   2931     punpcklbw  xmm0, xmm0  // GG
   2932     movdqa     xmm1, xmm0
   2933     punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
   2934     punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
   2935     por        xmm0, xmm4
   2936     por        xmm1, xmm4
   2937     movdqu     [edx], xmm0
   2938     movdqu     [edx + 16], xmm1
   2939     lea        edx,  [edx + 32]
   2940     sub        ecx, 8
   2941     jg         convertloop
   2942     ret
   2943   }
   2944 }
   2945 #endif  // HAS_I400TOARGBROW_SSE2
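
// Scalar sketch of the fixed point math above (illustrative only; hypothetical
// helper, not a libyuv API). The duplicated Y byte is y * 257, so pmulhuw by
// 18997 plus the bias and shift reproduce G = (y - 16) * 1.164 with rounding.
static __inline uint8 I400ToGray_Reference(uint8 y) {
  uint32 g = ((uint32)y * 257 * 18997) >> 16;  // pmulhuw equivalent
  g = (g < 1160) ? 0 : (g - 1160);             // psubusw saturates at zero
  g >>= 6;                                     // psrlw 6
  return (uint8)(g > 255 ? 255 : g);           // packuswb saturates
}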
   2946 
   2947 #ifdef HAS_I400TOARGBROW_AVX2
   2948 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
   2949 // note: vpunpcklbw mutates and vpackuswb unmutates.
   2950 __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
   2951                                           uint8* rgb_buf,
   2952                                           int width) {
   2953   __asm {
    2954     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 65536 / 257)
    2955     vmovd      xmm2, eax
    2956     vbroadcastss ymm2, xmm2
    2957     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16) - 32 for rounding
   2958     vmovd      xmm3, eax
   2959     vbroadcastss ymm3, xmm3
   2960     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
   2961     vpslld     ymm4, ymm4, 24
   2962 
   2963     mov        eax, [esp + 4]  // Y
   2964     mov        edx, [esp + 8]  // rgb
   2965     mov        ecx, [esp + 12]  // width
   2966 
   2967  convertloop:
    2968     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
   2969     vmovdqu    xmm0, [eax]
   2970     lea        eax, [eax + 16]
   2971     vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
   2972     vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
   2973     vpmulhuw   ymm0, ymm0, ymm2
   2974     vpsubusw   ymm0, ymm0, ymm3
   2975     vpsrlw     ymm0, ymm0, 6
   2976     vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
   2977 
   2978     // TODO(fbarchard): Weave alpha with unpack.
   2979     // Step 2: Weave into ARGB
   2980     vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
   2981     vpermq     ymm1, ymm1, 0xd8
   2982     vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
   2983     vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
   2984     vpor       ymm0, ymm0, ymm4
   2985     vpor       ymm1, ymm1, ymm4
   2986     vmovdqu    [edx], ymm0
   2987     vmovdqu    [edx + 32], ymm1
   2988     lea        edx,  [edx + 64]
   2989     sub        ecx, 16
   2990     jg         convertloop
   2991     vzeroupper
   2992     ret
   2993   }
   2994 }
   2995 #endif  // HAS_I400TOARGBROW_AVX2
   2996 
   2997 #ifdef HAS_MIRRORROW_SSSE3
   2998 // Shuffle table for reversing the bytes.
   2999 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
   3000                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
   3001 
   3002 // TODO(fbarchard): Replace lea with -16 offset.
   3003 __declspec(naked) void MirrorRow_SSSE3(const uint8* src,
   3004                                        uint8* dst,
   3005                                        int width) {
   3006   __asm {
   3007     mov       eax, [esp + 4]  // src
   3008     mov       edx, [esp + 8]  // dst
   3009     mov       ecx, [esp + 12]  // width
   3010     movdqa    xmm5, xmmword ptr kShuffleMirror
   3011 
   3012  convertloop:
   3013     movdqu    xmm0, [eax - 16 + ecx]
   3014     pshufb    xmm0, xmm5
   3015     movdqu    [edx], xmm0
   3016     lea       edx, [edx + 16]
   3017     sub       ecx, 16
   3018     jg        convertloop
   3019     ret
   3020   }
   3021 }
   3022 #endif  // HAS_MIRRORROW_SSSE3
   3023 
   3024 #ifdef HAS_MIRRORROW_AVX2
   3025 __declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   3026   __asm {
   3027     mov       eax, [esp + 4]  // src
   3028     mov       edx, [esp + 8]  // dst
   3029     mov       ecx, [esp + 12]  // width
   3030     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
   3031 
   3032  convertloop:
   3033     vmovdqu   ymm0, [eax - 32 + ecx]
   3034     vpshufb   ymm0, ymm0, ymm5
    3035     vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
   3036     vmovdqu   [edx], ymm0
   3037     lea       edx, [edx + 32]
   3038     sub       ecx, 32
   3039     jg        convertloop
   3040     vzeroupper
   3041     ret
   3042   }
   3043 }
   3044 #endif  // HAS_MIRRORROW_AVX2
   3045 
   3046 #ifdef HAS_MIRRORUVROW_SSSE3
    3047 // Shuffle table for reversing UV pairs while splitting the U and V bytes.
   3048 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
   3049                                        15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
   3050 
   3051 __declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
   3052                                          uint8* dst_u,
   3053                                          uint8* dst_v,
   3054                                          int width) {
   3055   __asm {
   3056     push      edi
   3057     mov       eax, [esp + 4 + 4]  // src
   3058     mov       edx, [esp + 4 + 8]  // dst_u
   3059     mov       edi, [esp + 4 + 12]  // dst_v
   3060     mov       ecx, [esp + 4 + 16]  // width
   3061     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
   3062     lea       eax, [eax + ecx * 2 - 16]
   3063     sub       edi, edx
   3064 
   3065  convertloop:
   3066     movdqu    xmm0, [eax]
   3067     lea       eax, [eax - 16]
   3068     pshufb    xmm0, xmm1
   3069     movlpd    qword ptr [edx], xmm0
   3070     movhpd    qword ptr [edx + edi], xmm0
   3071     lea       edx, [edx + 8]
   3072     sub       ecx, 8
   3073     jg        convertloop
   3074 
   3075     pop       edi
   3076     ret
   3077   }
   3078 }
   3079 #endif  // HAS_MIRRORUVROW_SSSE3
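
// Scalar sketch (illustrative only; hypothetical helper, not a libyuv API):
// kShuffleMirrorUV reverses the UV pairs and splits the planes in one pshufb,
// equivalent to this per-pair loop.
static void MirrorUVRow_Reference(const uint8* src,
                                  uint8* dst_u,
                                  uint8* dst_v,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src[(width - 1 - x) * 2 + 0];
    dst_v[x] = src[(width - 1 - x) * 2 + 1];
  }
}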
   3080 
   3081 #ifdef HAS_ARGBMIRRORROW_SSE2
   3082 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
   3083                                           uint8* dst,
   3084                                           int width) {
   3085   __asm {
   3086     mov       eax, [esp + 4]  // src
   3087     mov       edx, [esp + 8]  // dst
   3088     mov       ecx, [esp + 12]  // width
   3089     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
   3090 
   3091  convertloop:
   3092     movdqu    xmm0, [eax]
   3093     lea       eax, [eax - 16]
   3094     pshufd    xmm0, xmm0, 0x1b
   3095     movdqu    [edx], xmm0
   3096     lea       edx, [edx + 16]
   3097     sub       ecx, 4
   3098     jg        convertloop
   3099     ret
   3100   }
   3101 }
   3102 #endif  // HAS_ARGBMIRRORROW_SSE2
   3103 
   3104 #ifdef HAS_ARGBMIRRORROW_AVX2
    3105 // Permute table for reversing the dwords (whole ARGB pixels).
   3106 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
   3107 
   3108 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
   3109                                           uint8* dst,
   3110                                           int width) {
   3111   __asm {
   3112     mov       eax, [esp + 4]  // src
   3113     mov       edx, [esp + 8]  // dst
   3114     mov       ecx, [esp + 12]  // width
   3115     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
   3116 
   3117  convertloop:
   3118     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
   3119     vmovdqu   [edx], ymm0
   3120     lea       edx, [edx + 32]
   3121     sub       ecx, 8
   3122     jg        convertloop
   3123     vzeroupper
   3124     ret
   3125   }
   3126 }
   3127 #endif  // HAS_ARGBMIRRORROW_AVX2
   3128 
   3129 #ifdef HAS_SPLITUVROW_SSE2
   3130 __declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
   3131                                        uint8* dst_u,
   3132                                        uint8* dst_v,
   3133                                        int width) {
   3134   __asm {
   3135     push       edi
   3136     mov        eax, [esp + 4 + 4]  // src_uv
   3137     mov        edx, [esp + 4 + 8]  // dst_u
   3138     mov        edi, [esp + 4 + 12]  // dst_v
   3139     mov        ecx, [esp + 4 + 16]  // width
   3140     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
   3141     psrlw      xmm5, 8
   3142     sub        edi, edx
   3143 
   3144   convertloop:
   3145     movdqu     xmm0, [eax]
   3146     movdqu     xmm1, [eax + 16]
   3147     lea        eax,  [eax + 32]
   3148     movdqa     xmm2, xmm0
   3149     movdqa     xmm3, xmm1
   3150     pand       xmm0, xmm5  // even bytes
   3151     pand       xmm1, xmm5
   3152     packuswb   xmm0, xmm1
   3153     psrlw      xmm2, 8  // odd bytes
   3154     psrlw      xmm3, 8
   3155     packuswb   xmm2, xmm3
   3156     movdqu     [edx], xmm0
   3157     movdqu     [edx + edi], xmm2
   3158     lea        edx, [edx + 16]
   3159     sub        ecx, 16
   3160     jg         convertloop
   3161 
   3162     pop        edi
   3163     ret
   3164   }
   3165 }
   3166 
   3167 #endif  // HAS_SPLITUVROW_SSE2
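
// Scalar sketch (illustrative only; hypothetical helper, not a libyuv API):
// SplitUV deinterleaves even bytes to U and odd bytes to V, which the SSE2
// code above does with the 0x00ff00ff mask and word shifts.
static void SplitUVRow_Reference(const uint8* src_uv,
                                 uint8* dst_u,
                                 uint8* dst_v,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];
    dst_v[x] = src_uv[x * 2 + 1];
  }
}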
   3168 
   3169 #ifdef HAS_SPLITUVROW_AVX2
   3170 __declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
   3171                                        uint8* dst_u,
   3172                                        uint8* dst_v,
   3173                                        int width) {
   3174   __asm {
   3175     push       edi
   3176     mov        eax, [esp + 4 + 4]  // src_uv
   3177     mov        edx, [esp + 4 + 8]  // dst_u
   3178     mov        edi, [esp + 4 + 12]  // dst_v
   3179     mov        ecx, [esp + 4 + 16]  // width
   3180     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3181     vpsrlw     ymm5, ymm5, 8
   3182     sub        edi, edx
   3183 
   3184   convertloop:
   3185     vmovdqu    ymm0, [eax]
   3186     vmovdqu    ymm1, [eax + 32]
   3187     lea        eax,  [eax + 64]
   3188     vpsrlw     ymm2, ymm0, 8  // odd bytes
   3189     vpsrlw     ymm3, ymm1, 8
   3190     vpand      ymm0, ymm0, ymm5  // even bytes
   3191     vpand      ymm1, ymm1, ymm5
   3192     vpackuswb  ymm0, ymm0, ymm1
   3193     vpackuswb  ymm2, ymm2, ymm3
   3194     vpermq     ymm0, ymm0, 0xd8
   3195     vpermq     ymm2, ymm2, 0xd8
   3196     vmovdqu    [edx], ymm0
   3197     vmovdqu    [edx + edi], ymm2
   3198     lea        edx, [edx + 32]
   3199     sub        ecx, 32
   3200     jg         convertloop
   3201 
   3202     pop        edi
   3203     vzeroupper
   3204     ret
   3205   }
   3206 }
   3207 #endif  // HAS_SPLITUVROW_AVX2
   3208 
   3209 #ifdef HAS_MERGEUVROW_SSE2
   3210 __declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
   3211                                        const uint8* src_v,
   3212                                        uint8* dst_uv,
   3213                                        int width) {
   3214   __asm {
   3215     push       edi
   3216     mov        eax, [esp + 4 + 4]  // src_u
   3217     mov        edx, [esp + 4 + 8]  // src_v
   3218     mov        edi, [esp + 4 + 12]  // dst_uv
   3219     mov        ecx, [esp + 4 + 16]  // width
   3220     sub        edx, eax
   3221 
   3222   convertloop:
   3223     movdqu     xmm0, [eax]  // read 16 U's
   3224     movdqu     xmm1, [eax + edx]  // and 16 V's
   3225     lea        eax,  [eax + 16]
   3226     movdqa     xmm2, xmm0
   3227     punpcklbw  xmm0, xmm1  // first 8 UV pairs
   3228     punpckhbw  xmm2, xmm1  // next 8 UV pairs
   3229     movdqu     [edi], xmm0
   3230     movdqu     [edi + 16], xmm2
   3231     lea        edi, [edi + 32]
   3232     sub        ecx, 16
   3233     jg         convertloop
   3234 
   3235     pop        edi
   3236     ret
   3237   }
   3238 }
   3239 #endif  //  HAS_MERGEUVROW_SSE2
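
// Scalar sketch (illustrative only; hypothetical helper, not a libyuv API):
// MergeUV is the inverse interleave, done above with punpcklbw/punpckhbw.
static void MergeUVRow_Reference(const uint8* src_u,
                                 const uint8* src_v,
                                 uint8* dst_uv,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = src_u[x];
    dst_uv[x * 2 + 1] = src_v[x];
  }
}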
   3240 
   3241 #ifdef HAS_MERGEUVROW_AVX2
   3242 __declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
   3243                                        const uint8* src_v,
   3244                                        uint8* dst_uv,
   3245                                        int width) {
   3246   __asm {
   3247     push       edi
   3248     mov        eax, [esp + 4 + 4]  // src_u
   3249     mov        edx, [esp + 4 + 8]  // src_v
   3250     mov        edi, [esp + 4 + 12]  // dst_uv
   3251     mov        ecx, [esp + 4 + 16]  // width
   3252     sub        edx, eax
   3253 
   3254   convertloop:
   3255     vmovdqu    ymm0, [eax]  // read 32 U's
   3256     vmovdqu    ymm1, [eax + edx]  // and 32 V's
   3257     lea        eax,  [eax + 32]
   3258     vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
   3259     vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
   3260     vextractf128 [edi], ymm2, 0  // bytes 0..15
   3261     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
   3262     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
    3263     vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
   3264     lea        edi, [edi + 64]
   3265     sub        ecx, 32
   3266     jg         convertloop
   3267 
   3268     pop        edi
   3269     vzeroupper
   3270     ret
   3271   }
   3272 }
   3273 #endif  //  HAS_MERGEUVROW_AVX2
   3274 
   3275 #ifdef HAS_COPYROW_SSE2
    3276 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
   3277 __declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   3278   __asm {
   3279     mov        eax, [esp + 4]  // src
   3280     mov        edx, [esp + 8]  // dst
   3281     mov        ecx, [esp + 12]  // count
    3282     test       eax, 15  // take the unaligned loop if src
    3283     jne        convertloopu  // or dst is not 16 byte aligned.
    3284     test       edx, 15
    3285     jne        convertloopu
   3286 
   3287   convertloopa:
   3288     movdqa     xmm0, [eax]
   3289     movdqa     xmm1, [eax + 16]
   3290     lea        eax, [eax + 32]
   3291     movdqa     [edx], xmm0
   3292     movdqa     [edx + 16], xmm1
   3293     lea        edx, [edx + 32]
   3294     sub        ecx, 32
   3295     jg         convertloopa
   3296     ret
   3297 
   3298   convertloopu:
   3299     movdqu     xmm0, [eax]
   3300     movdqu     xmm1, [eax + 16]
   3301     lea        eax, [eax + 32]
   3302     movdqu     [edx], xmm0
   3303     movdqu     [edx + 16], xmm1
   3304     lea        edx, [edx + 32]
   3305     sub        ecx, 32
   3306     jg         convertloopu
   3307     ret
   3308   }
   3309 }
   3310 #endif  // HAS_COPYROW_SSE2
   3311 
   3312 #ifdef HAS_COPYROW_AVX
    3313 // CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
   3314 __declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
   3315   __asm {
   3316     mov        eax, [esp + 4]  // src
   3317     mov        edx, [esp + 8]  // dst
   3318     mov        ecx, [esp + 12]  // count
   3319 
   3320   convertloop:
   3321     vmovdqu    ymm0, [eax]
   3322     vmovdqu    ymm1, [eax + 32]
   3323     lea        eax, [eax + 64]
   3324     vmovdqu    [edx], ymm0
   3325     vmovdqu    [edx + 32], ymm1
   3326     lea        edx, [edx + 64]
   3327     sub        ecx, 64
   3328     jg         convertloop
   3329 
   3330     vzeroupper
   3331     ret
   3332   }
   3333 }
   3334 #endif  // HAS_COPYROW_AVX
   3335 
    3336 // CopyRow copies 'count' bytes using rep movsb; any count (multiple of 1) works.
   3337 __declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   3338   __asm {
    3339     mov        eax, esi  // save esi
    3340     mov        edx, edi  // save edi
   3341     mov        esi, [esp + 4]  // src
   3342     mov        edi, [esp + 8]  // dst
   3343     mov        ecx, [esp + 12]  // count
   3344     rep movsb
   3345     mov        edi, edx
   3346     mov        esi, eax
   3347     ret
   3348   }
   3349 }
   3350 
   3351 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
   3352 // width in pixels
   3353 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
   3354                                              uint8* dst,
   3355                                              int width) {
   3356   __asm {
   3357     mov        eax, [esp + 4]  // src
   3358     mov        edx, [esp + 8]  // dst
    3359     mov        ecx, [esp + 12]  // width
   3360     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
   3361     pslld      xmm0, 24
   3362     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
   3363     psrld      xmm1, 8
   3364 
   3365   convertloop:
   3366     movdqu     xmm2, [eax]
   3367     movdqu     xmm3, [eax + 16]
   3368     lea        eax, [eax + 32]
   3369     movdqu     xmm4, [edx]
   3370     movdqu     xmm5, [edx + 16]
   3371     pand       xmm2, xmm0
   3372     pand       xmm3, xmm0
   3373     pand       xmm4, xmm1
   3374     pand       xmm5, xmm1
   3375     por        xmm2, xmm4
   3376     por        xmm3, xmm5
   3377     movdqu     [edx], xmm2
   3378     movdqu     [edx + 16], xmm3
   3379     lea        edx, [edx + 32]
   3380     sub        ecx, 8
   3381     jg         convertloop
   3382 
   3383     ret
   3384   }
   3385 }
   3386 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
   3387 
   3388 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
   3389 // width in pixels
   3390 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
   3391                                              uint8* dst,
   3392                                              int width) {
   3393   __asm {
   3394     mov        eax, [esp + 4]  // src
   3395     mov        edx, [esp + 8]  // dst
    3396     mov        ecx, [esp + 12]  // width
   3397     vpcmpeqb   ymm0, ymm0, ymm0
   3398     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
   3399 
   3400   convertloop:
   3401     vmovdqu    ymm1, [eax]
   3402     vmovdqu    ymm2, [eax + 32]
   3403     lea        eax, [eax + 64]
   3404     vpblendvb  ymm1, ymm1, [edx], ymm0
   3405     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3406     vmovdqu    [edx], ymm1
   3407     vmovdqu    [edx + 32], ymm2
   3408     lea        edx, [edx + 64]
   3409     sub        ecx, 16
   3410     jg         convertloop
   3411 
   3412     vzeroupper
   3413     ret
   3414   }
   3415 }
   3416 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
   3417 
   3418 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
   3419 // width in pixels
   3420 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
   3421                                                 uint8* dst_a,
   3422                                                 int width) {
   3423   __asm {
   3424     mov        eax, [esp + 4]  // src_argb
   3425     mov        edx, [esp + 8]  // dst_a
   3426     mov        ecx, [esp + 12]  // width
   3427 
   3428   extractloop:
   3429     movdqu     xmm0, [eax]
   3430     movdqu     xmm1, [eax + 16]
   3431     lea        eax, [eax + 32]
   3432     psrld      xmm0, 24
   3433     psrld      xmm1, 24
   3434     packssdw   xmm0, xmm1
   3435     packuswb   xmm0, xmm0
   3436     movq       qword ptr [edx], xmm0
   3437     lea        edx, [edx + 8]
   3438     sub        ecx, 8
   3439     jg         extractloop
   3440 
   3441     ret
   3442   }
   3443 }
   3444 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
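
// Scalar sketch (illustrative only; hypothetical helper, not a libyuv API):
// extracting alpha is taking byte 3 of each little-endian BGRA pixel, which
// the psrld 24 / pack sequence above vectorizes.
static void ARGBExtractAlphaRow_Reference(const uint8* src_argb,
                                          uint8* dst_a,
                                          int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_a[x] = src_argb[x * 4 + 3];
  }
}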
   3445 
   3446 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
   3447 // width in pixels
   3448 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
   3449                                                 uint8* dst_a,
   3450                                                 int width) {
   3451   __asm {
   3452     mov        eax, [esp + 4]  // src_argb
   3453     mov        edx, [esp + 8]  // dst_a
   3454     mov        ecx, [esp + 12]  // width
   3455     vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
   3456 
   3457   extractloop:
   3458     vmovdqu    ymm0, [eax]
   3459     vmovdqu    ymm1, [eax + 32]
   3460     vpsrld     ymm0, ymm0, 24
   3461     vpsrld     ymm1, ymm1, 24
   3462     vmovdqu    ymm2, [eax + 64]
   3463     vmovdqu    ymm3, [eax + 96]
   3464     lea        eax, [eax + 128]
   3465     vpackssdw  ymm0, ymm0, ymm1  // mutates
   3466     vpsrld     ymm2, ymm2, 24
   3467     vpsrld     ymm3, ymm3, 24
   3468     vpackssdw  ymm2, ymm2, ymm3  // mutates
   3469     vpackuswb  ymm0, ymm0, ymm2  // mutates
   3470     vpermd     ymm0, ymm4, ymm0  // unmutate
   3471     vmovdqu    [edx], ymm0
   3472     lea        edx, [edx + 32]
   3473     sub        ecx, 32
   3474     jg         extractloop
   3475 
   3476     vzeroupper
   3477     ret
   3478   }
   3479 }
   3480 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
   3481 
   3482 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
   3483 // width in pixels
   3484 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
   3485                                                 uint8* dst,
   3486                                                 int width) {
   3487   __asm {
   3488     mov        eax, [esp + 4]  // src
   3489     mov        edx, [esp + 8]  // dst
    3490     mov        ecx, [esp + 12]  // width
   3491     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
   3492     pslld      xmm0, 24
   3493     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
   3494     psrld      xmm1, 8
   3495 
   3496   convertloop:
   3497     movq       xmm2, qword ptr [eax]  // 8 Y's
   3498     lea        eax, [eax + 8]
    3499     punpcklbw  xmm2, xmm2  // 8 Y's -> 8 YY words
    3500     punpckhwd  xmm3, xmm2  // stale xmm3 low words are masked off below
    3501     punpcklwd  xmm2, xmm2
   3502     movdqu     xmm4, [edx]
   3503     movdqu     xmm5, [edx + 16]
   3504     pand       xmm2, xmm0
   3505     pand       xmm3, xmm0
   3506     pand       xmm4, xmm1
   3507     pand       xmm5, xmm1
   3508     por        xmm2, xmm4
   3509     por        xmm3, xmm5
   3510     movdqu     [edx], xmm2
   3511     movdqu     [edx + 16], xmm3
   3512     lea        edx, [edx + 32]
   3513     sub        ecx, 8
   3514     jg         convertloop
   3515 
   3516     ret
   3517   }
   3518 }
   3519 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
   3520 
   3521 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
   3522 // width in pixels
   3523 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
   3524                                                 uint8* dst,
   3525                                                 int width) {
   3526   __asm {
   3527     mov        eax, [esp + 4]  // src
   3528     mov        edx, [esp + 8]  // dst
    3529     mov        ecx, [esp + 12]  // width
   3530     vpcmpeqb   ymm0, ymm0, ymm0
   3531     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
   3532 
   3533   convertloop:
   3534     vpmovzxbd  ymm1, qword ptr [eax]
   3535     vpmovzxbd  ymm2, qword ptr [eax + 8]
   3536     lea        eax, [eax + 16]
   3537     vpslld     ymm1, ymm1, 24
   3538     vpslld     ymm2, ymm2, 24
   3539     vpblendvb  ymm1, ymm1, [edx], ymm0
   3540     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3541     vmovdqu    [edx], ymm1
   3542     vmovdqu    [edx + 32], ymm2
   3543     lea        edx, [edx + 64]
   3544     sub        ecx, 16
   3545     jg         convertloop
   3546 
   3547     vzeroupper
   3548     ret
   3549   }
   3550 }
   3551 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
   3552 
   3553 #ifdef HAS_SETROW_X86
   3554 // Write 'count' bytes using an 8 bit value repeated.
   3555 // Count should be multiple of 4.
   3556 __declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
   3557   __asm {
   3558     movzx      eax, byte ptr [esp + 8]  // v8
   3559     mov        edx, 0x01010101  // Duplicate byte to all bytes.
   3560     mul        edx  // overwrites edx with upper part of result.
   3561     mov        edx, edi
   3562     mov        edi, [esp + 4]  // dst
   3563     mov        ecx, [esp + 12]  // count
   3564     shr        ecx, 2
   3565     rep stosd
   3566     mov        edi, edx
   3567     ret
   3568   }
   3569 }
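
// Illustrative note (hypothetical helper, not a libyuv API): multiplying the
// 8 bit value by 0x01010101 replicates it into all four bytes of a dword
// (0xAB becomes 0xABABABAB), letting rep stosd fill four bytes per store.
static __inline uint32 ReplicateByte_Reference(uint8 v8) {
  return (uint32)v8 * 0x01010101u;
}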
   3570 
   3571 // Write 'count' bytes using an 8 bit value repeated.
   3572 __declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
   3573   __asm {
   3574     mov        edx, edi
   3575     mov        edi, [esp + 4]  // dst
   3576     mov        eax, [esp + 8]  // v8
   3577     mov        ecx, [esp + 12]  // count
   3578     rep stosb
   3579     mov        edi, edx
   3580     ret
   3581   }
   3582 }
   3583 
   3584 // Write 'count' 32 bit values.
   3585 __declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
   3586   __asm {
   3587     mov        edx, edi
   3588     mov        edi, [esp + 4]  // dst
   3589     mov        eax, [esp + 8]  // v32
   3590     mov        ecx, [esp + 12]  // count
   3591     rep stosd
   3592     mov        edi, edx
   3593     ret
   3594   }
   3595 }
   3596 #endif  // HAS_SETROW_X86
   3597 
   3598 #ifdef HAS_YUY2TOYROW_AVX2
   3599 __declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
   3600                                        uint8* dst_y,
   3601                                        int width) {
   3602   __asm {
   3603     mov        eax, [esp + 4]  // src_yuy2
   3604     mov        edx, [esp + 8]  // dst_y
   3605     mov        ecx, [esp + 12]  // width
   3606     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3607     vpsrlw     ymm5, ymm5, 8
   3608 
   3609   convertloop:
   3610     vmovdqu    ymm0, [eax]
   3611     vmovdqu    ymm1, [eax + 32]
   3612     lea        eax,  [eax + 64]
   3613     vpand      ymm0, ymm0, ymm5  // even bytes are Y
   3614     vpand      ymm1, ymm1, ymm5
   3615     vpackuswb  ymm0, ymm0, ymm1  // mutates.
   3616     vpermq     ymm0, ymm0, 0xd8
   3617     vmovdqu    [edx], ymm0
   3618     lea        edx, [edx + 32]
   3619     sub        ecx, 32
   3620     jg         convertloop
   3621     vzeroupper
   3622     ret
   3623   }
   3624 }
   3625 
   3626 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
   3627                                         int stride_yuy2,
   3628                                         uint8* dst_u,
   3629                                         uint8* dst_v,
   3630                                         int width) {
   3631   __asm {
   3632     push       esi
   3633     push       edi
   3634     mov        eax, [esp + 8 + 4]  // src_yuy2
   3635     mov        esi, [esp + 8 + 8]  // stride_yuy2
   3636     mov        edx, [esp + 8 + 12]  // dst_u
   3637     mov        edi, [esp + 8 + 16]  // dst_v
   3638     mov        ecx, [esp + 8 + 20]  // width
   3639     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3640     vpsrlw     ymm5, ymm5, 8
   3641     sub        edi, edx
   3642 
   3643   convertloop:
   3644     vmovdqu    ymm0, [eax]
   3645     vmovdqu    ymm1, [eax + 32]
   3646     vpavgb     ymm0, ymm0, [eax + esi]
   3647     vpavgb     ymm1, ymm1, [eax + esi + 32]
   3648     lea        eax,  [eax + 64]
   3649     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
   3650     vpsrlw     ymm1, ymm1, 8
   3651     vpackuswb  ymm0, ymm0, ymm1  // mutates.
   3652     vpermq     ymm0, ymm0, 0xd8
   3653     vpand      ymm1, ymm0, ymm5  // U
   3654     vpsrlw     ymm0, ymm0, 8  // V
   3655     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3656     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3657     vpermq     ymm1, ymm1, 0xd8
   3658     vpermq     ymm0, ymm0, 0xd8
   3659     vextractf128 [edx], ymm1, 0  // U
   3660     vextractf128 [edx + edi], ymm0, 0  // V
   3661     lea        edx, [edx + 16]
   3662     sub        ecx, 32
   3663     jg         convertloop
   3664 
   3665     pop        edi
   3666     pop        esi
   3667     vzeroupper
   3668     ret
   3669   }
   3670 }
   3671 
   3672 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
   3673                                            uint8* dst_u,
   3674                                            uint8* dst_v,
   3675                                            int width) {
   3676   __asm {
   3677     push       edi
   3678     mov        eax, [esp + 4 + 4]  // src_yuy2
   3679     mov        edx, [esp + 4 + 8]  // dst_u
   3680     mov        edi, [esp + 4 + 12]  // dst_v
   3681     mov        ecx, [esp + 4 + 16]  // width
   3682     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3683     vpsrlw     ymm5, ymm5, 8
   3684     sub        edi, edx
   3685 
   3686   convertloop:
   3687     vmovdqu    ymm0, [eax]
   3688     vmovdqu    ymm1, [eax + 32]
   3689     lea        eax,  [eax + 64]
   3690     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
   3691     vpsrlw     ymm1, ymm1, 8
   3692     vpackuswb  ymm0, ymm0, ymm1  // mutates.
   3693     vpermq     ymm0, ymm0, 0xd8
   3694     vpand      ymm1, ymm0, ymm5  // U
   3695     vpsrlw     ymm0, ymm0, 8  // V
   3696     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3697     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3698     vpermq     ymm1, ymm1, 0xd8
   3699     vpermq     ymm0, ymm0, 0xd8
   3700     vextractf128 [edx], ymm1, 0  // U
   3701     vextractf128 [edx + edi], ymm0, 0  // V
   3702     lea        edx, [edx + 16]
   3703     sub        ecx, 32
   3704     jg         convertloop
   3705 
   3706     pop        edi
   3707     vzeroupper
   3708     ret
   3709   }
   3710 }
   3711 
   3712 __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
   3713                                        uint8* dst_y,
   3714                                        int width) {
   3715   __asm {
   3716     mov        eax, [esp + 4]  // src_uyvy
   3717     mov        edx, [esp + 8]  // dst_y
   3718     mov        ecx, [esp + 12]  // width
   3719 
   3720   convertloop:
   3721     vmovdqu    ymm0, [eax]
   3722     vmovdqu    ymm1, [eax + 32]
   3723     lea        eax,  [eax + 64]
   3724     vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
   3725     vpsrlw     ymm1, ymm1, 8
   3726     vpackuswb  ymm0, ymm0, ymm1  // mutates.
   3727     vpermq     ymm0, ymm0, 0xd8
   3728     vmovdqu    [edx], ymm0
   3729     lea        edx, [edx + 32]
   3730     sub        ecx, 32
   3731     jg         convertloop
   3732     vzeroupper
   3733     ret
   3734   }
   3735 }
   3736 
   3737 __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
   3738                                         int stride_uyvy,
   3739                                         uint8* dst_u,
   3740                                         uint8* dst_v,
   3741                                         int width) {
   3742   __asm {
   3743     push       esi
   3744     push       edi
    3745     mov        eax, [esp + 8 + 4]  // src_uyvy
    3746     mov        esi, [esp + 8 + 8]  // stride_uyvy
   3747     mov        edx, [esp + 8 + 12]  // dst_u
   3748     mov        edi, [esp + 8 + 16]  // dst_v
   3749     mov        ecx, [esp + 8 + 20]  // width
   3750     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3751     vpsrlw     ymm5, ymm5, 8
   3752     sub        edi, edx
   3753 
   3754   convertloop:
   3755     vmovdqu    ymm0, [eax]
   3756     vmovdqu    ymm1, [eax + 32]
   3757     vpavgb     ymm0, ymm0, [eax + esi]
   3758     vpavgb     ymm1, ymm1, [eax + esi + 32]
   3759     lea        eax,  [eax + 64]
   3760     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
   3761     vpand      ymm1, ymm1, ymm5
   3762     vpackuswb  ymm0, ymm0, ymm1  // mutates.
   3763     vpermq     ymm0, ymm0, 0xd8
   3764     vpand      ymm1, ymm0, ymm5  // U
   3765     vpsrlw     ymm0, ymm0, 8  // V
   3766     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3767     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3768     vpermq     ymm1, ymm1, 0xd8
   3769     vpermq     ymm0, ymm0, 0xd8
   3770     vextractf128 [edx], ymm1, 0  // U
   3771     vextractf128 [edx + edi], ymm0, 0  // V
   3772     lea        edx, [edx + 16]
   3773     sub        ecx, 32
   3774     jg         convertloop
   3775 
   3776     pop        edi
   3777     pop        esi
   3778     vzeroupper
   3779     ret
   3780   }
   3781 }
   3782 
   3783 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
   3784                                            uint8* dst_u,
   3785                                            uint8* dst_v,
   3786                                            int width) {
   3787   __asm {
   3788     push       edi
    3789     mov        eax, [esp + 4 + 4]  // src_uyvy
   3790     mov        edx, [esp + 4 + 8]  // dst_u
   3791     mov        edi, [esp + 4 + 12]  // dst_v
   3792     mov        ecx, [esp + 4 + 16]  // width
   3793     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3794     vpsrlw     ymm5, ymm5, 8
   3795     sub        edi, edx
   3796 
   3797   convertloop:
   3798     vmovdqu    ymm0, [eax]
   3799     vmovdqu    ymm1, [eax + 32]
   3800     lea        eax,  [eax + 64]
   3801     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
   3802     vpand      ymm1, ymm1, ymm5
   3803     vpackuswb  ymm0, ymm0, ymm1  // mutates.
   3804     vpermq     ymm0, ymm0, 0xd8
   3805     vpand      ymm1, ymm0, ymm5  // U
   3806     vpsrlw     ymm0, ymm0, 8  // V
   3807     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3808     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3809     vpermq     ymm1, ymm1, 0xd8
   3810     vpermq     ymm0, ymm0, 0xd8
   3811     vextractf128 [edx], ymm1, 0  // U
   3812     vextractf128 [edx + edi], ymm0, 0  // V
   3813     lea        edx, [edx + 16]
   3814     sub        ecx, 32
   3815     jg         convertloop
   3816 
   3817     pop        edi
   3818     vzeroupper
   3819     ret
   3820   }
   3821 }
   3822 #endif  // HAS_YUY2TOYROW_AVX2
   3823 
   3824 #ifdef HAS_YUY2TOYROW_SSE2
   3825 __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   3826                                        uint8* dst_y,
   3827                                        int width) {
   3828   __asm {
   3829     mov        eax, [esp + 4]  // src_yuy2
   3830     mov        edx, [esp + 8]  // dst_y
   3831     mov        ecx, [esp + 12]  // width
   3832     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
   3833     psrlw      xmm5, 8
   3834 
   3835   convertloop:
   3836     movdqu     xmm0, [eax]
   3837     movdqu     xmm1, [eax + 16]
   3838     lea        eax,  [eax + 32]
   3839     pand       xmm0, xmm5  // even bytes are Y
   3840     pand       xmm1, xmm5
   3841     packuswb   xmm0, xmm1
   3842     movdqu     [edx], xmm0
   3843     lea        edx, [edx + 16]
   3844     sub        ecx, 16
   3845     jg         convertloop
   3846     ret
   3847   }
   3848 }
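
// Scalar sketch (illustrative only; hypothetical helper, not a libyuv API):
// YUY2 packs pixel pairs as Y0 U Y1 V, so luma is the even bytes, which the
// 0x00ff00ff mask above selects before packing.
static void YUY2ToYRow_Reference(const uint8* src_yuy2,
                                 uint8* dst_y,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[x * 2];
  }
}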
   3849 
   3850 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
   3851                                         int stride_yuy2,
   3852                                         uint8* dst_u,
   3853                                         uint8* dst_v,
   3854                                         int width) {
   3855   __asm {
   3856     push       esi
   3857     push       edi
   3858     mov        eax, [esp + 8 + 4]  // src_yuy2
   3859     mov        esi, [esp + 8 + 8]  // stride_yuy2
   3860     mov        edx, [esp + 8 + 12]  // dst_u
   3861     mov        edi, [esp + 8 + 16]  // dst_v
   3862     mov        ecx, [esp + 8 + 20]  // width
   3863     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
   3864     psrlw      xmm5, 8
   3865     sub        edi, edx
   3866 
   3867   convertloop:
   3868     movdqu     xmm0, [eax]
   3869     movdqu     xmm1, [eax + 16]
   3870     movdqu     xmm2, [eax + esi]
   3871     movdqu     xmm3, [eax + esi + 16]
   3872     lea        eax,  [eax + 32]
   3873     pavgb      xmm0, xmm2
   3874     pavgb      xmm1, xmm3
   3875     psrlw      xmm0, 8  // YUYV -> UVUV
   3876     psrlw      xmm1, 8
   3877     packuswb   xmm0, xmm1
   3878     movdqa     xmm1, xmm0
   3879     pand       xmm0, xmm5  // U
   3880     packuswb   xmm0, xmm0
   3881     psrlw      xmm1, 8  // V
   3882     packuswb   xmm1, xmm1
   3883     movq       qword ptr [edx], xmm0
   3884     movq       qword ptr [edx + edi], xmm1
   3885     lea        edx, [edx + 8]
   3886     sub        ecx, 16
   3887     jg         convertloop
   3888 
   3889     pop        edi
   3890     pop        esi
   3891     ret
   3892   }
   3893 }
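
// Scalar sketch (illustrative only; hypothetical helper, not a libyuv API):
// the UV row averages chroma from two source rows with pavgb rounding,
// i.e. (a + b + 1) / 2, then splits U and V.
static void YUY2ToUVRow_Reference(const uint8* src_yuy2,
                                  int stride_yuy2,
                                  uint8* dst_u,
                                  uint8* dst_v,
                                  int width) {
  int x;
  const uint8* next = src_yuy2 + stride_yuy2;
  for (x = 0; x < width; x += 2) {
    dst_u[x / 2] = (uint8)((src_yuy2[x * 2 + 1] + next[x * 2 + 1] + 1) >> 1);
    dst_v[x / 2] = (uint8)((src_yuy2[x * 2 + 3] + next[x * 2 + 3] + 1) >> 1);
  }
}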
   3894 
   3895 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   3896                                            uint8* dst_u,
   3897                                            uint8* dst_v,
   3898                                            int width) {
   3899   __asm {
   3900     push       edi
   3901     mov        eax, [esp + 4 + 4]  // src_yuy2
   3902     mov        edx, [esp + 4 + 8]  // dst_u
   3903     mov        edi, [esp + 4 + 12]  // dst_v
   3904     mov        ecx, [esp + 4 + 16]  // width
   3905     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
   3906     psrlw      xmm5, 8
   3907     sub        edi, edx
   3908 
   3909   convertloop:
   3910     movdqu     xmm0, [eax]
   3911     movdqu     xmm1, [eax + 16]
   3912     lea        eax,  [eax + 32]
   3913     psrlw      xmm0, 8  // YUYV -> UVUV
   3914     psrlw      xmm1, 8
   3915     packuswb   xmm0, xmm1
   3916     movdqa     xmm1, xmm0
   3917     pand       xmm0, xmm5  // U
   3918     packuswb   xmm0, xmm0
   3919     psrlw      xmm1, 8  // V
   3920     packuswb   xmm1, xmm1
   3921     movq       qword ptr [edx], xmm0
   3922     movq       qword ptr [edx + edi], xmm1
   3923     lea        edx, [edx + 8]
   3924     sub        ecx, 16
   3925     jg         convertloop
   3926 
   3927     pop        edi
   3928     ret
   3929   }
   3930 }
   3931 
   3932 __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
   3933                                        uint8* dst_y,
   3934                                        int width) {
   3935   __asm {
   3936     mov        eax, [esp + 4]  // src_uyvy
   3937     mov        edx, [esp + 8]  // dst_y
   3938     mov        ecx, [esp + 12]  // width
   3939 
   3940   convertloop:
   3941     movdqu     xmm0, [eax]
   3942     movdqu     xmm1, [eax + 16]
   3943     lea        eax,  [eax + 32]
   3944     psrlw      xmm0, 8  // odd bytes are Y
   3945     psrlw      xmm1, 8
   3946     packuswb   xmm0, xmm1
   3947     movdqu     [edx], xmm0
   3948     lea        edx, [edx + 16]
   3949     sub        ecx, 16
   3950     jg         convertloop
   3951     ret
   3952   }
   3953 }
   3954 
   3955 __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
   3956                                         int stride_uyvy,
   3957                                         uint8* dst_u,
   3958                                         uint8* dst_v,
   3959                                         int width) {
   3960   __asm {
   3961     push       esi
   3962     push       edi
    3963     mov        eax, [esp + 8 + 4]  // src_uyvy
    3964     mov        esi, [esp + 8 + 8]  // stride_uyvy
   3965     mov        edx, [esp + 8 + 12]  // dst_u
   3966     mov        edi, [esp + 8 + 16]  // dst_v
   3967     mov        ecx, [esp + 8 + 20]  // width
   3968     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
   3969     psrlw      xmm5, 8
   3970     sub        edi, edx
   3971 
   3972   convertloop:
   3973     movdqu     xmm0, [eax]
   3974     movdqu     xmm1, [eax + 16]
   3975     movdqu     xmm2, [eax + esi]
   3976     movdqu     xmm3, [eax + esi + 16]
   3977     lea        eax,  [eax + 32]
   3978     pavgb      xmm0, xmm2
   3979     pavgb      xmm1, xmm3
   3980     pand       xmm0, xmm5  // UYVY -> UVUV
   3981     pand       xmm1, xmm5
   3982     packuswb   xmm0, xmm1
   3983     movdqa     xmm1, xmm0
   3984     pand       xmm0, xmm5  // U
   3985     packuswb   xmm0, xmm0
   3986     psrlw      xmm1, 8  // V
   3987     packuswb   xmm1, xmm1
   3988     movq       qword ptr [edx], xmm0
   3989     movq       qword ptr [edx + edi], xmm1
   3990     lea        edx, [edx + 8]
   3991     sub        ecx, 16
   3992     jg         convertloop
   3993 
   3994     pop        edi
   3995     pop        esi
   3996     ret
   3997   }
   3998 }
   3999 
   4000 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
   4001                                            uint8* dst_u,
   4002                                            uint8* dst_v,
   4003                                            int width) {
   4004   __asm {
   4005     push       edi
    4006     mov        eax, [esp + 4 + 4]  // src_uyvy
   4007     mov        edx, [esp + 4 + 8]  // dst_u
   4008     mov        edi, [esp + 4 + 12]  // dst_v
   4009     mov        ecx, [esp + 4 + 16]  // width
   4010     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
   4011     psrlw      xmm5, 8
   4012     sub        edi, edx
   4013 
   4014   convertloop:
   4015     movdqu     xmm0, [eax]
   4016     movdqu     xmm1, [eax + 16]
   4017     lea        eax,  [eax + 32]
   4018     pand       xmm0, xmm5  // UYVY -> UVUV
   4019     pand       xmm1, xmm5
   4020     packuswb   xmm0, xmm1
   4021     movdqa     xmm1, xmm0
   4022     pand       xmm0, xmm5  // U
   4023     packuswb   xmm0, xmm0
   4024     psrlw      xmm1, 8  // V
   4025     packuswb   xmm1, xmm1
   4026     movq       qword ptr [edx], xmm0
   4027     movq       qword ptr [edx + edi], xmm1
   4028     lea        edx, [edx + 8]
   4029     sub        ecx, 16
   4030     jg         convertloop
   4031 
   4032     pop        edi
   4033     ret
   4034   }
   4035 }
   4036 #endif  // HAS_YUY2TOYROW_SSE2
   4037 
   4038 #ifdef HAS_BLENDPLANEROW_SSSE3
   4039 // Blend 8 pixels at a time.
   4040 // unsigned version of math
   4041 // =((A2*C2)+(B2*(255-C2))+255)/256
   4042 // signed version of math
   4043 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
   4044 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
   4045                                            const uint8* src1,
   4046                                            const uint8* alpha,
   4047                                            uint8* dst,
   4048                                            int width) {
   4049   __asm {
   4050     push       esi
   4051     push       edi
   4052     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
   4053     psllw      xmm5, 8
   4054     mov        eax, 0x80808080  // 128 for biasing image to signed.
   4055     movd       xmm6, eax
   4056     pshufd     xmm6, xmm6, 0x00
   4057 
   4058     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
   4059     movd       xmm7, eax
   4060     pshufd     xmm7, xmm7, 0x00
   4061     mov        eax, [esp + 8 + 4]  // src0
   4062     mov        edx, [esp + 8 + 8]  // src1
   4063     mov        esi, [esp + 8 + 12]  // alpha
   4064     mov        edi, [esp + 8 + 16]  // dst
   4065     mov        ecx, [esp + 8 + 20]  // width
   4066     sub        eax, esi
   4067     sub        edx, esi
   4068     sub        edi, esi
   4069 
   4070     // 8 pixel loop.
   4071   convertloop8:
   4072     movq       xmm0, qword ptr [esi]  // alpha
   4073     punpcklbw  xmm0, xmm0
   4074     pxor       xmm0, xmm5  // a, 255-a
   4075     movq       xmm1, qword ptr [eax + esi]  // src0
   4076     movq       xmm2, qword ptr [edx + esi]  // src1
   4077     punpcklbw  xmm1, xmm2
   4078     psubb      xmm1, xmm6  // bias src0/1 - 128
   4079     pmaddubsw  xmm0, xmm1
   4080     paddw      xmm0, xmm7  // unbias result - 32768 and round.
   4081     psrlw      xmm0, 8
   4082     packuswb   xmm0, xmm0
   4083     movq       qword ptr [edi + esi], xmm0
   4084     lea        esi, [esi + 8]
   4085     sub        ecx, 8
   4086     jg         convertloop8
   4087 
   4088     pop        edi
   4089     pop        esi
   4090     ret
   4091   }
   4092 }
   4093 #endif  // HAS_BLENDPLANEROW_SSSE3
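
// Scalar sketch of the unsigned blend formula in the comments above
// (illustrative only; hypothetical helper, not a libyuv API):
// dst = (a * c + b * (255 - c) + 255) / 256. The SIMD code biases to signed
// so a single pmaddubsw can do both multiplies at once.
static __inline uint8 BlendPixel_Reference(uint8 a, uint8 b, uint8 c) {
  return (uint8)(((uint32)a * c + (uint32)b * (255 - c) + 255) >> 8);
}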
   4094 
   4095 #ifdef HAS_BLENDPLANEROW_AVX2
   4096 // Blend 32 pixels at a time.
   4097 // unsigned version of math
   4098 // =((A2*C2)+(B2*(255-C2))+255)/256
   4099 // signed version of math
   4100 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
   4101 __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
   4102                                           const uint8* src1,
   4103                                           const uint8* alpha,
   4104                                           uint8* dst,
   4105                                           int width) {
   4106   __asm {
   4107     push        esi
   4108     push        edi
   4109     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
   4110     vpsllw      ymm5, ymm5, 8
   4111     mov         eax, 0x80808080  // 128 for biasing image to signed.
   4112     vmovd       xmm6, eax
   4113     vbroadcastss ymm6, xmm6
   4114     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
   4115     vmovd       xmm7, eax
   4116     vbroadcastss ymm7, xmm7
   4117     mov         eax, [esp + 8 + 4]  // src0
   4118     mov         edx, [esp + 8 + 8]  // src1
   4119     mov         esi, [esp + 8 + 12]  // alpha
   4120     mov         edi, [esp + 8 + 16]  // dst
   4121     mov         ecx, [esp + 8 + 20]  // width
   4122     sub         eax, esi
   4123     sub         edx, esi
   4124     sub         edi, esi
   4125 
   4126     // 32 pixel loop.
   4127   convertloop32:
   4128     vmovdqu     ymm0, [esi]  // alpha
   4129     vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
   4130     vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
   4131     vpxor       ymm3, ymm3, ymm5  // a, 255-a
   4132     vpxor       ymm0, ymm0, ymm5  // a, 255-a
   4133     vmovdqu     ymm1, [eax + esi]  // src0
   4134     vmovdqu     ymm2, [edx + esi]  // src1
   4135     vpunpckhbw  ymm4, ymm1, ymm2
   4136     vpunpcklbw  ymm1, ymm1, ymm2
   4137     vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
   4138     vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
   4139     vpmaddubsw  ymm3, ymm3, ymm4
   4140     vpmaddubsw  ymm0, ymm0, ymm1
   4141     vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
   4142     vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
   4143     vpsrlw      ymm3, ymm3, 8
   4144     vpsrlw      ymm0, ymm0, 8
   4145     vpackuswb   ymm0, ymm0, ymm3
   4146     vmovdqu     [edi + esi], ymm0
   4147     lea         esi, [esi + 32]
   4148     sub         ecx, 32
   4149     jg          convertloop32
   4150 
   4151     pop         edi
   4152     pop         esi
   4153     vzeroupper
   4154     ret
   4155   }
   4156 }
   4157 #endif  // HAS_BLENDPLANEROW_AVX2
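
// A minimal scalar C sketch of the plane blend above (illustrative only, not
// part of libyuv; the name is hypothetical). It computes the unsigned form
// ((a * c) + (b * (255 - c)) + 255) / 256 directly; the SIMD versions reach
// the same result through the signed-bias trick described in the comments.
// Note the asm also subtracts the alpha pointer from src0/src1/dst so one
// incrementing register (esi) indexes all four buffers.
static void BlendPlaneRow_Sketch_C(const uint8* src0,
                                   const uint8* src1,
                                   const uint8* alpha,
                                   uint8* dst,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = alpha[x];
    // Weighted sum of the two sources, rounded up, scaled back to 8 bits.
    dst[x] = (uint8)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}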
   4158 
   4159 #ifdef HAS_ARGBBLENDROW_SSSE3
   4160 // Shuffle table for isolating alpha.
   4161 static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
   4162                                     11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
   4163 
   4164 // Blend 8 pixels at a time.
   4165 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
   4166                                           const uint8* src_argb1,
   4167                                           uint8* dst_argb,
   4168                                           int width) {
   4169   __asm {
   4170     push       esi
   4171     mov        eax, [esp + 4 + 4]  // src_argb0
   4172     mov        esi, [esp + 4 + 8]  // src_argb1
   4173     mov        edx, [esp + 4 + 12]  // dst_argb
   4174     mov        ecx, [esp + 4 + 16]  // width
   4175     pcmpeqb    xmm7, xmm7  // generate constant 0x0001
   4176     psrlw      xmm7, 15
   4177     pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
   4178     psrlw      xmm6, 8
   4179     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
   4180     psllw      xmm5, 8
   4181     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
   4182     pslld      xmm4, 24
   4183     sub        ecx, 4
   4184     jl         convertloop4b  // less than 4 pixels?
   4185 
   4186     // 4 pixel loop.
   4187   convertloop4:
   4188     movdqu     xmm3, [eax]  // src argb
   4189     lea        eax, [eax + 16]
   4190     movdqa     xmm0, xmm3  // src argb
   4191     pxor       xmm3, xmm4  // ~alpha
   4192     movdqu     xmm2, [esi]  // _r_b
   4193     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
   4194     pand       xmm2, xmm6  // _r_b
   4195     paddw      xmm3, xmm7  // 256 - alpha
   4196     pmullw     xmm2, xmm3  // _r_b * alpha
   4197     movdqu     xmm1, [esi]  // _a_g
   4198     lea        esi, [esi + 16]
   4199     psrlw      xmm1, 8  // _a_g
   4200     por        xmm0, xmm4  // set alpha to 255
   4201     pmullw     xmm1, xmm3  // _a_g * alpha
   4202     psrlw      xmm2, 8  // _r_b convert to 8 bits again
   4203     paddusb    xmm0, xmm2  // + src argb
   4204     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
   4205     paddusb    xmm0, xmm1  // + src argb
   4206     movdqu     [edx], xmm0
   4207     lea        edx, [edx + 16]
   4208     sub        ecx, 4
   4209     jge        convertloop4
   4210 
   4211   convertloop4b:
   4212     add        ecx, 4 - 1
   4213     jl         convertloop1b
   4214 
   4215     // 1 pixel loop.
   4216   convertloop1:
   4217     movd       xmm3, [eax]  // src argb
   4218     lea        eax, [eax + 4]
   4219     movdqa     xmm0, xmm3  // src argb
   4220     pxor       xmm3, xmm4  // ~alpha
   4221     movd       xmm2, [esi]  // _r_b
   4222     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
   4223     pand       xmm2, xmm6  // _r_b
   4224     paddw      xmm3, xmm7  // 256 - alpha
   4225     pmullw     xmm2, xmm3  // _r_b * alpha
   4226     movd       xmm1, [esi]  // _a_g
   4227     lea        esi, [esi + 4]
   4228     psrlw      xmm1, 8  // _a_g
   4229     por        xmm0, xmm4  // set alpha to 255
   4230     pmullw     xmm1, xmm3  // _a_g * alpha
   4231     psrlw      xmm2, 8  // _r_b convert to 8 bits again
   4232     paddusb    xmm0, xmm2  // + src argb
   4233     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
   4234     paddusb    xmm0, xmm1  // + src argb
   4235     movd       [edx], xmm0
   4236     lea        edx, [edx + 4]
   4237     sub        ecx, 1
   4238     jge        convertloop1
   4239 
   4240   convertloop1b:
   4241     pop        esi
   4242     ret
   4243   }
   4244 }
   4245 #endif  // HAS_ARGBBLENDROW_SSSE3
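
// A minimal scalar C sketch of the "over" composite above (illustrative only;
// the name is hypothetical). Per channel it computes
// dst = src + (dst * (256 - alpha) >> 8) with the result alpha forced to 255,
// which is what the pshufb/paddw/pmullw/paddusb sequence implements.
static void ARGBBlendRow_Sketch_C(const uint8* src_argb0,
                                  const uint8* src_argb1,
                                  uint8* dst_argb,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_argb0[x * 4 + 3];  // foreground alpha
    int i;
    for (i = 0; i < 3; ++i) {  // B, G, R
      int f = src_argb0[x * 4 + i];
      int b = src_argb1[x * 4 + i];
      int v = f + (((256 - a) * b) >> 8);
      dst_argb[x * 4 + i] = (uint8)(v > 255 ? 255 : v);  // paddusb saturates
    }
    dst_argb[x * 4 + 3] = 255;  // result is fully opaque
  }
}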
   4246 
   4247 #ifdef HAS_ARGBATTENUATEROW_SSSE3
   4248 // Shuffle table duplicating alpha.
   4249 static const uvec8 kShuffleAlpha0 = {
   4250     3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
   4251 };
   4252 static const uvec8 kShuffleAlpha1 = {
   4253     11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   4254     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
   4255 };
   4256 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
   4257                                               uint8* dst_argb,
   4258                                               int width) {
   4259   __asm {
   4260     mov        eax, [esp + 4]  // src_argb0
   4261     mov        edx, [esp + 8]  // dst_argb
   4262     mov        ecx, [esp + 12]  // width
   4263     pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
   4264     pslld      xmm3, 24
   4265     movdqa     xmm4, xmmword ptr kShuffleAlpha0
   4266     movdqa     xmm5, xmmword ptr kShuffleAlpha1
   4267 
   4268  convertloop:
   4269     movdqu     xmm0, [eax]  // read 4 pixels
   4270     pshufb     xmm0, xmm4  // isolate first 2 alphas
   4271     movdqu     xmm1, [eax]  // read 4 pixels
   4272     punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
   4273     pmulhuw    xmm0, xmm1  // rgb * a
   4274     movdqu     xmm1, [eax]  // read 4 pixels
   4275     pshufb     xmm1, xmm5  // isolate next 2 alphas
   4276     movdqu     xmm2, [eax]  // read 4 pixels
   4277     punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
   4278     pmulhuw    xmm1, xmm2  // rgb * a
   4279     movdqu     xmm2, [eax]  // mask original alpha
   4280     lea        eax, [eax + 16]
   4281     pand       xmm2, xmm3
   4282     psrlw      xmm0, 8
   4283     psrlw      xmm1, 8
   4284     packuswb   xmm0, xmm1
   4285     por        xmm0, xmm2  // copy original alpha
   4286     movdqu     [edx], xmm0
   4287     lea        edx, [edx + 16]
   4288     sub        ecx, 4
   4289     jg         convertloop
   4290 
   4291     ret
   4292   }
   4293 }
   4294 #endif  // HAS_ARGBATTENUATEROW_SSSE3
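
// A minimal scalar C sketch of the attenuation above (illustrative only; the
// name is hypothetical). punpcklbw v,v duplicates each byte into a 16 bit
// lane (v * 257), the shuffled alpha does the same, and pmulhuw + psrlw 8
// keep bits 24..31 of the product, so the result is
// (f * 257) * (a * 257) >> 24, a close approximation of f * a / 255.
static void ARGBAttenuateRow_Sketch_C(const uint8* src_argb,
                                      uint8* dst_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_argb[x * 4 + 3];
    int i;
    for (i = 0; i < 3; ++i) {  // scale B, G, R by alpha
      int f = src_argb[x * 4 + i];
      dst_argb[x * 4 + i] = (uint8)(((f | (f << 8)) * (a | (a << 8))) >> 24);
    }
    dst_argb[x * 4 + 3] = (uint8)a;  // alpha is copied unchanged
  }
}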
   4295 
   4296 #ifdef HAS_ARGBATTENUATEROW_AVX2
   4297 // Shuffle table duplicating alpha.
   4298 static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
   4299                                          128u, 128u, 14u,  15u, 14u, 15u,
   4300                                          14u,  15u,  128u, 128u};
   4301 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
   4302                                              uint8* dst_argb,
   4303                                              int width) {
   4304   __asm {
   4305     mov        eax, [esp + 4]  // src_argb0
   4306     mov        edx, [esp + 8]  // dst_argb
   4307     mov        ecx, [esp + 12]  // width
   4308     sub        edx, eax
   4309     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
   4310     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
   4311     vpslld     ymm5, ymm5, 24
   4312 
   4313  convertloop:
   4314     vmovdqu    ymm6, [eax]  // read 8 pixels.
   4315     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   4316     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   4317     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
   4318     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
   4319     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
   4320     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
   4321     vpand      ymm6, ymm6, ymm5  // isolate alpha
   4322     vpsrlw     ymm0, ymm0, 8
   4323     vpsrlw     ymm1, ymm1, 8
   4324     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   4325     vpor       ymm0, ymm0, ymm6  // copy original alpha
   4326     vmovdqu    [eax + edx], ymm0
   4327     lea        eax, [eax + 32]
   4328     sub        ecx, 8
   4329     jg         convertloop
   4330 
   4331     vzeroupper
   4332     ret
   4333   }
   4334 }
   4335 #endif  // HAS_ARGBATTENUATEROW_AVX2
   4336 
   4337 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
   4338 // Unattenuate 4 pixels at a time.
   4339 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
   4340                                                uint8* dst_argb,
   4341                                                int width) {
   4342   __asm {
   4343     push       ebx
   4344     push       esi
   4345     push       edi
   4346     mov        eax, [esp + 12 + 4]  // src_argb
   4347     mov        edx, [esp + 12 + 8]  // dst_argb
   4348     mov        ecx, [esp + 12 + 12]  // width
   4349     lea        ebx, fixed_invtbl8
   4350 
   4351  convertloop:
   4352     movdqu     xmm0, [eax]  // read 4 pixels
   4353     movzx      esi, byte ptr [eax + 3]  // first alpha
   4354     movzx      edi, byte ptr [eax + 7]  // second alpha
   4355     punpcklbw  xmm0, xmm0  // first 2
   4356     movd       xmm2, dword ptr [ebx + esi * 4]
   4357     movd       xmm3, dword ptr [ebx + edi * 4]
   4358     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
   4359     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
   4360     movlhps    xmm2, xmm3
    4361     pmulhuw    xmm0, xmm2  // rgb * ia (inverse alpha)
   4362 
   4363     movdqu     xmm1, [eax]  // read 4 pixels
   4364     movzx      esi, byte ptr [eax + 11]  // third alpha
    4365     movzx      edi, byte ptr [eax + 15]  // fourth alpha
   4366     punpckhbw  xmm1, xmm1  // next 2
   4367     movd       xmm2, dword ptr [ebx + esi * 4]
   4368     movd       xmm3, dword ptr [ebx + edi * 4]
   4369     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
   4370     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
   4371     movlhps    xmm2, xmm3
    4372     pmulhuw    xmm1, xmm2  // rgb * ia
   4373     lea        eax, [eax + 16]
   4374     packuswb   xmm0, xmm1
   4375     movdqu     [edx], xmm0
   4376     lea        edx, [edx + 16]
   4377     sub        ecx, 4
   4378     jg         convertloop
   4379 
   4380     pop        edi
   4381     pop        esi
   4382     pop        ebx
   4383     ret
   4384   }
   4385 }
   4386 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
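
// A minimal scalar C sketch of unattenuation (illustrative only; the name is
// hypothetical). This version uses an exact rounded divide; the SIMD code
// avoids division by looking up an 8.8 fixed point reciprocal of alpha in
// fixed_invtbl8 and multiplying with pmulhuw, relying on packuswb to clamp,
// so its results can differ by a small rounding error.
static void ARGBUnattenuateRow_Sketch_C(const uint8* src_argb,
                                        uint8* dst_argb,
                                        int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[x * 4 + 3];
    int i;
    for (i = 0; i < 3; ++i) {  // B, G, R
      uint32 f = src_argb[x * 4 + i];
      uint32 v = a ? (f * 255 + a / 2) / a : f;  // undo earlier * a / 255
      dst_argb[x * 4 + i] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = (uint8)a;  // alpha passes through
  }
}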
   4387 
   4388 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
   4389 // Shuffle table duplicating alpha.
   4390 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
   4391     0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
   4392 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
   4393 // USE_GATHER is not on by default, due to being a slow instruction.
   4394 #ifdef USE_GATHER
   4395 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
   4396                                                uint8* dst_argb,
   4397                                                int width) {
   4398   __asm {
   4399     mov        eax, [esp + 4]  // src_argb0
   4400     mov        edx, [esp + 8]  // dst_argb
   4401     mov        ecx, [esp + 12]  // width
   4402     sub        edx, eax
   4403     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
   4404 
   4405  convertloop:
   4406     vmovdqu    ymm6, [eax]  // read 8 pixels.
   4407     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
   4408     vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
   4409     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   4410     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   4411     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
   4412     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
   4413     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
   4414     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
   4415     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
   4416     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
   4417     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
   4418     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   4419     vmovdqu    [eax + edx], ymm0
   4420     lea        eax, [eax + 32]
   4421     sub        ecx, 8
   4422     jg         convertloop
   4423 
   4424     vzeroupper
   4425     ret
   4426   }
   4427 }
   4428 #else   // USE_GATHER
   4429 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
   4430                                                uint8* dst_argb,
   4431                                                int width) {
   4432   __asm {
   4433 
   4434     push       ebx
   4435     push       esi
   4436     push       edi
   4437     mov        eax, [esp + 12 + 4]  // src_argb
   4438     mov        edx, [esp + 12 + 8]  // dst_argb
   4439     mov        ecx, [esp + 12 + 12]  // width
   4440     sub        edx, eax
   4441     lea        ebx, fixed_invtbl8
   4442     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
   4443 
   4444  convertloop:
   4445         // replace VPGATHER
   4446     movzx      esi, byte ptr [eax + 3]  // alpha0
   4447     movzx      edi, byte ptr [eax + 7]  // alpha1
   4448     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
   4449     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
   4450     movzx      esi, byte ptr [eax + 11]  // alpha2
   4451     movzx      edi, byte ptr [eax + 15]  // alpha3
   4452     vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
   4453     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
   4454     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
   4455     movzx      esi, byte ptr [eax + 19]  // alpha4
   4456     movzx      edi, byte ptr [eax + 23]  // alpha5
   4457     vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
   4458     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
   4459     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
   4460     movzx      esi, byte ptr [eax + 27]  // alpha6
   4461     movzx      edi, byte ptr [eax + 31]  // alpha7
   4462     vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
   4463     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
   4464     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
   4465     vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
   4466     vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
   4467     vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
    4468     vinserti128 ymm3, ymm3, xmm0, 1  // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
   4469     // end of VPGATHER
   4470 
   4471     vmovdqu    ymm6, [eax]  // read 8 pixels.
   4472     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   4473     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   4474     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
   4475     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
   4476     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
   4477     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
   4478     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
   4479     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    4480     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   4481     vmovdqu    [eax + edx], ymm0
   4482     lea        eax, [eax + 32]
   4483     sub        ecx, 8
   4484     jg         convertloop
   4485 
   4486     pop        edi
   4487     pop        esi
   4488     pop        ebx
   4489     vzeroupper
   4490     ret
   4491   }
   4492 }
   4493 #endif  // USE_GATHER
    4494 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
   4495 
   4496 #ifdef HAS_ARGBGRAYROW_SSSE3
    4497 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
   4498 __declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
   4499                                          uint8* dst_argb,
   4500                                          int width) {
   4501   __asm {
   4502     mov        eax, [esp + 4] /* src_argb */
   4503     mov        edx, [esp + 8] /* dst_argb */
   4504     mov        ecx, [esp + 12] /* width */
   4505     movdqa     xmm4, xmmword ptr kARGBToYJ
   4506     movdqa     xmm5, xmmword ptr kAddYJ64
   4507 
   4508  convertloop:
   4509     movdqu     xmm0, [eax]  // G
   4510     movdqu     xmm1, [eax + 16]
   4511     pmaddubsw  xmm0, xmm4
   4512     pmaddubsw  xmm1, xmm4
   4513     phaddw     xmm0, xmm1
   4514     paddw      xmm0, xmm5  // Add .5 for rounding.
   4515     psrlw      xmm0, 7
   4516     packuswb   xmm0, xmm0  // 8 G bytes
   4517     movdqu     xmm2, [eax]  // A
   4518     movdqu     xmm3, [eax + 16]
   4519     lea        eax, [eax + 32]
   4520     psrld      xmm2, 24
   4521     psrld      xmm3, 24
   4522     packuswb   xmm2, xmm3
   4523     packuswb   xmm2, xmm2  // 8 A bytes
   4524     movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
   4525     punpcklbw  xmm0, xmm0  // 8 GG words
   4526     punpcklbw  xmm3, xmm2  // 8 GA words
   4527     movdqa     xmm1, xmm0
   4528     punpcklwd  xmm0, xmm3  // GGGA first 4
   4529     punpckhwd  xmm1, xmm3  // GGGA next 4
   4530     movdqu     [edx], xmm0
   4531     movdqu     [edx + 16], xmm1
   4532     lea        edx, [edx + 32]
   4533     sub        ecx, 8
   4534     jg         convertloop
   4535     ret
   4536   }
   4537 }
   4538 #endif  // HAS_ARGBGRAYROW_SSSE3
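
// A minimal scalar C sketch of the gray conversion above (illustrative only;
// the name is hypothetical, and the 15/75/38 weights are assumed to match
// kARGBToYJ - the usual BT.601 full range coefficients scaled to sum to 128).
static void ARGBGrayRow_Sketch_C(const uint8* src_argb,
                                 uint8* dst_argb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = src_argb[x * 4 + 0];
    int g = src_argb[x * 4 + 1];
    int r = src_argb[x * 4 + 2];
    int y = (b * 15 + g * 75 + r * 38 + 64) >> 7;  // rounded weighted sum
    dst_argb[x * 4 + 0] = (uint8)y;  // replicate gray into B, G and R
    dst_argb[x * 4 + 1] = (uint8)y;
    dst_argb[x * 4 + 2] = (uint8)y;
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // keep original alpha
  }
}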
   4539 
   4540 #ifdef HAS_ARGBSEPIAROW_SSSE3
   4541 //    b = (r * 35 + g * 68 + b * 17) >> 7
   4542 //    g = (r * 45 + g * 88 + b * 22) >> 7
   4543 //    r = (r * 50 + g * 98 + b * 24) >> 7
   4544 // Constant for ARGB color to sepia tone.
   4545 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
   4546                                    17, 68, 35, 0, 17, 68, 35, 0};
   4547 
   4548 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
   4549                                    22, 88, 45, 0, 22, 88, 45, 0};
   4550 
   4551 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
   4552                                    24, 98, 50, 0, 24, 98, 50, 0};
   4553 
   4554 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
   4555 __declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   4556   __asm {
   4557     mov        eax, [esp + 4] /* dst_argb */
   4558     mov        ecx, [esp + 8] /* width */
   4559     movdqa     xmm2, xmmword ptr kARGBToSepiaB
   4560     movdqa     xmm3, xmmword ptr kARGBToSepiaG
   4561     movdqa     xmm4, xmmword ptr kARGBToSepiaR
   4562 
   4563  convertloop:
   4564     movdqu     xmm0, [eax]  // B
   4565     movdqu     xmm6, [eax + 16]
   4566     pmaddubsw  xmm0, xmm2
   4567     pmaddubsw  xmm6, xmm2
   4568     phaddw     xmm0, xmm6
   4569     psrlw      xmm0, 7
   4570     packuswb   xmm0, xmm0  // 8 B values
   4571     movdqu     xmm5, [eax]  // G
   4572     movdqu     xmm1, [eax + 16]
   4573     pmaddubsw  xmm5, xmm3
   4574     pmaddubsw  xmm1, xmm3
   4575     phaddw     xmm5, xmm1
   4576     psrlw      xmm5, 7
   4577     packuswb   xmm5, xmm5  // 8 G values
   4578     punpcklbw  xmm0, xmm5  // 8 BG values
   4579     movdqu     xmm5, [eax]  // R
   4580     movdqu     xmm1, [eax + 16]
   4581     pmaddubsw  xmm5, xmm4
   4582     pmaddubsw  xmm1, xmm4
   4583     phaddw     xmm5, xmm1
   4584     psrlw      xmm5, 7
   4585     packuswb   xmm5, xmm5  // 8 R values
   4586     movdqu     xmm6, [eax]  // A
   4587     movdqu     xmm1, [eax + 16]
   4588     psrld      xmm6, 24
   4589     psrld      xmm1, 24
   4590     packuswb   xmm6, xmm1
   4591     packuswb   xmm6, xmm6  // 8 A values
   4592     punpcklbw  xmm5, xmm6  // 8 RA values
   4593     movdqa     xmm1, xmm0  // Weave BG, RA together
   4594     punpcklwd  xmm0, xmm5  // BGRA first 4
   4595     punpckhwd  xmm1, xmm5  // BGRA next 4
   4596     movdqu     [eax], xmm0
   4597     movdqu     [eax + 16], xmm1
   4598     lea        eax, [eax + 32]
   4599     sub        ecx, 8
   4600     jg         convertloop
   4601     ret
   4602   }
   4603 }
   4604 #endif  // HAS_ARGBSEPIAROW_SSSE3
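
// A minimal scalar C sketch of the sepia transform above (illustrative only;
// the name is hypothetical). It is the three formulas from the comment plus
// the saturation that packuswb provides for free in the SIMD version.
static void ARGBSepiaRow_Sketch_C(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[x * 4 + 0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[x * 4 + 1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[x * 4 + 2] = (uint8)(sr > 255 ? 255 : sr);
    // Alpha (dst_argb[x * 4 + 3]) is left unchanged, as in the asm.
  }
}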
   4605 
   4606 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
    4607 // Transform 8 ARGB pixels (32 bytes) with color matrix.
   4608 // Same as Sepia except matrix is provided.
   4609 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
   4610 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
   4611 __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
   4612                                                 uint8* dst_argb,
   4613                                                 const int8* matrix_argb,
   4614                                                 int width) {
   4615   __asm {
   4616     mov        eax, [esp + 4] /* src_argb */
   4617     mov        edx, [esp + 8] /* dst_argb */
   4618     mov        ecx, [esp + 12] /* matrix_argb */
   4619     movdqu     xmm5, [ecx]
   4620     pshufd     xmm2, xmm5, 0x00
   4621     pshufd     xmm3, xmm5, 0x55
   4622     pshufd     xmm4, xmm5, 0xaa
   4623     pshufd     xmm5, xmm5, 0xff
   4624     mov        ecx, [esp + 16] /* width */
   4625 
   4626  convertloop:
   4627     movdqu     xmm0, [eax]  // B
   4628     movdqu     xmm7, [eax + 16]
   4629     pmaddubsw  xmm0, xmm2
   4630     pmaddubsw  xmm7, xmm2
   4631     movdqu     xmm6, [eax]  // G
   4632     movdqu     xmm1, [eax + 16]
   4633     pmaddubsw  xmm6, xmm3
   4634     pmaddubsw  xmm1, xmm3
   4635     phaddsw    xmm0, xmm7  // B
   4636     phaddsw    xmm6, xmm1  // G
   4637     psraw      xmm0, 6  // B
   4638     psraw      xmm6, 6  // G
   4639     packuswb   xmm0, xmm0  // 8 B values
   4640     packuswb   xmm6, xmm6  // 8 G values
   4641     punpcklbw  xmm0, xmm6  // 8 BG values
   4642     movdqu     xmm1, [eax]  // R
   4643     movdqu     xmm7, [eax + 16]
   4644     pmaddubsw  xmm1, xmm4
   4645     pmaddubsw  xmm7, xmm4
   4646     phaddsw    xmm1, xmm7  // R
   4647     movdqu     xmm6, [eax]  // A
   4648     movdqu     xmm7, [eax + 16]
   4649     pmaddubsw  xmm6, xmm5
   4650     pmaddubsw  xmm7, xmm5
   4651     phaddsw    xmm6, xmm7  // A
   4652     psraw      xmm1, 6  // R
   4653     psraw      xmm6, 6  // A
   4654     packuswb   xmm1, xmm1  // 8 R values
   4655     packuswb   xmm6, xmm6  // 8 A values
   4656     punpcklbw  xmm1, xmm6  // 8 RA values
   4657     movdqa     xmm6, xmm0  // Weave BG, RA together
   4658     punpcklwd  xmm0, xmm1  // BGRA first 4
   4659     punpckhwd  xmm6, xmm1  // BGRA next 4
   4660     movdqu     [edx], xmm0
   4661     movdqu     [edx + 16], xmm6
   4662     lea        eax, [eax + 32]
   4663     lea        edx, [edx + 32]
   4664     sub        ecx, 8
   4665     jg         convertloop
   4666     ret
   4667   }
   4668 }
   4669 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
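
// A minimal scalar C sketch of the color matrix transform above (illustrative
// only; the name is hypothetical). Each output channel is a dot product of
// the BGRA pixel with one row of four signed 2.6 fixed point coefficients
// (pmaddubsw + phaddsw + psraw 6), clamped to 0..255 by packuswb.
static void ARGBColorMatrixRow_Sketch_C(const uint8* src_argb,
                                        uint8* dst_argb,
                                        const int8* matrix_argb,
                                        int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint8* p = src_argb + x * 4;
    int i;
    for (i = 0; i < 4; ++i) {  // B, G, R, A outputs
      const int8* m = matrix_argb + i * 4;
      int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 6;
      dst_argb[x * 4 + i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}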
   4670 
   4671 #ifdef HAS_ARGBQUANTIZEROW_SSE2
   4672 // Quantize 4 ARGB pixels (16 bytes).
   4673 __declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
   4674                                             int scale,
   4675                                             int interval_size,
   4676                                             int interval_offset,
   4677                                             int width) {
   4678   __asm {
   4679     mov        eax, [esp + 4] /* dst_argb */
   4680     movd       xmm2, [esp + 8] /* scale */
   4681     movd       xmm3, [esp + 12] /* interval_size */
   4682     movd       xmm4, [esp + 16] /* interval_offset */
   4683     mov        ecx, [esp + 20] /* width */
   4684     pshuflw    xmm2, xmm2, 040h
   4685     pshufd     xmm2, xmm2, 044h
   4686     pshuflw    xmm3, xmm3, 040h
   4687     pshufd     xmm3, xmm3, 044h
   4688     pshuflw    xmm4, xmm4, 040h
   4689     pshufd     xmm4, xmm4, 044h
   4690     pxor       xmm5, xmm5  // constant 0
   4691     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
   4692     pslld      xmm6, 24
   4693 
   4694  convertloop:
   4695     movdqu     xmm0, [eax]  // read 4 pixels
   4696     punpcklbw  xmm0, xmm5  // first 2 pixels
   4697     pmulhuw    xmm0, xmm2  // pixel * scale >> 16
   4698     movdqu     xmm1, [eax]  // read 4 pixels
   4699     punpckhbw  xmm1, xmm5  // next 2 pixels
   4700     pmulhuw    xmm1, xmm2
   4701     pmullw     xmm0, xmm3  // * interval_size
   4702     movdqu     xmm7, [eax]  // read 4 pixels
   4703     pmullw     xmm1, xmm3
   4704     pand       xmm7, xmm6  // mask alpha
   4705     paddw      xmm0, xmm4  // + interval_size / 2
   4706     paddw      xmm1, xmm4
   4707     packuswb   xmm0, xmm1
   4708     por        xmm0, xmm7
   4709     movdqu     [eax], xmm0
   4710     lea        eax, [eax + 16]
   4711     sub        ecx, 4
   4712     jg         convertloop
   4713     ret
   4714   }
   4715 }
   4716 #endif  // HAS_ARGBQUANTIZEROW_SSE2
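
// A minimal scalar C sketch of the quantizer above (illustrative only; the
// name is hypothetical). scale is a 0.16 fixed point factor (typically about
// 65536 / interval_size), so (v * scale >> 16) * interval_size +
// interval_offset snaps each color channel to its interval; alpha is masked
// back unchanged in the asm, so the sketch simply skips it.
static void ARGBQuantizeRow_Sketch_C(uint8* dst_argb,
                                     int scale,
                                     int interval_size,
                                     int interval_offset,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 3; ++i) {  // quantize B, G, R; leave A alone
      int v = dst_argb[x * 4 + i];
      dst_argb[x * 4 + i] =
          (uint8)((v * scale >> 16) * interval_size + interval_offset);
    }
  }
}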
   4717 
   4718 #ifdef HAS_ARGBSHADEROW_SSE2
   4719 // Shade 4 pixels at a time by specified value.
   4720 __declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
   4721                                          uint8* dst_argb,
   4722                                          int width,
   4723                                          uint32 value) {
   4724   __asm {
   4725     mov        eax, [esp + 4]  // src_argb
   4726     mov        edx, [esp + 8]  // dst_argb
   4727     mov        ecx, [esp + 12]  // width
   4728     movd       xmm2, [esp + 16]  // value
   4729     punpcklbw  xmm2, xmm2
   4730     punpcklqdq xmm2, xmm2
   4731 
   4732  convertloop:
   4733     movdqu     xmm0, [eax]  // read 4 pixels
   4734     lea        eax, [eax + 16]
   4735     movdqa     xmm1, xmm0
   4736     punpcklbw  xmm0, xmm0  // first 2
   4737     punpckhbw  xmm1, xmm1  // next 2
   4738     pmulhuw    xmm0, xmm2  // argb * value
   4739     pmulhuw    xmm1, xmm2  // argb * value
   4740     psrlw      xmm0, 8
   4741     psrlw      xmm1, 8
   4742     packuswb   xmm0, xmm1
   4743     movdqu     [edx], xmm0
   4744     lea        edx, [edx + 16]
   4745     sub        ecx, 4
   4746     jg         convertloop
   4747 
   4748     ret
   4749   }
   4750 }
   4751 #endif  // HAS_ARGBSHADEROW_SSE2
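
// A minimal scalar C sketch of the shade above (illustrative only; the name
// is hypothetical). Both the pixel and the packed ARGB value have each byte
// duplicated into a 16 bit lane, so pmulhuw + psrlw 8 yields
// (f * 257) * (s * 257) >> 24, approximately f * s / 255 per channel.
static void ARGBShadeRow_Sketch_C(const uint8* src_argb,
                                  uint8* dst_argb,
                                  int width,
                                  uint32 value) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 4; ++i) {  // all four channels, including alpha
      uint32 f = src_argb[x * 4 + i];
      uint32 s = (value >> (i * 8)) & 0xff;  // matching channel of value
      dst_argb[x * 4 + i] = (uint8)(((f | (f << 8)) * (s | (s << 8))) >> 24);
    }
  }
}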
   4752 
   4753 #ifdef HAS_ARGBMULTIPLYROW_SSE2
   4754 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
   4755 __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
   4756                                             const uint8* src_argb1,
   4757                                             uint8* dst_argb,
   4758                                             int width) {
   4759   __asm {
   4760     push       esi
   4761     mov        eax, [esp + 4 + 4]  // src_argb0
   4762     mov        esi, [esp + 4 + 8]  // src_argb1
   4763     mov        edx, [esp + 4 + 12]  // dst_argb
   4764     mov        ecx, [esp + 4 + 16]  // width
   4765     pxor       xmm5, xmm5  // constant 0
   4766 
   4767  convertloop:
   4768     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
   4769     movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
    4770     movdqa     xmm1, xmm0
    4771     movdqa     xmm3, xmm2
   4772     punpcklbw  xmm0, xmm0  // first 2
   4773     punpckhbw  xmm1, xmm1  // next 2
   4774     punpcklbw  xmm2, xmm5  // first 2
   4775     punpckhbw  xmm3, xmm5  // next 2
   4776     pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
   4777     pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
   4778     lea        eax, [eax + 16]
   4779     lea        esi, [esi + 16]
   4780     packuswb   xmm0, xmm1
   4781     movdqu     [edx], xmm0
   4782     lea        edx, [edx + 16]
   4783     sub        ecx, 4
   4784     jg         convertloop
   4785 
   4786     pop        esi
   4787     ret
   4788   }
   4789 }
   4790 #endif  // HAS_ARGBMULTIPLYROW_SSE2
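
// A minimal scalar C sketch of the multiply above (illustrative only; the
// name is hypothetical). src_argb0 is byte-duplicated (f * 257) while
// src_argb1 is zero extended, so pmulhuw returns (f * 257 * v) >> 16,
// approximately f * v / 255; the result never exceeds 255, so no clamp is
// needed.
static void ARGBMultiplyRow_Sketch_C(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb,
                                     int width) {
  int x;
  for (x = 0; x < width * 4; ++x) {  // every byte, alpha included
    uint32 f = src_argb0[x];
    uint32 v = src_argb1[x];
    dst_argb[x] = (uint8)(((f | (f << 8)) * v) >> 16);
  }
}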
   4791 
   4792 #ifdef HAS_ARGBADDROW_SSE2
   4793 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
   4794 // TODO(fbarchard): Port this to posix, neon and other math functions.
   4795 __declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
   4796                                        const uint8* src_argb1,
   4797                                        uint8* dst_argb,
   4798                                        int width) {
   4799   __asm {
   4800     push       esi
   4801     mov        eax, [esp + 4 + 4]  // src_argb0
   4802     mov        esi, [esp + 4 + 8]  // src_argb1
   4803     mov        edx, [esp + 4 + 12]  // dst_argb
   4804     mov        ecx, [esp + 4 + 16]  // width
   4805 
   4806     sub        ecx, 4
   4807     jl         convertloop49
   4808 
   4809  convertloop4:
   4810     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
   4811     lea        eax, [eax + 16]
   4812     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
   4813     lea        esi, [esi + 16]
   4814     paddusb    xmm0, xmm1  // src_argb0 + src_argb1
   4815     movdqu     [edx], xmm0
   4816     lea        edx, [edx + 16]
   4817     sub        ecx, 4
   4818     jge        convertloop4
   4819 
   4820  convertloop49:
   4821     add        ecx, 4 - 1
   4822     jl         convertloop19
   4823 
   4824  convertloop1:
    4825     movd       xmm0, [eax]  // read 1 pixel from src_argb0
    4826     lea        eax, [eax + 4]
    4827     movd       xmm1, [esi]  // read 1 pixel from src_argb1
   4828     lea        esi, [esi + 4]
   4829     paddusb    xmm0, xmm1  // src_argb0 + src_argb1
   4830     movd       [edx], xmm0
   4831     lea        edx, [edx + 4]
   4832     sub        ecx, 1
   4833     jge        convertloop1
   4834 
   4835  convertloop19:
   4836     pop        esi
   4837     ret
   4838   }
   4839 }
   4840 #endif  // HAS_ARGBADDROW_SSE2
   4841 
   4842 #ifdef HAS_ARGBSUBTRACTROW_SSE2
   4843 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
   4844 __declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
   4845                                             const uint8* src_argb1,
   4846                                             uint8* dst_argb,
   4847                                             int width) {
   4848   __asm {
   4849     push       esi
   4850     mov        eax, [esp + 4 + 4]  // src_argb0
   4851     mov        esi, [esp + 4 + 8]  // src_argb1
   4852     mov        edx, [esp + 4 + 12]  // dst_argb
   4853     mov        ecx, [esp + 4 + 16]  // width
   4854 
   4855  convertloop:
   4856     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
   4857     lea        eax, [eax + 16]
   4858     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
   4859     lea        esi, [esi + 16]
   4860     psubusb    xmm0, xmm1  // src_argb0 - src_argb1
   4861     movdqu     [edx], xmm0
   4862     lea        edx, [edx + 16]
   4863     sub        ecx, 4
   4864     jg         convertloop
   4865 
   4866     pop        esi
   4867     ret
   4868   }
   4869 }
   4870 #endif  // HAS_ARGBSUBTRACTROW_SSE2
   4871 
   4872 #ifdef HAS_ARGBMULTIPLYROW_AVX2
   4873 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
   4874 __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
   4875                                             const uint8* src_argb1,
   4876                                             uint8* dst_argb,
   4877                                             int width) {
   4878   __asm {
   4879     push       esi
   4880     mov        eax, [esp + 4 + 4]  // src_argb0
   4881     mov        esi, [esp + 4 + 8]  // src_argb1
   4882     mov        edx, [esp + 4 + 12]  // dst_argb
   4883     mov        ecx, [esp + 4 + 16]  // width
   4884     vpxor      ymm5, ymm5, ymm5  // constant 0
   4885 
   4886  convertloop:
   4887     vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
   4888     lea        eax, [eax + 32]
   4889     vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
   4890     lea        esi, [esi + 32]
   4891     vpunpcklbw ymm0, ymm1, ymm1  // low 4
   4892     vpunpckhbw ymm1, ymm1, ymm1  // high 4
   4893     vpunpcklbw ymm2, ymm3, ymm5  // low 4
   4894     vpunpckhbw ymm3, ymm3, ymm5  // high 4
   4895     vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
   4896     vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
   4897     vpackuswb  ymm0, ymm0, ymm1
   4898     vmovdqu    [edx], ymm0
   4899     lea        edx, [edx + 32]
   4900     sub        ecx, 8
   4901     jg         convertloop
   4902 
   4903     pop        esi
   4904     vzeroupper
   4905     ret
   4906   }
   4907 }
   4908 #endif  // HAS_ARGBMULTIPLYROW_AVX2
   4909 
   4910 #ifdef HAS_ARGBADDROW_AVX2
   4911 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
   4912 __declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
   4913                                        const uint8* src_argb1,
   4914                                        uint8* dst_argb,
   4915                                        int width) {
   4916   __asm {
   4917     push       esi
   4918     mov        eax, [esp + 4 + 4]  // src_argb0
   4919     mov        esi, [esp + 4 + 8]  // src_argb1
   4920     mov        edx, [esp + 4 + 12]  // dst_argb
   4921     mov        ecx, [esp + 4 + 16]  // width
   4922 
   4923  convertloop:
   4924     vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
   4925     lea        eax, [eax + 32]
   4926     vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
   4927     lea        esi, [esi + 32]
   4928     vmovdqu    [edx], ymm0
   4929     lea        edx, [edx + 32]
   4930     sub        ecx, 8
   4931     jg         convertloop
   4932 
   4933     pop        esi
   4934     vzeroupper
   4935     ret
   4936   }
   4937 }
   4938 #endif  // HAS_ARGBADDROW_AVX2
   4939 
   4940 #ifdef HAS_ARGBSUBTRACTROW_AVX2
   4941 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
   4942 __declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
   4943                                             const uint8* src_argb1,
   4944                                             uint8* dst_argb,
   4945                                             int width) {
   4946   __asm {
   4947     push       esi
   4948     mov        eax, [esp + 4 + 4]  // src_argb0
   4949     mov        esi, [esp + 4 + 8]  // src_argb1
   4950     mov        edx, [esp + 4 + 12]  // dst_argb
   4951     mov        ecx, [esp + 4 + 16]  // width
   4952 
   4953  convertloop:
   4954     vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
   4955     lea        eax, [eax + 32]
   4956     vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
   4957     lea        esi, [esi + 32]
   4958     vmovdqu    [edx], ymm0
   4959     lea        edx, [edx + 32]
   4960     sub        ecx, 8
   4961     jg         convertloop
   4962 
   4963     pop        esi
   4964     vzeroupper
   4965     ret
   4966   }
   4967 }
   4968 #endif  // HAS_ARGBSUBTRACTROW_AVX2
   4969 
   4970 #ifdef HAS_SOBELXROW_SSE2
   4971 // SobelX as a matrix is
   4972 // -1  0  1
   4973 // -2  0  2
   4974 // -1  0  1
   4975 __declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
   4976                                       const uint8* src_y1,
   4977                                       const uint8* src_y2,
   4978                                       uint8* dst_sobelx,
   4979                                       int width) {
   4980   __asm {
   4981     push       esi
   4982     push       edi
   4983     mov        eax, [esp + 8 + 4]  // src_y0
   4984     mov        esi, [esp + 8 + 8]  // src_y1
   4985     mov        edi, [esp + 8 + 12]  // src_y2
   4986     mov        edx, [esp + 8 + 16]  // dst_sobelx
   4987     mov        ecx, [esp + 8 + 20]  // width
   4988     sub        esi, eax
   4989     sub        edi, eax
   4990     sub        edx, eax
   4991     pxor       xmm5, xmm5  // constant 0
   4992 
   4993  convertloop:
   4994     movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
   4995     movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
   4996     punpcklbw  xmm0, xmm5
   4997     punpcklbw  xmm1, xmm5
   4998     psubw      xmm0, xmm1
   4999     movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
   5000     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
   5001     punpcklbw  xmm1, xmm5
   5002     punpcklbw  xmm2, xmm5
   5003     psubw      xmm1, xmm2
   5004     movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
   5005     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
   5006     punpcklbw  xmm2, xmm5
   5007     punpcklbw  xmm3, xmm5
   5008     psubw      xmm2, xmm3
   5009     paddw      xmm0, xmm2
   5010     paddw      xmm0, xmm1
   5011     paddw      xmm0, xmm1
   5012     pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
   5013     psubw      xmm1, xmm0
   5014     pmaxsw     xmm0, xmm1
   5015     packuswb   xmm0, xmm0
   5016     movq       qword ptr [eax + edx], xmm0
   5017     lea        eax, [eax + 8]
   5018     sub        ecx, 8
   5019     jg         convertloop
   5020 
   5021     pop        edi
   5022     pop        esi
   5023     ret
   5024   }
   5025 }
   5026 #endif  // HAS_SOBELXROW_SSE2
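
// A minimal scalar C sketch of the Sobel X pass above (illustrative only; the
// name is hypothetical). Like the asm, it reads two columns past width, so
// the caller must provide that margin; the absolute value makes the sign
// convention of the kernel irrelevant.
static void SobelXRow_Sketch_C(const uint8* src_y0,
                               const uint8* src_y1,
                               const uint8* src_y2,
                               uint8* dst_sobelx,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b * 2 + c;      // rows weighted 1, 2, 1
    if (sobel < 0) sobel = -sobel;  // abs via max(x, -x) in the asm
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}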
   5027 
   5028 #ifdef HAS_SOBELYROW_SSE2
   5029 // SobelY as a matrix is
   5030 // -1 -2 -1
   5031 //  0  0  0
   5032 //  1  2  1
   5033 __declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
   5034                                       const uint8* src_y1,
   5035                                       uint8* dst_sobely,
   5036                                       int width) {
   5037   __asm {
   5038     push       esi
   5039     mov        eax, [esp + 4 + 4]  // src_y0
   5040     mov        esi, [esp + 4 + 8]  // src_y1
   5041     mov        edx, [esp + 4 + 12]  // dst_sobely
   5042     mov        ecx, [esp + 4 + 16]  // width
   5043     sub        esi, eax
   5044     sub        edx, eax
   5045     pxor       xmm5, xmm5  // constant 0
   5046 
   5047  convertloop:
   5048     movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
   5049     movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
   5050     punpcklbw  xmm0, xmm5
   5051     punpcklbw  xmm1, xmm5
   5052     psubw      xmm0, xmm1
   5053     movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
   5054     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
   5055     punpcklbw  xmm1, xmm5
   5056     punpcklbw  xmm2, xmm5
   5057     psubw      xmm1, xmm2
   5058     movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
   5059     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
   5060     punpcklbw  xmm2, xmm5
   5061     punpcklbw  xmm3, xmm5
   5062     psubw      xmm2, xmm3
   5063     paddw      xmm0, xmm2
   5064     paddw      xmm0, xmm1
   5065     paddw      xmm0, xmm1
   5066     pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
   5067     psubw      xmm1, xmm0
   5068     pmaxsw     xmm0, xmm1
   5069     packuswb   xmm0, xmm0
   5070     movq       qword ptr [eax + edx], xmm0
   5071     lea        eax, [eax + 8]
   5072     sub        ecx, 8
   5073     jg         convertloop
   5074 
   5075     pop        esi
   5076     ret
   5077   }
   5078 }
   5079 #endif  // HAS_SOBELYROW_SSE2
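
// A minimal scalar C sketch of the Sobel Y pass above (illustrative only; the
// name is hypothetical). It differences the two rows at columns i, i+1, i+2
// with weights 1, 2, 1 and stores the saturated absolute value.
static void SobelYRow_Sketch_C(const uint8* src_y0,
                               const uint8* src_y1,
                               uint8* dst_sobely,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i + 0] - src_y1[i + 0];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}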
   5080 
   5081 #ifdef HAS_SOBELROW_SSE2
   5082 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
   5083 // A = 255
   5084 // R = Sobel
   5085 // G = Sobel
   5086 // B = Sobel
   5087 __declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
   5088                                      const uint8* src_sobely,
   5089                                      uint8* dst_argb,
   5090                                      int width) {
   5091   __asm {
   5092     push       esi
   5093     mov        eax, [esp + 4 + 4]  // src_sobelx
   5094     mov        esi, [esp + 4 + 8]  // src_sobely
   5095     mov        edx, [esp + 4 + 12]  // dst_argb
   5096     mov        ecx, [esp + 4 + 16]  // width
   5097     sub        esi, eax
   5098     pcmpeqb    xmm5, xmm5  // alpha 255
   5099     pslld      xmm5, 24  // 0xff000000
   5100 
   5101  convertloop:
   5102     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
   5103     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
   5104     lea        eax, [eax + 16]
   5105     paddusb    xmm0, xmm1  // sobel = sobelx + sobely
   5106     movdqa     xmm2, xmm0  // GG
   5107     punpcklbw  xmm2, xmm0  // First 8
   5108     punpckhbw  xmm0, xmm0  // Next 8
   5109     movdqa     xmm1, xmm2  // GGGG
   5110     punpcklwd  xmm1, xmm2  // First 4
   5111     punpckhwd  xmm2, xmm2  // Next 4
   5112     por        xmm1, xmm5  // GGGA
   5113     por        xmm2, xmm5
   5114     movdqa     xmm3, xmm0  // GGGG
   5115     punpcklwd  xmm3, xmm0  // Next 4
   5116     punpckhwd  xmm0, xmm0  // Last 4
   5117     por        xmm3, xmm5  // GGGA
   5118     por        xmm0, xmm5
   5119     movdqu     [edx], xmm1
   5120     movdqu     [edx + 16], xmm2
   5121     movdqu     [edx + 32], xmm3
   5122     movdqu     [edx + 48], xmm0
   5123     lea        edx, [edx + 64]
   5124     sub        ecx, 16
   5125     jg         convertloop
   5126 
   5127     pop        esi
   5128     ret
   5129   }
   5130 }
   5131 #endif  // HAS_SOBELROW_SSE2
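
// A minimal scalar C sketch of the packing above (illustrative only; the name
// is hypothetical): the saturated sum of the two gradients is replicated into
// B, G and R, with alpha forced to 255.
static void SobelRow_Sketch_C(const uint8* src_sobelx,
                              const uint8* src_sobely,
                              uint8* dst_argb,
                              int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;            // paddusb saturates the same way
    dst_argb[i * 4 + 0] = (uint8)s;  // B
    dst_argb[i * 4 + 1] = (uint8)s;  // G
    dst_argb[i * 4 + 2] = (uint8)s;  // R
    dst_argb[i * 4 + 3] = 255;       // A
  }
}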
   5132 
   5133 #ifdef HAS_SOBELTOPLANEROW_SSE2
   5134 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
   5135 __declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
   5136                                             const uint8* src_sobely,
   5137                                             uint8* dst_y,
   5138                                             int width) {
   5139   __asm {
   5140     push       esi
   5141     mov        eax, [esp + 4 + 4]  // src_sobelx
   5142     mov        esi, [esp + 4 + 8]  // src_sobely
    5143     mov        edx, [esp + 4 + 12]  // dst_y
   5144     mov        ecx, [esp + 4 + 16]  // width
   5145     sub        esi, eax
   5146 
   5147  convertloop:
   5148     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
   5149     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
   5150     lea        eax, [eax + 16]
   5151     paddusb    xmm0, xmm1  // sobel = sobelx + sobely
   5152     movdqu     [edx], xmm0
   5153     lea        edx, [edx + 16]
   5154     sub        ecx, 16
   5155     jg         convertloop
   5156 
   5157     pop        esi
   5158     ret
   5159   }
   5160 }
   5161 #endif  // HAS_SOBELTOPLANEROW_SSE2
   5162 
   5163 #ifdef HAS_SOBELXYROW_SSE2
   5164 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
   5165 // A = 255
   5166 // R = Sobel X
   5167 // G = Sobel
   5168 // B = Sobel Y
   5169 __declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
   5170                                        const uint8* src_sobely,
   5171                                        uint8* dst_argb,
   5172                                        int width) {
   5173   __asm {
   5174     push       esi
   5175     mov        eax, [esp + 4 + 4]  // src_sobelx
   5176     mov        esi, [esp + 4 + 8]  // src_sobely
   5177     mov        edx, [esp + 4 + 12]  // dst_argb
   5178     mov        ecx, [esp + 4 + 16]  // width
   5179     sub        esi, eax
   5180     pcmpeqb    xmm5, xmm5  // alpha 255
   5181 
   5182  convertloop:
   5183     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
   5184     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
   5185     lea        eax, [eax + 16]
   5186     movdqa     xmm2, xmm0
   5187     paddusb    xmm2, xmm1  // sobel = sobelx + sobely
   5188     movdqa     xmm3, xmm0  // XA
   5189     punpcklbw  xmm3, xmm5
   5190     punpckhbw  xmm0, xmm5
   5191     movdqa     xmm4, xmm1  // YS
   5192     punpcklbw  xmm4, xmm2
   5193     punpckhbw  xmm1, xmm2
   5194     movdqa     xmm6, xmm4  // YSXA
   5195     punpcklwd  xmm6, xmm3  // First 4
   5196     punpckhwd  xmm4, xmm3  // Next 4
   5197     movdqa     xmm7, xmm1  // YSXA
   5198     punpcklwd  xmm7, xmm0  // Next 4
   5199     punpckhwd  xmm1, xmm0  // Last 4
   5200     movdqu     [edx], xmm6
   5201     movdqu     [edx + 16], xmm4
   5202     movdqu     [edx + 32], xmm7
   5203     movdqu     [edx + 48], xmm1
   5204     lea        edx, [edx + 64]
   5205     sub        ecx, 16
   5206     jg         convertloop
   5207 
   5208     pop        esi
   5209     ret
   5210   }
   5211 }
   5212 #endif  // HAS_SOBELXYROW_SSE2
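
// A minimal scalar C sketch of the channel mapping above (illustrative only;
// the name is hypothetical).
static void SobelXYRow_Sketch_C(const uint8* src_sobelx,
                                const uint8* src_sobely,
                                uint8* dst_argb,
                                int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;                 // saturated combined Sobel
    dst_argb[i * 4 + 0] = src_sobely[i];  // B = Sobel Y
    dst_argb[i * 4 + 1] = (uint8)s;       // G = Sobel
    dst_argb[i * 4 + 2] = src_sobelx[i];  // R = Sobel X
    dst_argb[i * 4 + 3] = 255;            // A = opaque
  }
}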
   5213 
   5214 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
   5215 // Consider float CumulativeSum.
   5216 // Consider calling CumulativeSum one row at time as needed.
   5217 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
   5218 // Convert cumulative sum for an area to an average for 1 pixel.
   5219 // topleft is pointer to top left of CumulativeSum buffer for area.
   5220 // botleft is pointer to bottom left of CumulativeSum buffer.
   5221 // width is offset from left to right of area in CumulativeSum buffer measured
   5222 //   in number of ints.
   5223 // area is the number of pixels in the area being averaged.
   5224 // dst points to pixel to store result to.
   5225 // count is number of averaged pixels to produce.
   5226 // Does 4 pixels at a time.
   5227 // This function requires alignment on accumulation buffer pointers.
   5228 void CumulativeSumToAverageRow_SSE2(const int32* topleft,
   5229                                     const int32* botleft,
   5230                                     int width,
   5231                                     int area,
   5232                                     uint8* dst,
   5233                                     int count) {
   5234   __asm {
   5235     mov        eax, topleft  // eax topleft
   5236     mov        esi, botleft  // esi botleft
   5237     mov        edx, width
   5238     movd       xmm5, area
   5239     mov        edi, dst
   5240     mov        ecx, count
   5241     cvtdq2ps   xmm5, xmm5
   5242     rcpss      xmm4, xmm5  // 1.0f / area
   5243     pshufd     xmm4, xmm4, 0
   5244     sub        ecx, 4
   5245     jl         l4b
   5246 
   5247     cmp        area, 128  // 128 pixels will not overflow 15 bits.
   5248     ja         l4
   5249 
   5250     pshufd     xmm5, xmm5, 0  // area
   5251     pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
   5252     psrld      xmm6, 16
   5253     cvtdq2ps   xmm6, xmm6
   5254     addps      xmm5, xmm6  // (65536.0 + area - 1)
   5255     mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
   5256     cvtps2dq   xmm5, xmm5  // 0.16 fixed point
   5257     packssdw   xmm5, xmm5  // 16 bit shorts
   5258 
   5259     // 4 pixel loop small blocks.
   5260   s4:
   5261         // top left
   5262     movdqu     xmm0, [eax]
   5263     movdqu     xmm1, [eax + 16]
   5264     movdqu     xmm2, [eax + 32]
   5265     movdqu     xmm3, [eax + 48]
   5266 
   5267     // - top right
   5268     psubd      xmm0, [eax + edx * 4]
   5269     psubd      xmm1, [eax + edx * 4 + 16]
   5270     psubd      xmm2, [eax + edx * 4 + 32]
   5271     psubd      xmm3, [eax + edx * 4 + 48]
   5272     lea        eax, [eax + 64]
   5273 
   5274     // - bottom left
   5275     psubd      xmm0, [esi]
   5276     psubd      xmm1, [esi + 16]
   5277     psubd      xmm2, [esi + 32]
   5278     psubd      xmm3, [esi + 48]
   5279 
   5280     // + bottom right
   5281     paddd      xmm0, [esi + edx * 4]
   5282     paddd      xmm1, [esi + edx * 4 + 16]
   5283     paddd      xmm2, [esi + edx * 4 + 32]
   5284     paddd      xmm3, [esi + edx * 4 + 48]
   5285     lea        esi, [esi + 64]
   5286 
   5287     packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
   5288     packssdw   xmm2, xmm3
   5289 
   5290     pmulhuw    xmm0, xmm5
   5291     pmulhuw    xmm2, xmm5
   5292 
   5293     packuswb   xmm0, xmm2
   5294     movdqu     [edi], xmm0
   5295     lea        edi, [edi + 16]
   5296     sub        ecx, 4
   5297     jge        s4
   5298 
   5299     jmp        l4b
   5300 
   5301     // 4 pixel loop
   5302   l4:
   5303         // top left
   5304     movdqu     xmm0, [eax]
   5305     movdqu     xmm1, [eax + 16]
   5306     movdqu     xmm2, [eax + 32]
   5307     movdqu     xmm3, [eax + 48]
   5308 
   5309     // - top right
   5310     psubd      xmm0, [eax + edx * 4]
   5311     psubd      xmm1, [eax + edx * 4 + 16]
   5312     psubd      xmm2, [eax + edx * 4 + 32]
   5313     psubd      xmm3, [eax + edx * 4 + 48]
   5314     lea        eax, [eax + 64]
   5315 
   5316     // - bottom left
   5317     psubd      xmm0, [esi]
   5318     psubd      xmm1, [esi + 16]
   5319     psubd      xmm2, [esi + 32]
   5320     psubd      xmm3, [esi + 48]
   5321 
   5322     // + bottom right
   5323     paddd      xmm0, [esi + edx * 4]
   5324     paddd      xmm1, [esi + edx * 4 + 16]
   5325     paddd      xmm2, [esi + edx * 4 + 32]
   5326     paddd      xmm3, [esi + edx * 4 + 48]
   5327     lea        esi, [esi + 64]
   5328 
   5329     cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
   5330     cvtdq2ps   xmm1, xmm1
   5331     mulps      xmm0, xmm4
   5332     mulps      xmm1, xmm4
   5333     cvtdq2ps   xmm2, xmm2
   5334     cvtdq2ps   xmm3, xmm3
   5335     mulps      xmm2, xmm4
   5336     mulps      xmm3, xmm4
   5337     cvtps2dq   xmm0, xmm0
   5338     cvtps2dq   xmm1, xmm1
   5339     cvtps2dq   xmm2, xmm2
   5340     cvtps2dq   xmm3, xmm3
   5341     packssdw   xmm0, xmm1
   5342     packssdw   xmm2, xmm3
   5343     packuswb   xmm0, xmm2
   5344     movdqu     [edi], xmm0
   5345     lea        edi, [edi + 16]
   5346     sub        ecx, 4
   5347     jge        l4
   5348 
   5349   l4b:
   5350     add        ecx, 4 - 1
   5351     jl         l1b
   5352 
   5353     // 1 pixel loop
   5354   l1:
   5355     movdqu     xmm0, [eax]
   5356     psubd      xmm0, [eax + edx * 4]
   5357     lea        eax, [eax + 16]
   5358     psubd      xmm0, [esi]
   5359     paddd      xmm0, [esi + edx * 4]
   5360     lea        esi, [esi + 16]
   5361     cvtdq2ps   xmm0, xmm0
   5362     mulps      xmm0, xmm4
   5363     cvtps2dq   xmm0, xmm0
   5364     packssdw   xmm0, xmm0
   5365     packuswb   xmm0, xmm0
   5366     movd       dword ptr [edi], xmm0
   5367     lea        edi, [edi + 4]
   5368     sub        ecx, 1
   5369     jge        l1
   5370   l1b:
   5371   }
   5372 }
   5373 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
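
// A minimal scalar C sketch of the float path above (illustrative only; the
// name is hypothetical). Each average uses the classic integral image
// identity sum = topleft - topright - botleft + botright, scaled by
// 1 / area. The asm additionally has a 16 bit fixed point fast path for
// areas of at most 128 pixels and uses rcpss, an approximate reciprocal.
static void CumulativeSumToAverageRow_Sketch_C(const int32* topleft,
                                               const int32* botleft,
                                               int width,
                                               int area,
                                               uint8* dst,
                                               int count) {
  float ooa = 1.0f / (float)area;  // one over area
  int i;
  for (i = 0; i < count; ++i) {
    int c;
    for (c = 0; c < 4; ++c) {  // 4 channels per ARGB pixel
      int32 sum = topleft[i * 4 + c] - topleft[i * 4 + c + width] -
                  botleft[i * 4 + c] + botleft[i * 4 + c + width];
      dst[i * 4 + c] = (uint8)((float)sum * ooa + 0.5f);  // rounded
    }
  }
}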
   5374 
   5375 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
   5376 // Creates a table of cumulative sums where each value is a sum of all values
   5377 // above and to the left of the value.
   5378 void ComputeCumulativeSumRow_SSE2(const uint8* row,
   5379                                   int32* cumsum,
   5380                                   const int32* previous_cumsum,
   5381                                   int width) {
   5382   __asm {
   5383     mov        eax, row
   5384     mov        edx, cumsum
   5385     mov        esi, previous_cumsum
   5386     mov        ecx, width
   5387     pxor       xmm0, xmm0
   5388     pxor       xmm1, xmm1
   5389 
   5390     sub        ecx, 4
   5391     jl         l4b
   5392     test       edx, 15
   5393     jne        l4b
   5394 
   5395     // 4 pixel loop
   5396   l4:
   5397     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
   5398     lea        eax, [eax + 16]
   5399     movdqa     xmm4, xmm2
   5400 
   5401     punpcklbw  xmm2, xmm1
   5402     movdqa     xmm3, xmm2
   5403     punpcklwd  xmm2, xmm1
   5404     punpckhwd  xmm3, xmm1
   5405 
   5406     punpckhbw  xmm4, xmm1
   5407     movdqa     xmm5, xmm4
   5408     punpcklwd  xmm4, xmm1
   5409     punpckhwd  xmm5, xmm1
   5410 
   5411     paddd      xmm0, xmm2
   5412     movdqu     xmm2, [esi]  // previous row above.
   5413     paddd      xmm2, xmm0
   5414 
   5415     paddd      xmm0, xmm3
   5416     movdqu     xmm3, [esi + 16]
   5417     paddd      xmm3, xmm0
   5418 
   5419     paddd      xmm0, xmm4
   5420     movdqu     xmm4, [esi + 32]
   5421     paddd      xmm4, xmm0
   5422 
   5423     paddd      xmm0, xmm5
   5424     movdqu     xmm5, [esi + 48]
   5425     lea        esi, [esi + 64]
   5426     paddd      xmm5, xmm0
   5427 
   5428     movdqu     [edx], xmm2
   5429     movdqu     [edx + 16], xmm3
   5430     movdqu     [edx + 32], xmm4
   5431     movdqu     [edx + 48], xmm5
   5432 
   5433     lea        edx, [edx + 64]
   5434     sub        ecx, 4
   5435     jge        l4
   5436 
   5437   l4b:
   5438     add        ecx, 4 - 1
   5439     jl         l1b
   5440 
   5441     // 1 pixel loop
   5442   l1:
   5443     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
   5444     lea        eax, [eax + 4]
   5445     punpcklbw  xmm2, xmm1
   5446     punpcklwd  xmm2, xmm1
   5447     paddd      xmm0, xmm2
   5448     movdqu     xmm2, [esi]
   5449     lea        esi, [esi + 16]
   5450     paddd      xmm2, xmm0
   5451     movdqu     [edx], xmm2
   5452     lea        edx, [edx + 16]
   5453     sub        ecx, 1
   5454     jge        l1
   5455 
   5456  l1b:
   5457   }
   5458 }
   5459 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
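
// A minimal scalar C sketch of the row pass above (illustrative only; the
// name is hypothetical). Each output is the running sum of this row's
// channels plus the already-integrated row above, which yields the
// sum-above-and-to-the-left property the comment describes.
static void ComputeCumulativeSumRow_Sketch_C(const uint8* row,
                                             int32* cumsum,
                                             const int32* previous_cumsum,
                                             int width) {
  int32 row_sum[4] = {0, 0, 0, 0};  // running B, G, R, A sums
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}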
   5460 
   5461 #ifdef HAS_ARGBAFFINEROW_SSE2
   5462 // Copy ARGB pixels from source image with slope to a row of destination.
   5463 __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
   5464                                                      int src_argb_stride,
   5465                                                      uint8* dst_argb,
   5466                                                      const float* uv_dudv,
   5467                                                      int width) {
   5468   __asm {
   5469     push       esi
   5470     push       edi
   5471     mov        eax, [esp + 12]  // src_argb
   5472     mov        esi, [esp + 16]  // stride
   5473     mov        edx, [esp + 20]  // dst_argb
   5474     mov        ecx, [esp + 24]  // pointer to uv_dudv
   5475     movq       xmm2, qword ptr [ecx]  // uv
   5476     movq       xmm7, qword ptr [ecx + 8]  // dudv
   5477     mov        ecx, [esp + 28]  // width
    5478     shl        esi, 16  // (stride << 16) | 4, for pmaddwd offsets
   5479     add        esi, 4
   5480     movd       xmm5, esi
   5481     sub        ecx, 4
   5482     jl         l4b
   5483 
   5484     // setup for 4 pixel loop
   5485     pshufd     xmm7, xmm7, 0x44  // dup dudv
   5486     pshufd     xmm5, xmm5, 0  // dup 4, stride
   5487     movdqa     xmm0, xmm2  // x0, y0, x1, y1
   5488     addps      xmm0, xmm7
   5489     movlhps    xmm2, xmm0
   5490     movdqa     xmm4, xmm7
   5491     addps      xmm4, xmm4  // dudv *= 2
   5492     movdqa     xmm3, xmm2  // x2, y2, x3, y3
   5493     addps      xmm3, xmm4
   5494     addps      xmm4, xmm4  // dudv *= 4
   5495 
   5496     // 4 pixel loop
   5497   l4:
   5498     cvttps2dq  xmm0, xmm2  // x, y float to int first 2
   5499     cvttps2dq  xmm1, xmm3  // x, y float to int next 2
   5500     packssdw   xmm0, xmm1  // x, y as 8 shorts
   5501     pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
   5502     movd       esi, xmm0
   5503     pshufd     xmm0, xmm0, 0x39  // shift right
   5504     movd       edi, xmm0
   5505     pshufd     xmm0, xmm0, 0x39  // shift right
   5506     movd       xmm1, [eax + esi]  // read pixel 0
   5507     movd       xmm6, [eax + edi]  // read pixel 1
   5508     punpckldq  xmm1, xmm6  // combine pixel 0 and 1
   5509     addps      xmm2, xmm4  // x, y += dx, dy first 2
   5510     movq       qword ptr [edx], xmm1
   5511     movd       esi, xmm0
   5512     pshufd     xmm0, xmm0, 0x39  // shift right
   5513     movd       edi, xmm0
   5514     movd       xmm6, [eax + esi]  // read pixel 2
   5515     movd       xmm0, [eax + edi]  // read pixel 3
   5516     punpckldq  xmm6, xmm0  // combine pixel 2 and 3
   5517     addps      xmm3, xmm4  // x, y += dx, dy next 2
    5518     movq       qword ptr [edx + 8], xmm6
   5519     lea        edx, [edx + 16]
   5520     sub        ecx, 4
   5521     jge        l4
   5522 
   5523   l4b:
   5524     add        ecx, 4 - 1
   5525     jl         l1b
   5526 
   5527     // 1 pixel loop
   5528   l1:
   5529     cvttps2dq  xmm0, xmm2  // x, y float to int
   5530     packssdw   xmm0, xmm0  // x, y as shorts
   5531     pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
   5532     addps      xmm2, xmm7  // x, y += dx, dy
   5533     movd       esi, xmm0
   5534     movd       xmm0, [eax + esi]  // copy a pixel
   5535     movd       [edx], xmm0
   5536     lea        edx, [edx + 4]
   5537     sub        ecx, 1
   5538     jge        l1
   5539   l1b:
   5540     pop        edi
   5541     pop        esi
   5542     ret
   5543   }
   5544 }
   5545 #endif  // HAS_ARGBAFFINEROW_SSE2
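
// A minimal scalar sketch (not libyuv's own C fallback) of the affine copy
// above: (u, v) starts at uv_dudv[0..1] and advances by uv_dudv[2..3] per
// destination pixel; each pixel is fetched at the truncated integer
// coordinates, matching the cvttps2dq + pmaddwd computation of
// offset = x * 4 + y * stride. The function name is illustrative only.
static void ARGBAffineRow_Sketch(const uint8* src_argb,
                                 int src_argb_stride,
                                 uint8* dst_argb,
                                 const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // truncate toward zero, like cvttps2dq
    int y = (int)v;
    const uint8* src = src_argb + x * 4 + y * src_argb_stride;
    dst_argb[0] = src[0];
    dst_argb[1] = src[1];
    dst_argb[2] = src[2];
    dst_argb[3] = src[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}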
   5546 
   5547 #ifdef HAS_INTERPOLATEROW_AVX2
   5548 // Bilinear filter 32x2 -> 32x1
   5549 __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
   5550                                            const uint8* src_ptr,
   5551                                            ptrdiff_t src_stride,
   5552                                            int dst_width,
   5553                                            int source_y_fraction) {
   5554   __asm {
   5555     push       esi
   5556     push       edi
   5557     mov        edi, [esp + 8 + 4]  // dst_ptr
   5558     mov        esi, [esp + 8 + 8]  // src_ptr
   5559     mov        edx, [esp + 8 + 12]  // src_stride
   5560     mov        ecx, [esp + 8 + 16]  // dst_width
   5561     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   5562     // Dispatch to specialized filters if applicable.
   5563     cmp        eax, 0
   5564     je         xloop100  // 0 / 256.  Blend 100 / 0.
   5565     sub        edi, esi
   5566     cmp        eax, 128
    5567     je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
   5568 
   5569     vmovd      xmm0, eax  // high fraction 0..255
   5570     neg        eax
   5571     add        eax, 256
    5572     vmovd      xmm5, eax  // low fraction 255..1
   5573     vpunpcklbw xmm5, xmm5, xmm0
   5574     vpunpcklwd xmm5, xmm5, xmm5
   5575     vbroadcastss ymm5, xmm5
   5576 
    5577     mov        eax, 0x80808080  // 128 in each byte, for bias and rounding.
   5578     vmovd      xmm4, eax
   5579     vbroadcastss ymm4, xmm4
   5580 
   5581   xloop:
   5582     vmovdqu    ymm0, [esi]
   5583     vmovdqu    ymm2, [esi + edx]
   5584     vpunpckhbw ymm1, ymm0, ymm2  // mutates
   5585     vpunpcklbw ymm0, ymm0, ymm2
   5586     vpsubb     ymm1, ymm1, ymm4  // bias to signed image
   5587     vpsubb     ymm0, ymm0, ymm4
   5588     vpmaddubsw ymm1, ymm5, ymm1
   5589     vpmaddubsw ymm0, ymm5, ymm0
   5590     vpaddw     ymm1, ymm1, ymm4  // unbias and round
   5591     vpaddw     ymm0, ymm0, ymm4
   5592     vpsrlw     ymm1, ymm1, 8
   5593     vpsrlw     ymm0, ymm0, 8
   5594     vpackuswb  ymm0, ymm0, ymm1            // unmutates
   5595     vmovdqu    [esi + edi], ymm0
   5596     lea        esi, [esi + 32]
   5597     sub        ecx, 32
   5598     jg         xloop
   5599     jmp        xloop99
   5600 
   5601     // Blend 50 / 50.
    5602   xloop50:
    5603     vmovdqu    ymm0, [esi]
    5604     vpavgb     ymm0, ymm0, [esi + edx]
    5605     vmovdqu    [esi + edi], ymm0
    5606     lea        esi, [esi + 32]
    5607     sub        ecx, 32
    5608     jg         xloop50
    5609     jmp        xloop99
    5610 
    5611     // Blend 100 / 0 - Copy row unchanged.
    5612   xloop100:
    5613     rep movsb
   5614 
   5615   xloop99:
   5616     pop        edi
   5617     pop        esi
   5618     vzeroupper
   5619     ret
   5620   }
   5621 }
   5622 #endif  // HAS_INTERPOLATEROW_AVX2
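
// A minimal scalar sketch (not libyuv's own C fallback) of the blend the
// AVX2 row above performs: dst = (src0 * (256 - f) + src1 * f + 128) >> 8,
// where f is source_y_fraction. f == 0 degenerates to a copy and f == 128 to
// a pavgb-style average, which is why those cases are dispatched separately.
// The function name is illustrative only.
static void InterpolateRow_Sketch(uint8* dst_ptr,
                                  const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  int width,
                                  int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8)(
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8);
  }
}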
   5623 
   5624 // Bilinear filter 16x2 -> 16x1
   5625 // TODO(fbarchard): Consider allowing 256 using memcpy.
   5626 __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
   5627                                             const uint8* src_ptr,
   5628                                             ptrdiff_t src_stride,
   5629                                             int dst_width,
   5630                                             int source_y_fraction) {
   5631   __asm {
   5632     push       esi
   5633     push       edi
   5634 
   5635     mov        edi, [esp + 8 + 4]  // dst_ptr
   5636     mov        esi, [esp + 8 + 8]  // src_ptr
   5637     mov        edx, [esp + 8 + 12]  // src_stride
   5638     mov        ecx, [esp + 8 + 16]  // dst_width
   5639     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   5640     sub        edi, esi
   5641     // Dispatch to specialized filters if applicable.
   5642     cmp        eax, 0
    5643     je         xloop100  // 0 / 256.  Blend 100 / 0.
   5644     cmp        eax, 128
   5645     je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
   5646 
   5647     movd       xmm0, eax  // high fraction 0..255
   5648     neg        eax
   5649     add        eax, 256
   5650     movd       xmm5, eax  // low fraction 255..1
   5651     punpcklbw  xmm5, xmm0
   5652     punpcklwd  xmm5, xmm5
   5653     pshufd     xmm5, xmm5, 0
   5654     mov        eax, 0x80808080  // 128 for biasing image to signed.
   5655     movd       xmm4, eax
   5656     pshufd     xmm4, xmm4, 0x00
   5657 
   5658   xloop:
   5659     movdqu     xmm0, [esi]
   5660     movdqu     xmm2, [esi + edx]
   5661     movdqu     xmm1, xmm0
   5662     punpcklbw  xmm0, xmm2
   5663     punpckhbw  xmm1, xmm2
   5664     psubb      xmm0, xmm4            // bias image by -128
   5665     psubb      xmm1, xmm4
   5666     movdqa     xmm2, xmm5
   5667     movdqa     xmm3, xmm5
   5668     pmaddubsw  xmm2, xmm0
   5669     pmaddubsw  xmm3, xmm1
   5670     paddw      xmm2, xmm4
   5671     paddw      xmm3, xmm4
   5672     psrlw      xmm2, 8
   5673     psrlw      xmm3, 8
   5674     packuswb   xmm2, xmm3
   5675     movdqu     [esi + edi], xmm2
   5676     lea        esi, [esi + 16]
   5677     sub        ecx, 16
   5678     jg         xloop
   5679     jmp        xloop99
   5680 
   5681     // Blend 50 / 50.
   5682   xloop50:
   5683     movdqu     xmm0, [esi]
   5684     movdqu     xmm1, [esi + edx]
   5685     pavgb      xmm0, xmm1
   5686     movdqu     [esi + edi], xmm0
   5687     lea        esi, [esi + 16]
   5688     sub        ecx, 16
   5689     jg         xloop50
   5690     jmp        xloop99
   5691 
   5692     // Blend 100 / 0 - Copy row unchanged.
   5693   xloop100:
   5694     movdqu     xmm0, [esi]
   5695     movdqu     [esi + edi], xmm0
   5696     lea        esi, [esi + 16]
   5697     sub        ecx, 16
   5698     jg         xloop100
   5699 
   5700   xloop99:
   5701     pop        edi
   5702     pop        esi
   5703     ret
   5704   }
   5705 }
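
// Note on the 0x80808080 constant used by the two interpolators above:
// pmaddubsw multiplies unsigned bytes by signed bytes, so each source row is
// first biased by -128 (psubb) to fit the signed operand. Since the two
// fractions sum to 256, the bias contributes exactly 128 * 256 = 0x8000 per
// result word; adding the 0x8080 words back (paddw) is 0x8000 to cancel the
// bias plus 0x80 as the rounding term ahead of the logical shift right by 8.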
   5706 
   5707 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
   5708 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
   5709                                             uint8* dst_argb,
   5710                                             const uint8* shuffler,
   5711                                             int width) {
   5712   __asm {
   5713     mov        eax, [esp + 4]  // src_argb
   5714     mov        edx, [esp + 8]  // dst_argb
   5715     mov        ecx, [esp + 12]  // shuffler
   5716     movdqu     xmm5, [ecx]
   5717     mov        ecx, [esp + 16]  // width
   5718 
   5719   wloop:
   5720     movdqu     xmm0, [eax]
   5721     movdqu     xmm1, [eax + 16]
   5722     lea        eax, [eax + 32]
   5723     pshufb     xmm0, xmm5
   5724     pshufb     xmm1, xmm5
   5725     movdqu     [edx], xmm0
   5726     movdqu     [edx + 16], xmm1
   5727     lea        edx, [edx + 32]
   5728     sub        ecx, 8
   5729     jg         wloop
   5730     ret
   5731   }
   5732 }
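
// A minimal scalar sketch (not libyuv's own C fallback) of the pshufb-based
// shuffle above: shuffler[i] names which byte of each 4-byte source pixel
// supplies output byte i, so {3, 2, 1, 0} reverses the channel order. The
// function name is illustrative only.
static void ARGBShuffleRow_Sketch(const uint8* src_argb,
                                  uint8* dst_argb,
                                  const uint8* shuffler,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[x * 4 + 0] = src_argb[x * 4 + shuffler[0]];
    dst_argb[x * 4 + 1] = src_argb[x * 4 + shuffler[1]];
    dst_argb[x * 4 + 2] = src_argb[x * 4 + shuffler[2]];
    dst_argb[x * 4 + 3] = src_argb[x * 4 + shuffler[3]];
  }
}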
   5733 
   5734 #ifdef HAS_ARGBSHUFFLEROW_AVX2
   5735 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
   5736                                            uint8* dst_argb,
   5737                                            const uint8* shuffler,
   5738                                            int width) {
   5739   __asm {
   5740     mov        eax, [esp + 4]  // src_argb
   5741     mov        edx, [esp + 8]  // dst_argb
   5742     mov        ecx, [esp + 12]  // shuffler
   5743     vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
   5744     mov        ecx, [esp + 16]  // width
   5745 
   5746   wloop:
   5747     vmovdqu    ymm0, [eax]
   5748     vmovdqu    ymm1, [eax + 32]
   5749     lea        eax, [eax + 64]
   5750     vpshufb    ymm0, ymm0, ymm5
   5751     vpshufb    ymm1, ymm1, ymm5
   5752     vmovdqu    [edx], ymm0
   5753     vmovdqu    [edx + 32], ymm1
   5754     lea        edx, [edx + 64]
   5755     sub        ecx, 16
   5756     jg         wloop
   5757 
   5758     vzeroupper
   5759     ret
   5760   }
   5761 }
   5762 #endif  // HAS_ARGBSHUFFLEROW_AVX2
   5763 
   5764 __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
   5765                                            uint8* dst_argb,
   5766                                            const uint8* shuffler,
   5767                                            int width) {
   5768   __asm {
   5769     push       ebx
   5770     push       esi
   5771     mov        eax, [esp + 8 + 4]  // src_argb
   5772     mov        edx, [esp + 8 + 8]  // dst_argb
   5773     mov        esi, [esp + 8 + 12]  // shuffler
   5774     mov        ecx, [esp + 8 + 16]  // width
   5775     pxor       xmm5, xmm5
   5776 
   5777     mov        ebx, [esi]  // shuffler
   5778     cmp        ebx, 0x03000102
   5779     je         shuf_3012
   5780     cmp        ebx, 0x00010203
   5781     je         shuf_0123
   5782     cmp        ebx, 0x00030201
   5783     je         shuf_0321
   5784     cmp        ebx, 0x02010003
   5785     je         shuf_2103
   5786 
   5787     // TODO(fbarchard): Use one source pointer and 3 offsets.
   5788   shuf_any1:
   5789     movzx      ebx, byte ptr [esi]
   5790     movzx      ebx, byte ptr [eax + ebx]
   5791     mov        [edx], bl
   5792     movzx      ebx, byte ptr [esi + 1]
   5793     movzx      ebx, byte ptr [eax + ebx]
   5794     mov        [edx + 1], bl
   5795     movzx      ebx, byte ptr [esi + 2]
   5796     movzx      ebx, byte ptr [eax + ebx]
   5797     mov        [edx + 2], bl
   5798     movzx      ebx, byte ptr [esi + 3]
   5799     movzx      ebx, byte ptr [eax + ebx]
   5800     mov        [edx + 3], bl
   5801     lea        eax, [eax + 4]
   5802     lea        edx, [edx + 4]
   5803     sub        ecx, 1
   5804     jg         shuf_any1
   5805     jmp        shuf99
   5806 
   5807   shuf_0123:
   5808     movdqu     xmm0, [eax]
   5809     lea        eax, [eax + 16]
   5810     movdqa     xmm1, xmm0
   5811     punpcklbw  xmm0, xmm5
   5812     punpckhbw  xmm1, xmm5
   5813     pshufhw    xmm0, xmm0, 01Bh  // 1B = 00011011 = 0x0123 = BGRAToARGB
   5814     pshuflw    xmm0, xmm0, 01Bh
   5815     pshufhw    xmm1, xmm1, 01Bh
   5816     pshuflw    xmm1, xmm1, 01Bh
   5817     packuswb   xmm0, xmm1
   5818     movdqu     [edx], xmm0
   5819     lea        edx, [edx + 16]
   5820     sub        ecx, 4
   5821     jg         shuf_0123
   5822     jmp        shuf99
   5823 
   5824   shuf_0321:
   5825     movdqu     xmm0, [eax]
   5826     lea        eax, [eax + 16]
   5827     movdqa     xmm1, xmm0
   5828     punpcklbw  xmm0, xmm5
   5829     punpckhbw  xmm1, xmm5
   5830     pshufhw    xmm0, xmm0, 039h  // 39 = 00111001 = 0x0321 = RGBAToARGB
   5831     pshuflw    xmm0, xmm0, 039h
   5832     pshufhw    xmm1, xmm1, 039h
   5833     pshuflw    xmm1, xmm1, 039h
   5834     packuswb   xmm0, xmm1
   5835     movdqu     [edx], xmm0
   5836     lea        edx, [edx + 16]
   5837     sub        ecx, 4
   5838     jg         shuf_0321
   5839     jmp        shuf99
   5840 
   5841   shuf_2103:
   5842     movdqu     xmm0, [eax]
   5843     lea        eax, [eax + 16]
   5844     movdqa     xmm1, xmm0
   5845     punpcklbw  xmm0, xmm5
   5846     punpckhbw  xmm1, xmm5
   5847     pshufhw    xmm0, xmm0, 093h  // 93 = 10010011 = 0x2103 = ARGBToRGBA
   5848     pshuflw    xmm0, xmm0, 093h
   5849     pshufhw    xmm1, xmm1, 093h
   5850     pshuflw    xmm1, xmm1, 093h
   5851     packuswb   xmm0, xmm1
   5852     movdqu     [edx], xmm0
   5853     lea        edx, [edx + 16]
   5854     sub        ecx, 4
   5855     jg         shuf_2103
   5856     jmp        shuf99
   5857 
   5858   shuf_3012:
   5859     movdqu     xmm0, [eax]
   5860     lea        eax, [eax + 16]
   5861     movdqa     xmm1, xmm0
   5862     punpcklbw  xmm0, xmm5
   5863     punpckhbw  xmm1, xmm5
   5864     pshufhw    xmm0, xmm0, 0C6h  // C6 = 11000110 = 0x3012 = ABGRToARGB
   5865     pshuflw    xmm0, xmm0, 0C6h
   5866     pshufhw    xmm1, xmm1, 0C6h
   5867     pshuflw    xmm1, xmm1, 0C6h
   5868     packuswb   xmm0, xmm1
   5869     movdqu     [edx], xmm0
   5870     lea        edx, [edx + 16]
   5871     sub        ecx, 4
   5872     jg         shuf_3012
   5873 
   5874   shuf99:
   5875     pop        esi
   5876     pop        ebx
   5877     ret
   5878   }
   5879 }
   5880 
   5881 // YUY2 - Macro-pixel = 2 image pixels
    5882 // Y0U0Y1V0 Y2U2Y3V2 Y4U4Y5V4 ...
   5883 
   5884 // UYVY - Macro-pixel = 2 image pixels
    5885 // U0Y0V0Y1 U2Y2V2Y3 ...
   5886 
   5887 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
   5888                                           const uint8* src_u,
   5889                                           const uint8* src_v,
   5890                                           uint8* dst_frame,
   5891                                           int width) {
   5892   __asm {
   5893     push       esi
   5894     push       edi
   5895     mov        eax, [esp + 8 + 4]  // src_y
   5896     mov        esi, [esp + 8 + 8]  // src_u
   5897     mov        edx, [esp + 8 + 12]  // src_v
   5898     mov        edi, [esp + 8 + 16]  // dst_frame
   5899     mov        ecx, [esp + 8 + 20]  // width
   5900     sub        edx, esi
   5901 
   5902   convertloop:
   5903     movq       xmm2, qword ptr [esi]  // U
   5904     movq       xmm3, qword ptr [esi + edx]  // V
   5905     lea        esi, [esi + 8]
   5906     punpcklbw  xmm2, xmm3  // UV
   5907     movdqu     xmm0, [eax]  // Y
   5908     lea        eax, [eax + 16]
   5909     movdqa     xmm1, xmm0
   5910     punpcklbw  xmm0, xmm2  // YUYV
   5911     punpckhbw  xmm1, xmm2
   5912     movdqu     [edi], xmm0
   5913     movdqu     [edi + 16], xmm1
   5914     lea        edi, [edi + 32]
   5915     sub        ecx, 16
   5916     jg         convertloop
   5917 
   5918     pop        edi
   5919     pop        esi
   5920     ret
   5921   }
   5922 }
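
// A minimal scalar sketch (not libyuv's own C fallback) of the YUY2 packing
// above, assuming an even width: each pair of Y samples shares one U and one
// V, interleaved as Y0 U0 Y1 V0. The UYVY variant below differs only in the
// byte order. The function name is illustrative only.
static void I422ToYUY2Row_Sketch(const uint8* src_y,
                                 const uint8* src_u,
                                 const uint8* src_v,
                                 uint8* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}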
   5923 
   5924 __declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
   5925                                           const uint8* src_u,
   5926                                           const uint8* src_v,
   5927                                           uint8* dst_frame,
   5928                                           int width) {
   5929   __asm {
   5930     push       esi
   5931     push       edi
   5932     mov        eax, [esp + 8 + 4]  // src_y
   5933     mov        esi, [esp + 8 + 8]  // src_u
   5934     mov        edx, [esp + 8 + 12]  // src_v
   5935     mov        edi, [esp + 8 + 16]  // dst_frame
   5936     mov        ecx, [esp + 8 + 20]  // width
   5937     sub        edx, esi
   5938 
   5939   convertloop:
   5940     movq       xmm2, qword ptr [esi]  // U
   5941     movq       xmm3, qword ptr [esi + edx]  // V
   5942     lea        esi, [esi + 8]
   5943     punpcklbw  xmm2, xmm3  // UV
   5944     movdqu     xmm0, [eax]  // Y
   5945     movdqa     xmm1, xmm2
   5946     lea        eax, [eax + 16]
   5947     punpcklbw  xmm1, xmm0  // UYVY
   5948     punpckhbw  xmm2, xmm0
   5949     movdqu     [edi], xmm1
   5950     movdqu     [edi + 16], xmm2
   5951     lea        edi, [edi + 32]
   5952     sub        ecx, 16
   5953     jg         convertloop
   5954 
   5955     pop        edi
   5956     pop        esi
   5957     ret
   5958   }
   5959 }
   5960 
   5961 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
   5962 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
   5963                                               uint8* dst_argb,
   5964                                               const float* poly,
   5965                                               int width) {
   5966   __asm {
   5967     push       esi
   5968     mov        eax, [esp + 4 + 4] /* src_argb */
   5969     mov        edx, [esp + 4 + 8] /* dst_argb */
   5970     mov        esi, [esp + 4 + 12] /* poly */
   5971     mov        ecx, [esp + 4 + 16] /* width */
   5972     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
   5973 
   5974     // 2 pixel loop.
   5975  convertloop:
    5976     //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
    5977     //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
   5978     movq       xmm0, qword ptr [eax]  // BGRABGRA
   5979     lea        eax, [eax + 8]
   5980     punpcklbw  xmm0, xmm3
   5981     movdqa     xmm4, xmm0
   5982     punpcklwd  xmm0, xmm3  // pixel 0
   5983     punpckhwd  xmm4, xmm3  // pixel 1
   5984     cvtdq2ps   xmm0, xmm0  // 4 floats
   5985     cvtdq2ps   xmm4, xmm4
   5986     movdqa     xmm1, xmm0  // X
   5987     movdqa     xmm5, xmm4
   5988     mulps      xmm0, [esi + 16]  // C1 * X
   5989     mulps      xmm4, [esi + 16]
   5990     addps      xmm0, [esi]  // result = C0 + C1 * X
   5991     addps      xmm4, [esi]
   5992     movdqa     xmm2, xmm1
   5993     movdqa     xmm6, xmm5
   5994     mulps      xmm2, xmm1  // X * X
   5995     mulps      xmm6, xmm5
   5996     mulps      xmm1, xmm2  // X * X * X
   5997     mulps      xmm5, xmm6
   5998     mulps      xmm2, [esi + 32]  // C2 * X * X
   5999     mulps      xmm6, [esi + 32]
   6000     mulps      xmm1, [esi + 48]  // C3 * X * X * X
   6001     mulps      xmm5, [esi + 48]
   6002     addps      xmm0, xmm2  // result += C2 * X * X
   6003     addps      xmm4, xmm6
   6004     addps      xmm0, xmm1  // result += C3 * X * X * X
   6005     addps      xmm4, xmm5
   6006     cvttps2dq  xmm0, xmm0
   6007     cvttps2dq  xmm4, xmm4
   6008     packuswb   xmm0, xmm4
   6009     packuswb   xmm0, xmm0
   6010     movq       qword ptr [edx], xmm0
   6011     lea        edx, [edx + 8]
   6012     sub        ecx, 2
   6013     jg         convertloop
   6014     pop        esi
   6015     ret
   6016   }
   6017 }
   6018 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
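
// A minimal scalar sketch (not libyuv's own C fallback) of the per-channel
// cubic above. poly holds 16 floats: C0, C1, C2 and C3 as four-wide vectors,
// one coefficient per channel (B, G, R, A); the result is truncated and
// saturated to a byte, as cvttps2dq + packuswb do. The function name is
// illustrative only.
static void ARGBPolynomialRow_Sketch(const uint8* src_argb,
                                     uint8* dst_argb,
                                     const float* poly,
                                     int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    const float* c = poly + (i & 3);  // this channel's coefficients
    float x = (float)src_argb[i];
    float v = c[0] + c[4] * x + c[8] * x * x + c[12] * x * x * x;
    int iv = (int)v;  // truncate toward zero, like cvttps2dq
    if (iv < 0) iv = 0;  // saturate, like packuswb
    if (iv > 255) iv = 255;
    dst_argb[i] = (uint8)iv;
  }
}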
   6019 
   6020 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
   6021 __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
   6022                                               uint8* dst_argb,
   6023                                               const float* poly,
   6024                                               int width) {
   6025   __asm {
   6026     mov        eax, [esp + 4] /* src_argb */
   6027     mov        edx, [esp + 8] /* dst_argb */
   6028     mov        ecx, [esp + 12] /* poly */
   6029     vbroadcastf128 ymm4, [ecx]  // C0
   6030     vbroadcastf128 ymm5, [ecx + 16]  // C1
   6031     vbroadcastf128 ymm6, [ecx + 32]  // C2
   6032     vbroadcastf128 ymm7, [ecx + 48]  // C3
   6033     mov        ecx, [esp + 16] /* width */
   6034 
   6035     // 2 pixel loop.
   6036  convertloop:
   6037     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
   6038     lea         eax, [eax + 8]
   6039     vcvtdq2ps   ymm0, ymm0  // X 8 floats
   6040     vmulps      ymm2, ymm0, ymm0  // X * X
   6041     vmulps      ymm3, ymm0, ymm7  // C3 * X
   6042     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
   6043     vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
   6044     vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
   6045     vcvttps2dq  ymm0, ymm0
   6046     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
   6047     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
   6048     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
   6049     vmovq       qword ptr [edx], xmm0
   6050     lea         edx, [edx + 8]
   6051     sub         ecx, 2
   6052     jg          convertloop
   6053     vzeroupper
   6054     ret
   6055   }
   6056 }
   6057 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
   6058 
   6059 #ifdef HAS_HALFFLOATROW_SSE2
   6060 static float kExpBias = 1.9259299444e-34f;
   6061 __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
   6062                                          uint16* dst,
   6063                                          float scale,
   6064                                          int width) {
   6065   __asm {
   6066     mov        eax, [esp + 4] /* src */
   6067     mov        edx, [esp + 8] /* dst */
   6068     movd       xmm4, dword ptr [esp + 12] /* scale */
   6069     mov        ecx, [esp + 16] /* width */
   6070     mulss      xmm4, kExpBias
   6071     pshufd     xmm4, xmm4, 0
   6072     pxor       xmm5, xmm5
   6073     sub        edx, eax
   6074 
   6075     // 8 pixel loop.
   6076  convertloop:
   6077     movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
   6078     add         eax, 16
   6079     movdqa      xmm3, xmm2
   6080     punpcklwd   xmm2, xmm5
   6081     cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
   6082     punpckhwd   xmm3, xmm5
   6083     cvtdq2ps    xmm3, xmm3
   6084     mulps       xmm2, xmm4
   6085     mulps       xmm3, xmm4
   6086     psrld       xmm2, 13
   6087     psrld       xmm3, 13
   6088     packssdw    xmm2, xmm3
   6089     movdqu      [eax + edx - 16], xmm2
   6090     sub         ecx, 8
   6091     jg          convertloop
   6092     ret
   6093   }
   6094 }
   6095 #endif  // HAS_HALFFLOATROW_SSE2
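
// Note on kExpBias above: 1.9259299444e-34f is 2^-112. Multiplying a float
// by it lowers the exponent by 112 = 127 - 15, rewriting the IEEE single
// exponent bias as the half-float bias; shifting the bit pattern right by
// 13 = 23 - 10 then drops the extra mantissa bits, leaving the 16-bit half
// encoding (truncated, for values in the normal half range). A scalar sketch
// of the same trick; the function name is illustrative only.
static uint16 FloatToHalf_Sketch(float value) {
  union {
    float f;
    uint32 u;
  } bits;
  bits.f = value * 1.9259299444e-34f;  // 2^-112: rebias exponent 127 -> 15
  return (uint16)(bits.u >> 13);  // drop 13 mantissa bits; truncate
}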
   6096 
   6097 #ifdef HAS_HALFFLOATROW_AVX2
   6098 __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
   6099                                          uint16* dst,
   6100                                          float scale,
   6101                                          int width) {
   6102   __asm {
   6103     mov        eax, [esp + 4] /* src */
   6104     mov        edx, [esp + 8] /* dst */
   6105     movd       xmm4, dword ptr [esp + 12] /* scale */
   6106     mov        ecx, [esp + 16] /* width */
   6107 
   6108     vmulss     xmm4, xmm4, kExpBias
   6109     vbroadcastss ymm4, xmm4
   6110     vpxor      ymm5, ymm5, ymm5
   6111     sub        edx, eax
   6112 
   6113     // 16 pixel loop.
   6114  convertloop:
   6115     vmovdqu     ymm2, [eax]  // 16 shorts
   6116     add         eax, 32
   6117     vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
   6118     vpunpcklwd  ymm2, ymm2, ymm5
   6119     vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
   6120     vcvtdq2ps   ymm2, ymm2
   6121     vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
   6122     vmulps      ymm2, ymm2, ymm4
   6123     vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
   6124     vpsrld      ymm2, ymm2, 13
   6125     vpackssdw   ymm2, ymm2, ymm3
   6126     vmovdqu     [eax + edx - 32], ymm2
   6127     sub         ecx, 16
   6128     jg          convertloop
   6129     vzeroupper
   6130     ret
   6131   }
   6132 }
   6133 #endif  // HAS_HALFFLOATROW_AVX2
   6134 
   6135 #ifdef HAS_HALFFLOATROW_F16C
   6136 __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
   6137                                          uint16* dst,
   6138                                          float scale,
   6139                                          int width) {
   6140   __asm {
   6141     mov        eax, [esp + 4] /* src */
   6142     mov        edx, [esp + 8] /* dst */
   6143     vbroadcastss ymm4, [esp + 12] /* scale */
   6144     mov        ecx, [esp + 16] /* width */
   6145     sub        edx, eax
   6146 
   6147     // 16 pixel loop.
   6148  convertloop:
   6149     vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
   6150     vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
   6151     add         eax, 32
   6152     vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
   6153     vcvtdq2ps   ymm3, ymm3
   6154     vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
   6155     vmulps      ymm3, ymm3, ymm4
   6156     vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
   6157     vcvtps2ph   xmm3, ymm3, 3
    6158     vmovdqu     [eax + edx - 32], xmm2
    6159     vmovdqu     [eax + edx - 32 + 16], xmm3
   6160     sub         ecx, 16
   6161     jg          convertloop
   6162     vzeroupper
   6163     ret
   6164   }
   6165 }
   6166 #endif  // HAS_HALFFLOATROW_F16C
   6167 
   6168 #ifdef HAS_ARGBCOLORTABLEROW_X86
    6169 // Transform ARGB pixels with a color table.
   6170 __declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
   6171                                              const uint8* table_argb,
   6172                                              int width) {
   6173   __asm {
   6174     push       esi
   6175     mov        eax, [esp + 4 + 4] /* dst_argb */
   6176     mov        esi, [esp + 4 + 8] /* table_argb */
   6177     mov        ecx, [esp + 4 + 12] /* width */
   6178 
   6179     // 1 pixel loop.
   6180   convertloop:
   6181     movzx      edx, byte ptr [eax]
   6182     lea        eax, [eax + 4]
   6183     movzx      edx, byte ptr [esi + edx * 4]
   6184     mov        byte ptr [eax - 4], dl
   6185     movzx      edx, byte ptr [eax - 4 + 1]
   6186     movzx      edx, byte ptr [esi + edx * 4 + 1]
   6187     mov        byte ptr [eax - 4 + 1], dl
   6188     movzx      edx, byte ptr [eax - 4 + 2]
   6189     movzx      edx, byte ptr [esi + edx * 4 + 2]
   6190     mov        byte ptr [eax - 4 + 2], dl
   6191     movzx      edx, byte ptr [eax - 4 + 3]
   6192     movzx      edx, byte ptr [esi + edx * 4 + 3]
   6193     mov        byte ptr [eax - 4 + 3], dl
   6194     dec        ecx
   6195     jg         convertloop
   6196     pop        esi
   6197     ret
   6198   }
   6199 }
   6200 #endif  // HAS_ARGBCOLORTABLEROW_X86
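
// A minimal scalar sketch (not libyuv's own C fallback) of the in-place
// lookup above: each channel indexes its own stripe of the interleaved
// 256-entry BGRA table. The function name is illustrative only.
static void ARGBColorTableRow_Sketch(uint8* dst_argb,
                                     const uint8* table_argb,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[x * 4 + 0] = table_argb[dst_argb[x * 4 + 0] * 4 + 0];
    dst_argb[x * 4 + 1] = table_argb[dst_argb[x * 4 + 1] * 4 + 1];
    dst_argb[x * 4 + 2] = table_argb[dst_argb[x * 4 + 2] * 4 + 2];
    dst_argb[x * 4 + 3] = table_argb[dst_argb[x * 4 + 3] * 4 + 3];
  }
}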
   6201 
   6202 #ifdef HAS_RGBCOLORTABLEROW_X86
    6203 // Transform RGB pixels with a color table.
   6204 __declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
   6205                                             const uint8* table_argb,
   6206                                             int width) {
   6207   __asm {
   6208     push       esi
   6209     mov        eax, [esp + 4 + 4] /* dst_argb */
   6210     mov        esi, [esp + 4 + 8] /* table_argb */
   6211     mov        ecx, [esp + 4 + 12] /* width */
   6212 
   6213     // 1 pixel loop.
   6214   convertloop:
   6215     movzx      edx, byte ptr [eax]
   6216     lea        eax, [eax + 4]
   6217     movzx      edx, byte ptr [esi + edx * 4]
   6218     mov        byte ptr [eax - 4], dl
   6219     movzx      edx, byte ptr [eax - 4 + 1]
   6220     movzx      edx, byte ptr [esi + edx * 4 + 1]
   6221     mov        byte ptr [eax - 4 + 1], dl
   6222     movzx      edx, byte ptr [eax - 4 + 2]
   6223     movzx      edx, byte ptr [esi + edx * 4 + 2]
   6224     mov        byte ptr [eax - 4 + 2], dl
   6225     dec        ecx
   6226     jg         convertloop
   6227 
   6228     pop        esi
   6229     ret
   6230   }
   6231 }
   6232 #endif  // HAS_RGBCOLORTABLEROW_X86
   6233 
   6234 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
    6235 // Transform RGB pixels with a luma table.
   6236 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
   6237                                                    uint8* dst_argb,
   6238                                                    int width,
   6239                                                    const uint8* luma,
   6240                                                    uint32 lumacoeff) {
   6241   __asm {
   6242     push       esi
   6243     push       edi
   6244     mov        eax, [esp + 8 + 4] /* src_argb */
   6245     mov        edi, [esp + 8 + 8] /* dst_argb */
   6246     mov        ecx, [esp + 8 + 12] /* width */
   6247     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
   6248     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
   6249     pshufd     xmm2, xmm2, 0
   6250     pshufd     xmm3, xmm3, 0
   6251     pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
   6252     psllw      xmm4, 8
   6253     pxor       xmm5, xmm5
   6254 
   6255     // 4 pixel loop.
   6256   convertloop:
   6257     movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
   6258     pmaddubsw  xmm0, xmm3
   6259     phaddw     xmm0, xmm0
   6260     pand       xmm0, xmm4  // mask out low bits
   6261     punpcklwd  xmm0, xmm5
   6262     paddd      xmm0, xmm2  // add table base
   6263     movd       esi, xmm0
   6264     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   6265 
   6266     movzx      edx, byte ptr [eax]
   6267     movzx      edx, byte ptr [esi + edx]
   6268     mov        byte ptr [edi], dl
   6269     movzx      edx, byte ptr [eax + 1]
   6270     movzx      edx, byte ptr [esi + edx]
   6271     mov        byte ptr [edi + 1], dl
   6272     movzx      edx, byte ptr [eax + 2]
   6273     movzx      edx, byte ptr [esi + edx]
   6274     mov        byte ptr [edi + 2], dl
   6275     movzx      edx, byte ptr [eax + 3]  // copy alpha.
   6276     mov        byte ptr [edi + 3], dl
   6277 
   6278     movd       esi, xmm0
   6279     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   6280 
   6281     movzx      edx, byte ptr [eax + 4]
   6282     movzx      edx, byte ptr [esi + edx]
   6283     mov        byte ptr [edi + 4], dl
   6284     movzx      edx, byte ptr [eax + 5]
   6285     movzx      edx, byte ptr [esi + edx]
   6286     mov        byte ptr [edi + 5], dl
   6287     movzx      edx, byte ptr [eax + 6]
   6288     movzx      edx, byte ptr [esi + edx]
   6289     mov        byte ptr [edi + 6], dl
   6290     movzx      edx, byte ptr [eax + 7]  // copy alpha.
   6291     mov        byte ptr [edi + 7], dl
   6292 
   6293     movd       esi, xmm0
   6294     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   6295 
   6296     movzx      edx, byte ptr [eax + 8]
   6297     movzx      edx, byte ptr [esi + edx]
   6298     mov        byte ptr [edi + 8], dl
   6299     movzx      edx, byte ptr [eax + 9]
   6300     movzx      edx, byte ptr [esi + edx]
   6301     mov        byte ptr [edi + 9], dl
   6302     movzx      edx, byte ptr [eax + 10]
   6303     movzx      edx, byte ptr [esi + edx]
   6304     mov        byte ptr [edi + 10], dl
   6305     movzx      edx, byte ptr [eax + 11]  // copy alpha.
   6306     mov        byte ptr [edi + 11], dl
   6307 
   6308     movd       esi, xmm0
   6309 
   6310     movzx      edx, byte ptr [eax + 12]
   6311     movzx      edx, byte ptr [esi + edx]
   6312     mov        byte ptr [edi + 12], dl
   6313     movzx      edx, byte ptr [eax + 13]
   6314     movzx      edx, byte ptr [esi + edx]
   6315     mov        byte ptr [edi + 13], dl
   6316     movzx      edx, byte ptr [eax + 14]
   6317     movzx      edx, byte ptr [esi + edx]
   6318     mov        byte ptr [edi + 14], dl
   6319     movzx      edx, byte ptr [eax + 15]  // copy alpha.
   6320     mov        byte ptr [edi + 15], dl
   6321 
   6322     lea        eax, [eax + 16]
   6323     lea        edi, [edi + 16]
   6324     sub        ecx, 4
   6325     jg         convertloop
   6326 
   6327     pop        edi
   6328     pop        esi
   6329     ret
   6330   }
   6331 }
   6332 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
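
// A minimal scalar sketch (not libyuv's own C fallback) of the luma lookup
// above: a weighted luma, masked to a multiple of 256 as the pand does,
// selects one 256-byte row of the luma table; B, G and R are remapped
// through that row and alpha is copied. The function name is illustrative
// only.
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb,
                                         int width,
                                         const uint8* luma,
                                         uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;          // B weight
  const uint32 gc = (lumacoeff >> 8) & 0xff;   // G weight
  const uint32 rc = (lumacoeff >> 16) & 0xff;  // R weight
  int x;
  for (x = 0; x < width; ++x) {
    uint32 lum = src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc;
    const uint8* row = luma + (lum & 0xff00);  // table row base
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // copy alpha unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}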
   6333 
    6334 #endif  // defined(_M_IX86)
   6335 
   6336 #ifdef __cplusplus
   6337 }  // extern "C"
   6338 }  // namespace libyuv
   6339 #endif
   6340 
   6341 #endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
   6342