/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422                                                             \
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
    u_buf += 4;                                                                \
    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
    y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422                                                            \
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
    u_buf += 4;                                                                \
    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
    y_buf += 8;                                                                \
    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
    a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants)                                                 \
    xmm1 = _mm_loadu_si128(&xmm0);                                             \
    xmm2 = _mm_loadu_si128(&xmm0);                                             \
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
    xmm2 = _mm_packus_epi16(xmm2, xmm2);
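
// Scalar reference for the READYUV422 + YUVTORGB math above, for one pixel.
// This is an illustrative sketch, not used by the SIMD path.  The constant
// names and BT.601 values below are assumptions for illustration; the real
// per-colorspace constants live in the YuvConstants tables.
static __inline int32 ClampScalar(int32 v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
static __inline void YuvPixelScalar(uint8 y, uint8 u, uint8 v,
                                    uint8* b, uint8* g, uint8* r) {
  const int32 kUB = -128, kUG = 25, kVG = 52, kVR = -102;  // round(c * 64)
  const int32 kYG = 18997;   // 16.16 fixed point luma scale
  const int32 kYGB = -1160;  // luma bias, includes rounding
  const int32 kBB = kUB * 128 + kYGB;
  const int32 kBG = kUG * 128 + kVG * 128 + kYGB;
  const int32 kBR = kVR * 128 + kYGB;
  int32 y1 = ((y * 0x0101) * kYG) >> 16;  // matches _mm_mulhi_epu16 above
  *b = (uint8)ClampScalar((-(u * kUB) + y1 + kBB) >> 6);
  *g = (uint8)ClampScalar((-(u * kUG + v * kVG) + y1 + kBG) >> 6);
  *r = (uint8)ClampScalar((-(v * kVR) + y1 + kBR) >> 6);
}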

// Store 8 ARGB values.
#define STOREARGB                                                              \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
    xmm1 = _mm_loadu_si128(&xmm0);                                             \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
    dst_argb += 32;


#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              const uint8* a_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24 for I422ToRGB24.  First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]        // src_y
    mov        edx, [esp + 8]        // dst_argb
    mov        ecx, [esp + 12]       // width
    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
    pslld      xmm5, 24

  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
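
// Scalar reference for J400ToARGB (an illustrative sketch; not called by the
// SIMD paths): replicate the gray value into B, G and R and set A opaque.
static void J400ToARGBRow_Reference(const uint8* src_y, uint8* dst_argb,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 y = src_y[x];
    *(uint32*)(dst_argb + x * 4) = y | (y << 8) | (y << 16) | 0xff000000u;
  }
}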

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov         eax, [esp + 4]        // src_y
    mov         edx, [esp + 8]        // dst_argb
    mov         ecx, [esp + 12]       // width
    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
    vpslld      ymm5, ymm5, 24

  convertloop:
    vmovdqu     xmm0, [eax]
    lea         eax,  [eax + 16]
    vpermq      ymm0, ymm0, 0xd8
    vpunpcklbw  ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhwd  ymm1, ymm0, ymm0
    vpunpcklwd  ymm0, ymm0, ymm0
    vpor        ymm0, ymm0, ymm5
    vpor        ymm1, ymm1, ymm5
    vmovdqu     [edx], ymm0
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_rgb24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int width) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked)
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_rgb24
    mov       ecx, [esp + 12]  // width
    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 4]
    movdqu    xmm2, [eax + 8]
    lea       eax, [eax + 24]
    pshufb    xmm0, xmm3
    pshufb    xmm1, xmm4
    pshufb    xmm2, xmm5
    movq      qword ptr [edx], xmm0
    movq      qword ptr [edx + 8], xmm1
    movq      qword ptr [edx + 16], xmm2
    lea       edx, [edx + 24]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
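
// Scalar reference for RAWToRGB24 (an illustrative sketch; the shuffle
// tables above do the same thing 8 pixels at a time): RAW and RGB24 differ
// only in byte order, so each 3 byte pixel is reversed.
static void RAWToRGB24Row_Reference(const uint8* src_raw, uint8* dst_rgb24,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_rgb24[0] = src_raw[2];
    dst_rgb24[1] = src_raw[1];
    dst_rgb24[2] = src_raw[0];
    src_raw += 3;
    dst_rgb24 += 3;
  }
}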

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3    // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    pand      xmm0, xmm4    // G in middle 6 bits
    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
    por       xmm0, xmm7    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
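
// Scalar sketch of the pmul bit replication used above (illustrative only).
// A 5 bit value v becomes (v << 3) | (v >> 2), which equals
// ((v << 11) * 0x0108) >> 16, i.e. pmulhuw with the 0x0108 multiplier.
static __inline uint8 Replicate5To8Scalar(int v5) {
  return (uint8)((v5 << 3) | (v5 >> 2));  // 0x1f -> 0xff
}
static __inline uint8 Replicate6To8Scalar(int v6) {
  return (uint8)((v6 << 2) | (v6 >> 4));  // 0x3f -> 0xff
}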

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
    vpsllw     ymm4, ymm4, 10
    vpsrlw     ymm4, ymm4, 5
    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    sub        edx, eax

 convertloop:
    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2    // RB
    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
    vpor       ymm0, ymm0, ymm7    // AG
    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax,  [esp + 4]   // src_argb1555
    mov        edx,  [esp + 8]   // dst_argb
    mov        ecx,  [esp + 12]  // width
    sub        edx,  eax
    sub        edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
    vpand      ymm1, ymm1, ymm3
    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2    // RB
    vpsraw     ymm2, ymm0, 8       // A
    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
    vpand      ymm2, ymm2, ymm7
    vpor       ymm0, ymm0, ymm2    // AG
    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd     xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
    mov       eax,  [esp + 4]   // src_argb4444
    mov       edx,  [esp + 8]   // dst_argb
    mov       ecx,  [esp + 12]  // width
    sub       edx,  eax
    sub       edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
    vpand      ymm2, ymm0, ymm5    // mask high nibbles
    vpand      ymm0, ymm0, ymm4    // mask low nibbles
    vpsrlw     ymm3, ymm2, 4
    vpsllw     ymm1, ymm0, 4
    vpor       ymm2, ymm2, ymm3
    vpor       ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq     ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_argb1555
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1       // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // G in middle 5 bits
    psraw     xmm2, 8       // A
    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]   // src_argb4444
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // mask low nibbles
    pand      xmm2, xmm5    // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
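
// Scalar sketch of the 4444 nibble expansion above (illustrative only):
// each 4 bit channel n expands to (n << 4) | n, so 0xf becomes 0xff.
static __inline uint8 Replicate4To8Scalar(int n4) {
  return (uint8)((n4 << 4) | n4);
}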

__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

 convertloop:
    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1   // store 1
    movdqu    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW

 convertloop:
    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1   // store 1
    movdqu    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
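
// Scalar reference for the 565 packing above (an illustrative sketch):
// keep the top 5, 6 and 5 bits of B, G and R.
static __inline uint16 ARGBToRGB565PixelScalar(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}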

__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {

    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    movd      xmm6, [esp + 12] // dither4
    mov       ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6       // make dither 16 bytes
    movdqa    xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    paddusb   xmm0, xmm6    // add dither
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
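
// Sketch of the dither step above (illustrative only): dither4 holds a
// 4 byte pattern, one byte per pixel, added to every channel with unsigned
// saturation (paddusb) before the 565 truncation.
static __inline uint8 AddDitherScalar(int channel, const uint32 dither4,
                                      int x) {
  int v = channel + ((const uint8*)&dither4)[x & 3];
  return (uint8)(v > 255 ? 255 : v);
}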

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov        ecx, [esp + 16]     // width
    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
    vpermq     ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpaddusb   ymm0, ymm0, ymm6    // add dither
    vpsrld     ymm2, ymm0, 5       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrld     ymm0, ymm0, 8       // R
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4       // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4       // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    pslld     xmm7, 15

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    movdqa    xmm3, xmm0    // R
    psrad     xmm0, 16      // A
    psrld     xmm1, 3       // B
    psrld     xmm2, 6       // G
    psrld     xmm3, 9       // R
    pand      xmm0, xmm7    // A
    pand      xmm1, xmm4    // B
    pand      xmm2, xmm5    // G
    pand      xmm3, xmm6    // R
    por       xmm0, xmm1    // BA
    por       xmm2, xmm3    // GR
    por       xmm0, xmm2    // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
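
// Scalar reference for the 1555 packing above (an illustrative sketch):
// 1 bit of alpha from the top bit of A, 5 bits each of R, G and B.
static __inline uint16 ARGBToARGB1555PixelScalar(uint8 b, uint8 g, uint8 r,
                                                 uint8 a) {
  return (uint16)((b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) |
                  ((a >> 7) << 15));
}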

__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    psrlw     xmm3, 8

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3    // low nibble
    pand      xmm1, xmm4    // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // width
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrld     ymm0, ymm0, 8       // R
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // width
    vpcmpeqb   ymm4, ymm4, ymm4
    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
    vpslld     ymm7, ymm7, 15

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm3, ymm0, 9       // R
    vpsrld     ymm2, ymm0, 6       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrad     ymm0, ymm0, 16      // A
    vpand      ymm3, ymm3, ymm6    // R
    vpand      ymm2, ymm2, ymm5    // G
    vpand      ymm1, ymm1, ymm4    // B
    vpand      ymm0, ymm0, ymm7    // A
    vpor       ymm0, ymm0, ymm1    // BA
    vpor       ymm2, ymm2, ymm3    // GR
    vpor       ymm0, ymm0, ymm2    // BGRA
    vpackssdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
    vpsllw     ymm4, ymm4, 12
    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpand      ymm1, ymm0, ymm4    // high nibble
    vpand      ymm0, ymm0, ymm3    // low nibble
    vpsrld     ymm1, ymm1, 8
    vpsrld     ymm0, ymm0, 4
    vpor       ymm0, ymm0, ymm1
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, xmmword ptr kARGBToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
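
// Scalar reference for the Y computation above (an illustrative sketch).
// ARGB is B,G,R,A in memory; kARGBToY holds 7 bit fixed point coefficients,
// and 16 is added for broadcast range Y.  The YJ variant below uses full
// range coefficients, adds 64 before the shift for rounding, and omits the
// +16.
static __inline uint8 ARGBToYPixelScalar(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}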

// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked)
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5  // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd to undo the vphaddw + vpackuswb lane mutation.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToY
    vbroadcastf128 ymm5, xmmword ptr kAddY16
    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYJROW_AVX2

__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, xmmword ptr kBGRAToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, xmmword ptr kABGRToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked)
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, xmmword ptr kRGBAToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, xmmword ptr kAddUV128
    movdqa     xmm6, xmmword ptr kARGBToV
    movdqa     xmm7, xmmword ptr kARGBToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
   1428     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1429     movdqa     xmm1, xmm0
   1430     movdqa     xmm3, xmm2
   1431     pmaddubsw  xmm0, xmm7  // U
   1432     pmaddubsw  xmm2, xmm7
   1433     pmaddubsw  xmm1, xmm6  // V
   1434     pmaddubsw  xmm3, xmm6
   1435     phaddw     xmm0, xmm2
   1436     phaddw     xmm1, xmm3
   1437     psraw      xmm0, 8
   1438     psraw      xmm1, 8
   1439     packsswb   xmm0, xmm1
   1440     paddb      xmm0, xmm5            // -> unsigned
   1441 
   1442     // step 3 - store 8 U and 8 V values
   1443     movlps     qword ptr [edx], xmm0 // U
   1444     movhps     qword ptr [edx + edi], xmm0 // V
   1445     lea        edx, [edx + 8]
   1446     sub        ecx, 16
   1447     jg         convertloop
   1448 
   1449     pop        edi
   1450     pop        esi
   1451     ret
   1452   }
   1453 }
   1454 
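// The UV row above works in three steps: pavgb averages two source rows,
// shufps plus pavgb box-average horizontal pairs (2x2 -> 1 sample), and
// pmaddubsw forms the chroma sums with 8 fractional bits before the +128
// bias. A scalar sketch for one averaged r,g,b sample, with illustrative
// BT.601 weights (the authoritative values are kARGBToU and kARGBToV):
//
//   static uint8 RGBToUSketch(uint8 r, uint8 g, uint8 b) {
//     return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
//   }
//   static uint8 RGBToVSketch(uint8 r, uint8 g, uint8 b) {
//     return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
//   }
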
   1455 __declspec(naked)
   1456 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1457                         uint8* dst_u, uint8* dst_v, int width) {
   1458   __asm {
   1459     push       esi
   1460     push       edi
   1461     mov        eax, [esp + 8 + 4]   // src_argb
   1462     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1463     mov        edx, [esp + 8 + 12]  // dst_u
   1464     mov        edi, [esp + 8 + 16]  // dst_v
   1465     mov        ecx, [esp + 8 + 20]  // width
   1466     movdqa     xmm5, xmmword ptr kAddUVJ128
   1467     movdqa     xmm6, xmmword ptr kARGBToVJ
   1468     movdqa     xmm7, xmmword ptr kARGBToUJ
   1469     sub        edi, edx             // stride from u to v
   1470 
   1471  convertloop:
   1472     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1473     movdqu     xmm0, [eax]
   1474     movdqu     xmm4, [eax + esi]
   1475     pavgb      xmm0, xmm4
   1476     movdqu     xmm1, [eax + 16]
   1477     movdqu     xmm4, [eax + esi + 16]
   1478     pavgb      xmm1, xmm4
   1479     movdqu     xmm2, [eax + 32]
   1480     movdqu     xmm4, [eax + esi + 32]
   1481     pavgb      xmm2, xmm4
   1482     movdqu     xmm3, [eax + 48]
   1483     movdqu     xmm4, [eax + esi + 48]
   1484     pavgb      xmm3, xmm4
   1485 
   1486     lea        eax,  [eax + 64]
   1487     movdqa     xmm4, xmm0
   1488     shufps     xmm0, xmm1, 0x88
   1489     shufps     xmm4, xmm1, 0xdd
   1490     pavgb      xmm0, xmm4
   1491     movdqa     xmm4, xmm2
   1492     shufps     xmm2, xmm3, 0x88
   1493     shufps     xmm4, xmm3, 0xdd
   1494     pavgb      xmm2, xmm4
   1495 
   1496     // step 2 - convert to U and V
   1497     // from here down is very similar to Y code except
   1498     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1499     movdqa     xmm1, xmm0
   1500     movdqa     xmm3, xmm2
   1501     pmaddubsw  xmm0, xmm7  // U
   1502     pmaddubsw  xmm2, xmm7
   1503     pmaddubsw  xmm1, xmm6  // V
   1504     pmaddubsw  xmm3, xmm6
   1505     phaddw     xmm0, xmm2
   1506     phaddw     xmm1, xmm3
   1507     paddw      xmm0, xmm5  // +.5 rounding -> unsigned
   1508     paddw      xmm1, xmm5
   1509     psraw      xmm0, 8
   1510     psraw      xmm1, 8
   1511     packsswb   xmm0, xmm1
   1512 
   1513     // step 3 - store 8 U and 8 V values
   1514     movlps     qword ptr [edx], xmm0 // U
   1515     movhps     qword ptr [edx + edi], xmm0 // V
   1516     lea        edx, [edx + 8]
   1517     sub        ecx, 16
   1518     jg         convertloop
   1519 
   1520     pop        edi
   1521     pop        esi
   1522     ret
   1523   }
   1524 }
   1525 
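// The J (JPEG, full-range) variant above differs from ARGBToUVRow_SSSE3 in
// its constants and its rounding. Adding the 0x8080 word bias with paddw
// before psraw gives round-half-up and the unsigned offset in one step:
//
//   (sum + 0x8080) >> 8  ==  ((sum + 0x80) >> 8) + 0x80
//
// whereas the non-J path truncates with psraw first and adds the 0x80 bias
// bytewise after packing.
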
   1526 #ifdef HAS_ARGBTOUVROW_AVX2
   1527 __declspec(naked)
   1528 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
   1529                       uint8* dst_u, uint8* dst_v, int width) {
   1530   __asm {
   1531     push       esi
   1532     push       edi
   1533     mov        eax, [esp + 8 + 4]   // src_argb
   1534     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1535     mov        edx, [esp + 8 + 12]  // dst_u
   1536     mov        edi, [esp + 8 + 16]  // dst_v
   1537     mov        ecx, [esp + 8 + 20]  // width
   1538     vbroadcastf128 ymm5, xmmword ptr kAddUV128
   1539     vbroadcastf128 ymm6, xmmword ptr kARGBToV
   1540     vbroadcastf128 ymm7, xmmword ptr kARGBToU
   1541     sub        edi, edx             // stride from u to v
   1542 
   1543  convertloop:
   1544     /* step 1 - subsample 32x2 argb pixels to 16x1 */
   1545     vmovdqu    ymm0, [eax]
   1546     vmovdqu    ymm1, [eax + 32]
   1547     vmovdqu    ymm2, [eax + 64]
   1548     vmovdqu    ymm3, [eax + 96]
   1549     vpavgb     ymm0, ymm0, [eax + esi]
   1550     vpavgb     ymm1, ymm1, [eax + esi + 32]
   1551     vpavgb     ymm2, ymm2, [eax + esi + 64]
   1552     vpavgb     ymm3, ymm3, [eax + esi + 96]
   1553     lea        eax,  [eax + 128]
   1554     vshufps    ymm4, ymm0, ymm1, 0x88
   1555     vshufps    ymm0, ymm0, ymm1, 0xdd
   1556     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
   1557     vshufps    ymm4, ymm2, ymm3, 0x88
   1558     vshufps    ymm2, ymm2, ymm3, 0xdd
   1559     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
   1560 
   1561     // step 2 - convert to U and V
   1562     // from here down is very similar to Y code except
   1563     // instead of 32 different pixels, it's 16 pixels of U and 16 of V
   1564     vpmaddubsw ymm1, ymm0, ymm7  // U
   1565     vpmaddubsw ymm3, ymm2, ymm7
   1566     vpmaddubsw ymm0, ymm0, ymm6  // V
   1567     vpmaddubsw ymm2, ymm2, ymm6
   1568     vphaddw    ymm1, ymm1, ymm3  // mutates
   1569     vphaddw    ymm0, ymm0, ymm2
   1570     vpsraw     ymm1, ymm1, 8
   1571     vpsraw     ymm0, ymm0, 8
   1572     vpacksswb  ymm0, ymm1, ymm0  // mutates
   1573     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
   1574     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
   1575     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
   1576 
   1577     // step 3 - store 16 U and 16 V values
   1578     vextractf128 [edx], ymm0, 0 // U
   1579     vextractf128 [edx + edi], ymm0, 1 // V
   1580     lea        edx, [edx + 16]
   1581     sub        ecx, 32
   1582     jg         convertloop
   1583 
   1584     pop        edi
   1585     pop        esi
   1586     vzeroupper
   1587     ret
   1588   }
   1589 }
   1590 #endif  // HAS_ARGBTOUVROW_AVX2
   1591 
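// Note: AVX2 vphaddw and vpacksswb operate within 128-bit lanes, so the
// packed result above comes out lane-interleaved; the vpermq 0xd8 and the
// kShufARGBToUV_AVX vpshufb restore linear byte order before the two
// vextractf128 stores.
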
   1592 #ifdef HAS_ARGBTOUVJROW_AVX2
   1593 __declspec(naked)
   1594 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
   1595                        uint8* dst_u, uint8* dst_v, int width) {
   1596   __asm {
   1597     push       esi
   1598     push       edi
   1599     mov        eax, [esp + 8 + 4]   // src_argb
   1600     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1601     mov        edx, [esp + 8 + 12]  // dst_u
   1602     mov        edi, [esp + 8 + 16]  // dst_v
   1603     mov        ecx, [esp + 8 + 20]  // width
   1604     vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
   1605     vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
   1606     vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
   1607     sub        edi, edx             // stride from u to v
   1608 
   1609  convertloop:
   1610     /* step 1 - subsample 32x2 argb pixels to 16x1 */
   1611     vmovdqu    ymm0, [eax]
   1612     vmovdqu    ymm1, [eax + 32]
   1613     vmovdqu    ymm2, [eax + 64]
   1614     vmovdqu    ymm3, [eax + 96]
   1615     vpavgb     ymm0, ymm0, [eax + esi]
   1616     vpavgb     ymm1, ymm1, [eax + esi + 32]
   1617     vpavgb     ymm2, ymm2, [eax + esi + 64]
   1618     vpavgb     ymm3, ymm3, [eax + esi + 96]
   1619     lea        eax,  [eax + 128]
   1620     vshufps    ymm4, ymm0, ymm1, 0x88
   1621     vshufps    ymm0, ymm0, ymm1, 0xdd
   1622     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
   1623     vshufps    ymm4, ymm2, ymm3, 0x88
   1624     vshufps    ymm2, ymm2, ymm3, 0xdd
   1625     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
   1626 
   1627     // step 2 - convert to U and V
   1628     // from here down is very similar to Y code except
   1629     // instead of 32 different pixels, it's 16 pixels of U and 16 of V
   1630     vpmaddubsw ymm1, ymm0, ymm7  // U
   1631     vpmaddubsw ymm3, ymm2, ymm7
   1632     vpmaddubsw ymm0, ymm0, ymm6  // V
   1633     vpmaddubsw ymm2, ymm2, ymm6
   1634     vphaddw    ymm1, ymm1, ymm3  // mutates
   1635     vphaddw    ymm0, ymm0, ymm2
   1636     vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
   1637     vpaddw     ymm0, ymm0, ymm5
   1638     vpsraw     ymm1, ymm1, 8
   1639     vpsraw     ymm0, ymm0, 8
   1640     vpacksswb  ymm0, ymm1, ymm0  // mutates
   1641     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
   1642     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
   1643 
   1644     // step 3 - store 16 U and 16 V values
   1645     vextractf128 [edx], ymm0, 0 // U
   1646     vextractf128 [edx + edi], ymm0, 1 // V
   1647     lea        edx, [edx + 16]
   1648     sub        ecx, 32
   1649     jg         convertloop
   1650 
   1651     pop        edi
   1652     pop        esi
   1653     vzeroupper
   1654     ret
   1655   }
   1656 }
   1657 #endif  // HAS_ARGBTOUVJROW_AVX2
   1658 
   1659 __declspec(naked)
   1660 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
   1661                           uint8* dst_u, uint8* dst_v, int width) {
   1662   __asm {
   1663     push       edi
   1664     mov        eax, [esp + 4 + 4]   // src_argb
   1665     mov        edx, [esp + 4 + 8]   // dst_u
   1666     mov        edi, [esp + 4 + 12]  // dst_v
   1667     mov        ecx, [esp + 4 + 16]  // width
   1668     movdqa     xmm5, xmmword ptr kAddUV128
   1669     movdqa     xmm6, xmmword ptr kARGBToV
   1670     movdqa     xmm7, xmmword ptr kARGBToU
   1671     sub        edi, edx             // stride from u to v
   1672 
   1673  convertloop:
   1674     /* convert to U and V */
   1675     movdqu     xmm0, [eax]          // U
   1676     movdqu     xmm1, [eax + 16]
   1677     movdqu     xmm2, [eax + 32]
   1678     movdqu     xmm3, [eax + 48]
   1679     pmaddubsw  xmm0, xmm7
   1680     pmaddubsw  xmm1, xmm7
   1681     pmaddubsw  xmm2, xmm7
   1682     pmaddubsw  xmm3, xmm7
   1683     phaddw     xmm0, xmm1
   1684     phaddw     xmm2, xmm3
   1685     psraw      xmm0, 8
   1686     psraw      xmm2, 8
   1687     packsswb   xmm0, xmm2
   1688     paddb      xmm0, xmm5
   1689     movdqu     [edx], xmm0
   1690 
   1691     movdqu     xmm0, [eax]          // V
   1692     movdqu     xmm1, [eax + 16]
   1693     movdqu     xmm2, [eax + 32]
   1694     movdqu     xmm3, [eax + 48]
   1695     pmaddubsw  xmm0, xmm6
   1696     pmaddubsw  xmm1, xmm6
   1697     pmaddubsw  xmm2, xmm6
   1698     pmaddubsw  xmm3, xmm6
   1699     phaddw     xmm0, xmm1
   1700     phaddw     xmm2, xmm3
   1701     psraw      xmm0, 8
   1702     psraw      xmm2, 8
   1703     packsswb   xmm0, xmm2
   1704     paddb      xmm0, xmm5
   1705     lea        eax,  [eax + 64]
   1706     movdqu     [edx + edi], xmm0
   1707     lea        edx,  [edx + 16]
   1708     sub        ecx,  16
   1709     jg         convertloop
   1710 
   1711     pop        edi
   1712     ret
   1713   }
   1714 }
   1715 
   1716 __declspec(naked)
   1717 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1718                        uint8* dst_u, uint8* dst_v, int width) {
   1719   __asm {
   1720     push       esi
   1721     push       edi
   1722     mov        eax, [esp + 8 + 4]   // src_argb
   1723     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1724     mov        edx, [esp + 8 + 12]  // dst_u
   1725     mov        edi, [esp + 8 + 16]  // dst_v
   1726     mov        ecx, [esp + 8 + 20]  // width
   1727     movdqa     xmm5, xmmword ptr kAddUV128
   1728     movdqa     xmm6, xmmword ptr kBGRAToV
   1729     movdqa     xmm7, xmmword ptr kBGRAToU
   1730     sub        edi, edx             // stride from u to v
   1731 
   1732  convertloop:
   1733     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1734     movdqu     xmm0, [eax]
   1735     movdqu     xmm4, [eax + esi]
   1736     pavgb      xmm0, xmm4
   1737     movdqu     xmm1, [eax + 16]
   1738     movdqu     xmm4, [eax + esi + 16]
   1739     pavgb      xmm1, xmm4
   1740     movdqu     xmm2, [eax + 32]
   1741     movdqu     xmm4, [eax + esi + 32]
   1742     pavgb      xmm2, xmm4
   1743     movdqu     xmm3, [eax + 48]
   1744     movdqu     xmm4, [eax + esi + 48]
   1745     pavgb      xmm3, xmm4
   1746 
   1747     lea        eax,  [eax + 64]
   1748     movdqa     xmm4, xmm0
   1749     shufps     xmm0, xmm1, 0x88
   1750     shufps     xmm4, xmm1, 0xdd
   1751     pavgb      xmm0, xmm4
   1752     movdqa     xmm4, xmm2
   1753     shufps     xmm2, xmm3, 0x88
   1754     shufps     xmm4, xmm3, 0xdd
   1755     pavgb      xmm2, xmm4
   1756 
   1757     // step 2 - convert to U and V
   1758     // from here down is very similar to Y code except
   1759     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1760     movdqa     xmm1, xmm0
   1761     movdqa     xmm3, xmm2
   1762     pmaddubsw  xmm0, xmm7  // U
   1763     pmaddubsw  xmm2, xmm7
   1764     pmaddubsw  xmm1, xmm6  // V
   1765     pmaddubsw  xmm3, xmm6
   1766     phaddw     xmm0, xmm2
   1767     phaddw     xmm1, xmm3
   1768     psraw      xmm0, 8
   1769     psraw      xmm1, 8
   1770     packsswb   xmm0, xmm1
   1771     paddb      xmm0, xmm5            // -> unsigned
   1772 
   1773     // step 3 - store 8 U and 8 V values
   1774     movlps     qword ptr [edx], xmm0 // U
   1775     movhps     qword ptr [edx + edi], xmm0 // V
   1776     lea        edx, [edx + 8]
   1777     sub        ecx, 16
   1778     jg         convertloop
   1779 
   1780     pop        edi
   1781     pop        esi
   1782     ret
   1783   }
   1784 }
   1785 
   1786 __declspec(naked)
   1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1788                        uint8* dst_u, uint8* dst_v, int width) {
   1789   __asm {
   1790     push       esi
   1791     push       edi
   1792     mov        eax, [esp + 8 + 4]   // src_argb
   1793     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1794     mov        edx, [esp + 8 + 12]  // dst_u
   1795     mov        edi, [esp + 8 + 16]  // dst_v
   1796     mov        ecx, [esp + 8 + 20]  // width
   1797     movdqa     xmm5, xmmword ptr kAddUV128
   1798     movdqa     xmm6, xmmword ptr kABGRToV
   1799     movdqa     xmm7, xmmword ptr kABGRToU
   1800     sub        edi, edx             // stride from u to v
   1801 
   1802  convertloop:
   1803     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1804     movdqu     xmm0, [eax]
   1805     movdqu     xmm4, [eax + esi]
   1806     pavgb      xmm0, xmm4
   1807     movdqu     xmm1, [eax + 16]
   1808     movdqu     xmm4, [eax + esi + 16]
   1809     pavgb      xmm1, xmm4
   1810     movdqu     xmm2, [eax + 32]
   1811     movdqu     xmm4, [eax + esi + 32]
   1812     pavgb      xmm2, xmm4
   1813     movdqu     xmm3, [eax + 48]
   1814     movdqu     xmm4, [eax + esi + 48]
   1815     pavgb      xmm3, xmm4
   1816 
   1817     lea        eax,  [eax + 64]
   1818     movdqa     xmm4, xmm0
   1819     shufps     xmm0, xmm1, 0x88
   1820     shufps     xmm4, xmm1, 0xdd
   1821     pavgb      xmm0, xmm4
   1822     movdqa     xmm4, xmm2
   1823     shufps     xmm2, xmm3, 0x88
   1824     shufps     xmm4, xmm3, 0xdd
   1825     pavgb      xmm2, xmm4
   1826 
   1827     // step 2 - convert to U and V
   1828     // from here down is very similar to Y code except
   1829     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1830     movdqa     xmm1, xmm0
   1831     movdqa     xmm3, xmm2
   1832     pmaddubsw  xmm0, xmm7  // U
   1833     pmaddubsw  xmm2, xmm7
   1834     pmaddubsw  xmm1, xmm6  // V
   1835     pmaddubsw  xmm3, xmm6
   1836     phaddw     xmm0, xmm2
   1837     phaddw     xmm1, xmm3
   1838     psraw      xmm0, 8
   1839     psraw      xmm1, 8
   1840     packsswb   xmm0, xmm1
   1841     paddb      xmm0, xmm5            // -> unsigned
   1842 
   1843     // step 3 - store 8 U and 8 V values
   1844     movlps     qword ptr [edx], xmm0 // U
   1845     movhps     qword ptr [edx + edi], xmm0 // V
   1846     lea        edx, [edx + 8]
   1847     sub        ecx, 16
   1848     jg         convertloop
   1849 
   1850     pop        edi
   1851     pop        esi
   1852     ret
   1853   }
   1854 }
   1855 
   1856 __declspec(naked)
   1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1858                        uint8* dst_u, uint8* dst_v, int width) {
   1859   __asm {
   1860     push       esi
   1861     push       edi
   1862     mov        eax, [esp + 8 + 4]   // src_argb
   1863     mov        esi, [esp + 8 + 8]   // src_stride_argb
   1864     mov        edx, [esp + 8 + 12]  // dst_u
   1865     mov        edi, [esp + 8 + 16]  // dst_v
   1866     mov        ecx, [esp + 8 + 20]  // width
   1867     movdqa     xmm5, xmmword ptr kAddUV128
   1868     movdqa     xmm6, xmmword ptr kRGBAToV
   1869     movdqa     xmm7, xmmword ptr kRGBAToU
   1870     sub        edi, edx             // stride from u to v
   1871 
   1872  convertloop:
   1873     /* step 1 - subsample 16x2 argb pixels to 8x1 */
   1874     movdqu     xmm0, [eax]
   1875     movdqu     xmm4, [eax + esi]
   1876     pavgb      xmm0, xmm4
   1877     movdqu     xmm1, [eax + 16]
   1878     movdqu     xmm4, [eax + esi + 16]
   1879     pavgb      xmm1, xmm4
   1880     movdqu     xmm2, [eax + 32]
   1881     movdqu     xmm4, [eax + esi + 32]
   1882     pavgb      xmm2, xmm4
   1883     movdqu     xmm3, [eax + 48]
   1884     movdqu     xmm4, [eax + esi + 48]
   1885     pavgb      xmm3, xmm4
   1886 
   1887     lea        eax,  [eax + 64]
   1888     movdqa     xmm4, xmm0
   1889     shufps     xmm0, xmm1, 0x88
   1890     shufps     xmm4, xmm1, 0xdd
   1891     pavgb      xmm0, xmm4
   1892     movdqa     xmm4, xmm2
   1893     shufps     xmm2, xmm3, 0x88
   1894     shufps     xmm4, xmm3, 0xdd
   1895     pavgb      xmm2, xmm4
   1896 
   1897     // step 2 - convert to U and V
   1898     // from here down is very similar to Y code except
   1899     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
   1900     movdqa     xmm1, xmm0
   1901     movdqa     xmm3, xmm2
   1902     pmaddubsw  xmm0, xmm7  // U
   1903     pmaddubsw  xmm2, xmm7
   1904     pmaddubsw  xmm1, xmm6  // V
   1905     pmaddubsw  xmm3, xmm6
   1906     phaddw     xmm0, xmm2
   1907     phaddw     xmm1, xmm3
   1908     psraw      xmm0, 8
   1909     psraw      xmm1, 8
   1910     packsswb   xmm0, xmm1
   1911     paddb      xmm0, xmm5            // -> unsigned
   1912 
   1913     // step 3 - store 8 U and 8 V values
   1914     movlps     qword ptr [edx], xmm0 // U
   1915     movhps     qword ptr [edx + edi], xmm0 // V
   1916     lea        edx, [edx + 8]
   1917     sub        ecx, 16
   1918     jg         convertloop
   1919 
   1920     pop        edi
   1921     pop        esi
   1922     ret
   1923   }
   1924 }
   1925 #endif  // HAS_ARGBTOYROW_SSSE3
   1926 
   1927 // Read 16 UV from 444
   1928 #define READYUV444_AVX2 __asm {                                                \
   1929     __asm vmovdqu    xmm0, [esi]                  /* U */                      \
   1930     __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
   1931     __asm lea        esi,  [esi + 16]                                          \
   1932     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1933     __asm vpermq     ymm1, ymm1, 0xd8                                          \
   1934     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
   1935     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
   1936     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1937     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1938     __asm lea        eax, [eax + 16]                                           \
   1939   }
   1940 
   1941 // Read 8 UV from 422, upsample to 16 UV.
   1942 #define READYUV422_AVX2 __asm {                                                \
   1943     __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
   1944     __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
   1945     __asm lea        esi,  [esi + 8]                                           \
   1946     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
   1947     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1948     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
   1949     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
   1950     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1951     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1952     __asm lea        eax, [eax + 16]                                           \
   1953   }
   1954 
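// The vpunpcklbw/vpunpcklwd pair in READYUV422_AVX2 interleaves the chroma
// and then duplicates each pair, so one U,V covers two pixels (low lane,
// byte order):
//
//   U0 U1 ... + V0 V1 ...  ->  U0 V0 U1 V1 ...  ->  U0 V0 U0 V0 U1 V1 U1 V1 ...
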
   1955 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
   1956 #define READYUVA422_AVX2 __asm {                                               \
   1957     __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
   1958     __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
   1959     __asm lea        esi,  [esi + 8]                                           \
   1960     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
   1961     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1962     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
   1963     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
   1964     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1965     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1966     __asm lea        eax, [eax + 16]                                           \
   1967     __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
   1968     __asm vpermq     ymm5, ymm5, 0xd8                                          \
   1969     __asm lea        ebp, [ebp + 16]                                           \
   1970   }
   1971 
   1972 // Read 4 UV from 411, upsample to 16 UV.
   1973 #define READYUV411_AVX2 __asm {                                                \
   1974     __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
   1975     __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
   1976     __asm lea        esi,  [esi + 4]                                           \
   1977     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
   1978     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
   1979     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1980     __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
   1981     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
   1982     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1983     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1984     __asm lea        eax, [eax + 16]                                           \
   1985   }
   1986 
   1987 // Read 8 UV from NV12, upsample to 16 UV.
   1988 #define READNV12_AVX2 __asm {                                                  \
   1989     __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
   1990     __asm lea        esi,  [esi + 16]                                          \
   1991     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   1992     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
   1993     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
   1994     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   1995     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   1996     __asm lea        eax, [eax + 16]                                           \
   1997   }
   1998 
   1999 // Read 8 VU from NV21, upsample to 16 UV.
   2000 #define READNV21_AVX2 __asm {                                                  \
   2001     __asm vmovdqu    xmm0, [esi]                  /* VU */                     \
   2002     __asm lea        esi,  [esi + 16]                                          \
   2003     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   2004     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
   2005     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
   2006     __asm vpermq     ymm4, ymm4, 0xd8                                          \
   2007     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
   2008     __asm lea        eax, [eax + 16]                                           \
   2009   }
   2010 
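// NV21 stores interleaved VU pairs rather than NV12's UV, so READNV21_AVX2
// cannot simply duplicate pairs with vpunpcklwd; kShuffleNV21 both swaps and
// duplicates each pair (low lane, byte order):
//
//   V0 U0 V1 U1 ...  ->  U0 V0 U0 V0 U1 V1 U1 V1 ...
//
// after which the same YUVTORGB_AVX2 path applies unchanged.
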
   2011 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
   2012 #define READYUY2_AVX2 __asm {                                                  \
   2013     __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
   2014     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
   2015     __asm vmovdqu    ymm0, [eax]          /* UV */                             \
   2016     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
   2017     __asm lea        eax, [eax + 32]                                           \
   2018   }
   2019 
   2020 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
   2021 #define READUYVY_AVX2 __asm {                                                  \
   2022     __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
   2023     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
   2024     __asm vmovdqu    ymm0, [eax]          /* UV */                             \
   2025     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
   2026     __asm lea        eax, [eax + 32]                                           \
   2027   }
   2028 
   2029 // Convert 16 pixels: 16 UV and 16 Y.
   2030 #define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
   2031     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
   2032     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
   2033     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
   2034     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
   2035     __asm vpsubw     ymm2, ymm3, ymm2                                          \
   2036     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
   2037     __asm vpsubw     ymm1, ymm3, ymm1                                          \
   2038     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
   2039     __asm vpsubw     ymm0, ymm3, ymm0                                          \
   2040     /* Step 2: Find Y contribution to 16 R,G,B values */                       \
   2041     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
   2042     __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
   2043     __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
   2044     __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
   2045     __asm vpsraw     ymm0, ymm0, 6                                             \
   2046     __asm vpsraw     ymm1, ymm1, 6                                             \
   2047     __asm vpsraw     ymm2, ymm2, 6                                             \
   2048     __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
   2049     __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
   2050     __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
   2051   }
   2052 
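// Scalar sketch of the fixed-point math above. The names ub, ug, vg, vr, yg
// and the bias_* values are illustrative stand-ins for the packed words that
// YuvConstants actually holds (KUVTOB/G/R, KUVBIASB/G/R, KYTORGB); six
// fractional bits remain before the unsigned-saturating pack:
//
//   static uint8 Clamp8(int v) {
//     return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
//   }
//   static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
//                              uint8* b, uint8* g, uint8* r) {
//     int y1 = ((y * 0x0101) * yg) >> 16;           // vpunpcklbw + vpmulhuw
//     *b = Clamp8((bias_b - u * ub + y1) >> 6);     // vpsubw, vpaddsw, vpsraw
//     *g = Clamp8((bias_g - (u * ug + v * vg) + y1) >> 6);
//     *r = Clamp8((bias_r - v * vr + y1) >> 6);
//   }
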
   2053 // Store 16 ARGB values.
   2054 #define STOREARGB_AVX2 __asm {                                                 \
   2055     __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
   2056     __asm vpermq     ymm0, ymm0, 0xd8                                          \
   2057     __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
   2058     __asm vpermq     ymm2, ymm2, 0xd8                                          \
   2059     __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
   2060     __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
   2061     __asm vmovdqu    0[edx], ymm1                                              \
   2062     __asm vmovdqu    32[edx], ymm0                                             \
   2063     __asm lea        edx,  [edx + 64]                                          \
   2064   }
   2065 
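// STOREARGB_AVX2 widens the planar B, G, R, A bytes into interleaved pixels
// in two unpack stages (per pixel, memory order): B+G -> BG, R+A -> RA, then
// BG+RA -> B G R A. The vpermq 0xd8 shuffles compensate for vpunpcklbw
// working within 128-bit lanes.
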
   2066 // Store 16 RGBA values.
   2067 #define STORERGBA_AVX2 __asm {                                                 \
   2068     __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
   2069     __asm vpermq     ymm1, ymm1, 0xd8                                          \
   2070     __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
   2071     __asm vpermq     ymm2, ymm2, 0xd8                                          \
   2072     __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
   2073     __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
   2074     __asm vmovdqu    [edx], ymm0                                               \
   2075     __asm vmovdqu    [edx + 32], ymm1                                          \
   2076     __asm lea        edx,  [edx + 64]                                          \
   2077   }
   2078 
   2079 #ifdef HAS_I422TOARGBROW_AVX2
   2080 // 16 pixels
   2081 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2082 __declspec(naked)
   2083 void I422ToARGBRow_AVX2(const uint8* y_buf,
   2084                         const uint8* u_buf,
   2085                         const uint8* v_buf,
   2086                         uint8* dst_argb,
   2087                         const struct YuvConstants* yuvconstants,
   2088                         int width) {
   2089   __asm {
   2090     push       esi
   2091     push       edi
   2092     push       ebx
   2093     mov        eax, [esp + 12 + 4]   // Y
   2094     mov        esi, [esp + 12 + 8]   // U
   2095     mov        edi, [esp + 12 + 12]  // V
   2096     mov        edx, [esp + 12 + 16]  // argb
   2097     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2098     mov        ecx, [esp + 12 + 24]  // width
   2099     sub        edi, esi
   2100     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2101 
   2102  convertloop:
   2103     READYUV422_AVX2
   2104     YUVTORGB_AVX2(ebx)
   2105     STOREARGB_AVX2
   2106 
   2107     sub        ecx, 16
   2108     jg         convertloop
   2109 
   2110     pop        ebx
   2111     pop        edi
   2112     pop        esi
   2113     vzeroupper
   2114     ret
   2115   }
   2116 }
   2117 #endif  // HAS_I422TOARGBROW_AVX2
   2118 
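// A minimal caller sketch for the row function above (hypothetical code: the
// real callers live in convert_argb.cc, dispatch on CPU features, and fall
// back to I422ToARGBRow_Any_AVX2 when width is not a multiple of 16;
// kYuvI601Constants is the BT.601 table from row.h):
//
//   for (int row = 0; row < height; ++row) {
//     I422ToARGBRow_AVX2(src_y + row * src_stride_y,
//                        src_u + row * src_stride_u,  // I422: full-height chroma
//                        src_v + row * src_stride_v,
//                        dst_argb + row * dst_stride_argb,
//                        &kYuvI601Constants, width);
//   }
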
   2119 #ifdef HAS_I422ALPHATOARGBROW_AVX2
   2120 // 16 pixels
   2121 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
   2122 __declspec(naked)
   2123 void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
   2124                              const uint8* u_buf,
   2125                              const uint8* v_buf,
   2126                              const uint8* a_buf,
   2127                              uint8* dst_argb,
   2128                              const struct YuvConstants* yuvconstants,
   2129                              int width) {
   2130   __asm {
   2131     push       esi
   2132     push       edi
   2133     push       ebx
   2134     push       ebp
   2135     mov        eax, [esp + 16 + 4]   // Y
   2136     mov        esi, [esp + 16 + 8]   // U
   2137     mov        edi, [esp + 16 + 12]  // V
   2138     mov        ebp, [esp + 16 + 16]  // A
   2139     mov        edx, [esp + 16 + 20]  // argb
   2140     mov        ebx, [esp + 16 + 24]  // yuvconstants
   2141     mov        ecx, [esp + 16 + 28]  // width
   2142     sub        edi, esi
   2143 
   2144  convertloop:
   2145     READYUVA422_AVX2
   2146     YUVTORGB_AVX2(ebx)
   2147     STOREARGB_AVX2
   2148 
   2149     sub        ecx, 16
   2150     jg         convertloop
   2151 
   2152     pop        ebp
   2153     pop        ebx
   2154     pop        edi
   2155     pop        esi
   2156     vzeroupper
   2157     ret
   2158   }
   2159 }
   2160 #endif  // HAS_I422ALPHATOARGBROW_AVX2
   2161 
   2162 #ifdef HAS_I444TOARGBROW_AVX2
   2163 // 16 pixels
   2164 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
   2165 __declspec(naked)
   2166 void I444ToARGBRow_AVX2(const uint8* y_buf,
   2167                         const uint8* u_buf,
   2168                         const uint8* v_buf,
   2169                         uint8* dst_argb,
   2170                         const struct YuvConstants* yuvconstants,
   2171                         int width) {
   2172   __asm {
   2173     push       esi
   2174     push       edi
   2175     push       ebx
   2176     mov        eax, [esp + 12 + 4]   // Y
   2177     mov        esi, [esp + 12 + 8]   // U
   2178     mov        edi, [esp + 12 + 12]  // V
   2179     mov        edx, [esp + 12 + 16]  // argb
   2180     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2181     mov        ecx, [esp + 12 + 24]  // width
   2182     sub        edi, esi
   2183     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2184  convertloop:
   2185     READYUV444_AVX2
   2186     YUVTORGB_AVX2(ebx)
   2187     STOREARGB_AVX2
   2188 
   2189     sub        ecx, 16
   2190     jg         convertloop
   2191 
   2192     pop        ebx
   2193     pop        edi
   2194     pop        esi
   2195     vzeroupper
   2196     ret
   2197   }
   2198 }
   2199 #endif  // HAS_I444TOARGBROW_AVX2
   2200 
   2201 #ifdef HAS_I411TOARGBROW_AVX2
   2202 // 16 pixels
   2203 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2204 __declspec(naked)
   2205 void I411ToARGBRow_AVX2(const uint8* y_buf,
   2206                         const uint8* u_buf,
   2207                         const uint8* v_buf,
   2208                         uint8* dst_argb,
   2209                         const struct YuvConstants* yuvconstants,
   2210                         int width) {
   2211   __asm {
   2212     push       esi
   2213     push       edi
   2214     push       ebx
   2215     mov        eax, [esp + 12 + 4]   // Y
   2216     mov        esi, [esp + 12 + 8]   // U
   2217     mov        edi, [esp + 12 + 12]  // V
   2218     mov        edx, [esp + 12 + 16]  // argb
   2219     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2220     mov        ecx, [esp + 12 + 24]  // width
   2221     sub        edi, esi
   2222     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2223 
   2224  convertloop:
   2225     READYUV411_AVX2
   2226     YUVTORGB_AVX2(ebx)
   2227     STOREARGB_AVX2
   2228 
   2229     sub        ecx, 16
   2230     jg         convertloop
   2231 
   2232     pop        ebx
   2233     pop        edi
   2234     pop        esi
   2235     vzeroupper
   2236     ret
   2237   }
   2238 }
   2239 #endif  // HAS_I411TOARGBROW_AVX2
   2240 
   2241 #ifdef HAS_NV12TOARGBROW_AVX2
   2242 // 16 pixels.
   2243 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2244 __declspec(naked)
   2245 void NV12ToARGBRow_AVX2(const uint8* y_buf,
   2246                         const uint8* uv_buf,
   2247                         uint8* dst_argb,
   2248                         const struct YuvConstants* yuvconstants,
   2249                         int width) {
   2250   __asm {
   2251     push       esi
   2252     push       ebx
   2253     mov        eax, [esp + 8 + 4]   // Y
   2254     mov        esi, [esp + 8 + 8]   // UV
   2255     mov        edx, [esp + 8 + 12]  // argb
   2256     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2257     mov        ecx, [esp + 8 + 20]  // width
   2258     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2259 
   2260  convertloop:
   2261     READNV12_AVX2
   2262     YUVTORGB_AVX2(ebx)
   2263     STOREARGB_AVX2
   2264 
   2265     sub        ecx, 16
   2266     jg         convertloop
   2267 
   2268     pop        ebx
   2269     pop        esi
   2270     vzeroupper
   2271     ret
   2272   }
   2273 }
   2274 #endif  // HAS_NV12TOARGBROW_AVX2
   2275 
   2276 #ifdef HAS_NV21TOARGBROW_AVX2
   2277 // 16 pixels.
   2278 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2279 __declspec(naked)
   2280 void NV21ToARGBRow_AVX2(const uint8* y_buf,
   2281                         const uint8* vu_buf,
   2282                         uint8* dst_argb,
   2283                         const struct YuvConstants* yuvconstants,
   2284                         int width) {
   2285   __asm {
   2286     push       esi
   2287     push       ebx
   2288     mov        eax, [esp + 8 + 4]   // Y
   2289     mov        esi, [esp + 8 + 8]   // VU
   2290     mov        edx, [esp + 8 + 12]  // argb
   2291     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2292     mov        ecx, [esp + 8 + 20]  // width
   2293     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2294 
   2295  convertloop:
   2296     READNV21_AVX2
   2297     YUVTORGB_AVX2(ebx)
   2298     STOREARGB_AVX2
   2299 
   2300     sub        ecx, 16
   2301     jg         convertloop
   2302 
   2303     pop        ebx
   2304     pop        esi
   2305     vzeroupper
   2306     ret
   2307   }
   2308 }
   2309 #endif  // HAS_NV21TOARGBROW_AVX2
   2310 
   2311 #ifdef HAS_YUY2TOARGBROW_AVX2
   2312 // 16 pixels.
   2313 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
   2314 __declspec(naked)
   2315 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
   2316                         uint8* dst_argb,
   2317                         const struct YuvConstants* yuvconstants,
   2318                         int width) {
   2319   __asm {
   2320     push       ebx
   2321     mov        eax, [esp + 4 + 4]   // yuy2
   2322     mov        edx, [esp + 4 + 8]   // argb
   2323     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2324     mov        ecx, [esp + 4 + 16]  // width
   2325     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2326 
   2327  convertloop:
   2328     READYUY2_AVX2
   2329     YUVTORGB_AVX2(ebx)
   2330     STOREARGB_AVX2
   2331 
   2332     sub        ecx, 16
   2333     jg         convertloop
   2334 
   2335     pop        ebx
   2336     vzeroupper
   2337     ret
   2338   }
   2339 }
   2340 #endif  // HAS_YUY2TOARGBROW_AVX2
   2341 
   2342 #ifdef HAS_UYVYTOARGBROW_AVX2
   2343 // 16 pixels.
   2344 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
   2345 __declspec(naked)
   2346 void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
   2347                         uint8* dst_argb,
   2348                         const struct YuvConstants* yuvconstants,
   2349                         int width) {
   2350   __asm {
   2351     push       ebx
   2352     mov        eax, [esp + 4 + 4]   // uyvy
   2353     mov        edx, [esp + 4 + 8]   // argb
   2354     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2355     mov        ecx, [esp + 4 + 16]  // width
   2356     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2357 
   2358  convertloop:
   2359     READUYVY_AVX2
   2360     YUVTORGB_AVX2(ebx)
   2361     STOREARGB_AVX2
   2362 
   2363     sub        ecx, 16
   2364     jg         convertloop
   2365 
   2366     pop        ebx
   2367     vzeroupper
   2368     ret
   2369   }
   2370 }
   2371 #endif  // HAS_UYVYTOARGBROW_AVX2
   2372 
   2373 #ifdef HAS_I422TORGBAROW_AVX2
   2374 // 16 pixels
   2375 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
   2376 __declspec(naked)
   2377 void I422ToRGBARow_AVX2(const uint8* y_buf,
   2378                         const uint8* u_buf,
   2379                         const uint8* v_buf,
   2380                         uint8* dst_argb,
   2381                         const struct YuvConstants* yuvconstants,
   2382                         int width) {
   2383   __asm {
   2384     push       esi
   2385     push       edi
   2386     push       ebx
   2387     mov        eax, [esp + 12 + 4]   // Y
   2388     mov        esi, [esp + 12 + 8]   // U
   2389     mov        edi, [esp + 12 + 12]  // V
   2390     mov        edx, [esp + 12 + 16]  // rgba
   2391     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2392     mov        ecx, [esp + 12 + 24]  // width
   2393     sub        edi, esi
   2394     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
   2395 
   2396  convertloop:
   2397     READYUV422_AVX2
   2398     YUVTORGB_AVX2(ebx)
   2399     STORERGBA_AVX2
   2400 
   2401     sub        ecx, 16
   2402     jg         convertloop
   2403 
   2404     pop        ebx
   2405     pop        edi
   2406     pop        esi
   2407     vzeroupper
   2408     ret
   2409   }
   2410 }
   2411 #endif  // HAS_I422TORGBAROW_AVX2
   2412 
   2413 #if defined(HAS_I422TOARGBROW_SSSE3)
   2414 // TODO(fbarchard): Add a read macro that does half size on Y and treats 420
   2415 // chroma as 444, allowing a conversion with half size scaling.
   2416 
   2417 // Read 8 UV from 444.
   2418 #define READYUV444 __asm {                                                     \
   2419     __asm movq       xmm0, qword ptr [esi] /* U */                             \
   2420     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
   2421     __asm lea        esi,  [esi + 8]                                           \
   2422     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   2423     __asm movq       xmm4, qword ptr [eax]                                     \
   2424     __asm punpcklbw  xmm4, xmm4                                                \
   2425     __asm lea        eax, [eax + 8]                                            \
   2426   }
   2427 
   2428 // Read 4 UV from 422, upsample to 8 UV.
   2429 #define READYUV422 __asm {                                                     \
   2430     __asm movd       xmm0, [esi]          /* U */                              \
   2431     __asm movd       xmm1, [esi + edi]    /* V */                              \
   2432     __asm lea        esi,  [esi + 4]                                           \
   2433     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   2434     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   2435     __asm movq       xmm4, qword ptr [eax]                                     \
   2436     __asm punpcklbw  xmm4, xmm4                                                \
   2437     __asm lea        eax, [eax + 8]                                            \
   2438   }
   2439 
   2440 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
   2441 #define READYUVA422 __asm {                                                    \
   2442     __asm movd       xmm0, [esi]          /* U */                              \
   2443     __asm movd       xmm1, [esi + edi]    /* V */                              \
   2444     __asm lea        esi,  [esi + 4]                                           \
   2445     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   2446     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   2447     __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
   2448     __asm punpcklbw  xmm4, xmm4                                                \
   2449     __asm lea        eax, [eax + 8]                                            \
   2450     __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
   2451     __asm lea        ebp, [ebp + 8]                                            \
   2452   }
   2453 
   2454 // Read 2 UV from 411, upsample to 8 UV.
   2455 // DrMemory faults if pinsrw is used (libyuv bug 525); use movzx + movd instead of:
   2456 //  __asm pinsrw     xmm0, [esi], 0        /* U */
   2457 //  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
   2458 #define READYUV411_EBX __asm {                                                 \
   2459     __asm movzx      ebx, word ptr [esi]        /* U */                        \
   2460     __asm movd       xmm0, ebx                                                 \
   2461     __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
   2462     __asm movd       xmm1, ebx                                                 \
   2463     __asm lea        esi,  [esi + 2]                                           \
   2464     __asm punpcklbw  xmm0, xmm1            /* UV */                            \
   2465     __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
   2466     __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
   2467     __asm movq       xmm4, qword ptr [eax]                                     \
   2468     __asm punpcklbw  xmm4, xmm4                                                \
   2469     __asm lea        eax, [eax + 8]                                            \
   2470   }
   2471 
   2472 // Read 4 UV from NV12, upsample to 8 UV.
   2473 #define READNV12 __asm {                                                       \
   2474     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
   2475     __asm lea        esi,  [esi + 8]                                           \
   2476     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   2477     __asm movq       xmm4, qword ptr [eax]                                     \
   2478     __asm punpcklbw  xmm4, xmm4                                                \
   2479     __asm lea        eax, [eax + 8]                                            \
   2480   }
   2481 
   2482 // Read 4 VU from NV21, upsample to 8 UV.
   2483 #define READNV21 __asm {                                                       \
   2484     __asm movq       xmm0, qword ptr [esi] /* VU */                            \
   2485     __asm lea        esi,  [esi + 8]                                           \
   2486     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
   2487     __asm movq       xmm4, qword ptr [eax]                                     \
   2488     __asm punpcklbw  xmm4, xmm4                                                \
   2489     __asm lea        eax, [eax + 8]                                            \
   2490   }
   2491 
   2492 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
   2493 #define READYUY2 __asm {                                                       \
   2494     __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
   2495     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
   2496     __asm movdqu     xmm0, [eax]          /* UV */                             \
   2497     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
   2498     __asm lea        eax, [eax + 16]                                           \
   2499   }
   2500 
   2501 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
   2502 #define READUYVY __asm {                                                       \
   2503     __asm movdqu     xmm4, [eax]          /* UYVY */                           \
   2504     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
   2505     __asm movdqu     xmm0, [eax]          /* UV */                             \
   2506     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
   2507     __asm lea        eax, [eax + 16]                                           \
   2508   }
   2509 
   2510 // Convert 8 pixels: 8 UV and 8 Y.
   2511 #define YUVTORGB(YuvConstants) __asm {                                         \
   2512     __asm movdqa     xmm1, xmm0                                                \
   2513     __asm movdqa     xmm2, xmm0                                                \
   2514     __asm movdqa     xmm3, xmm0                                                \
   2515     __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
   2516     __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
   2517     __asm psubw      xmm0, xmm1                                                \
   2518     __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
   2519     __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
   2520     __asm psubw      xmm1, xmm2                                                \
   2521     __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
   2522     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
   2523     __asm psubw      xmm2, xmm3                                                \
   2524     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
   2525     __asm paddsw     xmm0, xmm4           /* B += Y */                         \
   2526     __asm paddsw     xmm1, xmm4           /* G += Y */                         \
   2527     __asm paddsw     xmm2, xmm4           /* R += Y */                         \
   2528     __asm psraw      xmm0, 6                                                   \
   2529     __asm psraw      xmm1, 6                                                   \
   2530     __asm psraw      xmm2, 6                                                   \
   2531     __asm packuswb   xmm0, xmm0           /* B */                              \
   2532     __asm packuswb   xmm1, xmm1           /* G */                              \
   2533     __asm packuswb   xmm2, xmm2           /* R */                              \
   2534   }
   2535 
   2536 // Store 8 ARGB values.
   2537 #define STOREARGB __asm {                                                      \
   2538     __asm punpcklbw  xmm0, xmm1           /* BG */                             \
   2539     __asm punpcklbw  xmm2, xmm5           /* RA */                             \
   2540     __asm movdqa     xmm1, xmm0                                                \
   2541     __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
   2542     __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
   2543     __asm movdqu     0[edx], xmm0                                              \
   2544     __asm movdqu     16[edx], xmm1                                             \
   2545     __asm lea        edx,  [edx + 32]                                          \
   2546   }
   2547 
   2548 // Store 8 BGRA values.
   2549 #define STOREBGRA __asm {                                                      \
   2550     __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
   2551     __asm punpcklbw  xmm1, xmm0           /* GB */                             \
   2552     __asm punpcklbw  xmm5, xmm2           /* AR */                             \
   2553     __asm movdqa     xmm0, xmm5                                                \
   2554     __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
   2555     __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
   2556     __asm movdqu     0[edx], xmm5                                              \
   2557     __asm movdqu     16[edx], xmm0                                             \
   2558     __asm lea        edx,  [edx + 32]                                          \
   2559   }
   2560 
   2561 // Store 8 RGBA values.
   2562 #define STORERGBA __asm {                                                      \
   2563     __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
   2564     __asm punpcklbw  xmm1, xmm2           /* GR */                             \
   2565     __asm punpcklbw  xmm5, xmm0           /* AB */                             \
   2566     __asm movdqa     xmm0, xmm5                                                \
   2567     __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
   2568     __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
   2569     __asm movdqu     0[edx], xmm5                                              \
   2570     __asm movdqu     16[edx], xmm0                                             \
   2571     __asm lea        edx,  [edx + 32]                                          \
   2572   }
   2573 
   2574 // Store 8 RGB24 values.
   2575 #define STORERGB24 __asm {                                                     \
   2576     /* Weave into RRGB */                                                      \
   2577     __asm punpcklbw  xmm0, xmm1           /* BG */                             \
   2578     __asm punpcklbw  xmm2, xmm2           /* RR */                             \
   2579     __asm movdqa     xmm1, xmm0                                                \
   2580     __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
   2581     __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
   2582     /* RRGB -> RGB24 */                                                        \
   2583     __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
   2584     __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
   2585     __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
   2586     __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
   2587     __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
   2588     __asm lea        edx,  [edx + 24]                                          \
   2589   }
   2590 
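// STORERGB24 first weaves the planar bytes into 4-byte B G R R groups, then
// the two pshufb masks (loaded into xmm5/xmm6 by the RGB24 row functions)
// drop the duplicated R, and palignr splices the two 12-byte halves into one
// 24-byte BGR stream:
//
//   B G R R | B G R R | ...  ->  B G R B G R ...
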
   2591 // Store 8 RGB565 values.
   2592 #define STORERGB565 __asm {                                                    \
   2593     /* Weave into RRGB */                                                      \
   2594     __asm punpcklbw  xmm0, xmm1           /* BG */                             \
   2595     __asm punpcklbw  xmm2, xmm2           /* RR */                             \
   2596     __asm movdqa     xmm1, xmm0                                                \
   2597     __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
   2598     __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
   2599     /* RRGB -> RGB565 */                                                       \
   2600     __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
   2601     __asm movdqa     xmm2, xmm0    /* G */                                     \
   2602     __asm pslld      xmm0, 8       /* R */                                     \
   2603     __asm psrld      xmm3, 3       /* B */                                     \
   2604     __asm psrld      xmm2, 5       /* G */                                     \
   2605     __asm psrad      xmm0, 16      /* R */                                     \
   2606     __asm pand       xmm3, xmm5    /* B */                                     \
   2607     __asm pand       xmm2, xmm6    /* G */                                     \
   2608     __asm pand       xmm0, xmm7    /* R */                                     \
   2609     __asm por        xmm3, xmm2    /* BG */                                    \
   2610     __asm por        xmm0, xmm3    /* BGR */                                   \
   2611     __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
   2612     __asm movdqa     xmm2, xmm1    /* G */                                     \
   2613     __asm pslld      xmm1, 8       /* R */                                     \
   2614     __asm psrld      xmm3, 3       /* B */                                     \
   2615     __asm psrld      xmm2, 5       /* G */                                     \
   2616     __asm psrad      xmm1, 16      /* R */                                     \
   2617     __asm pand       xmm3, xmm5    /* B */                                     \
   2618     __asm pand       xmm2, xmm6    /* G */                                     \
   2619     __asm pand       xmm1, xmm7    /* R */                                     \
   2620     __asm por        xmm3, xmm2    /* BG */                                    \
   2621     __asm por        xmm1, xmm3    /* BGR */                                   \
   2622     __asm packssdw   xmm0, xmm1                                                \
   2623     __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
   2624     __asm lea        edx, [edx + 16]                                           \
   2625   }
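
// A scalar sketch of the 565 pack above (illustrative; hypothetical name):
// B keeps its top 5 bits, G its top 6, R its top 5, merged into one 16 bit
// pixel, matching the shift/mask sequence in STORERGB565.
static __inline uint16 PackRGB565Pixel_C(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}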
   2626 
   2627 // 8 pixels.
   2628 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
   2629 __declspec(naked)
   2630 void I444ToARGBRow_SSSE3(const uint8* y_buf,
   2631                          const uint8* u_buf,
   2632                          const uint8* v_buf,
   2633                          uint8* dst_argb,
   2634                          const struct YuvConstants* yuvconstants,
   2635                          int width) {
   2636   __asm {
   2637     push       esi
   2638     push       edi
   2639     push       ebx
   2640     mov        eax, [esp + 12 + 4]   // Y
   2641     mov        esi, [esp + 12 + 8]   // U
   2642     mov        edi, [esp + 12 + 12]  // V
   2643     mov        edx, [esp + 12 + 16]  // argb
   2644     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2645     mov        ecx, [esp + 12 + 24]  // width
   2646     sub        edi, esi
   2647     pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
   2648 
   2649  convertloop:
   2650     READYUV444
   2651     YUVTORGB(ebx)
   2652     STOREARGB
   2653 
   2654     sub        ecx, 8
   2655     jg         convertloop
   2656 
   2657     pop        ebx
   2658     pop        edi
   2659     pop        esi
   2660     ret
   2661   }
   2662 }
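
// The YUVTORGB macro applies coefficients taken from the YuvConstants tables.
// As a rough scalar model (one common BT.601 variant; the exact coefficients
// depend on the yuvconstants passed in, so treat this as illustrative):
//   B = clamp(1.164 * (Y - 16) + 2.018 * (U - 128))
//   G = clamp(1.164 * (Y - 16) - 0.391 * (U - 128) - 0.813 * (V - 128))
//   R = clamp(1.164 * (Y - 16) + 1.596 * (V - 128))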
   2663 
   2664 // 8 pixels.
   2665 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
   2666 __declspec(naked)
   2667 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
   2668                           const uint8* u_buf,
   2669                           const uint8* v_buf,
   2670                           uint8* dst_rgb24,
   2671                           const struct YuvConstants* yuvconstants,
   2672                           int width) {
   2673   __asm {
   2674     push       esi
   2675     push       edi
   2676     push       ebx
   2677     mov        eax, [esp + 12 + 4]   // Y
   2678     mov        esi, [esp + 12 + 8]   // U
   2679     mov        edi, [esp + 12 + 12]  // V
   2680     mov        edx, [esp + 12 + 16]  // rgb24
   2681     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2682     mov        ecx, [esp + 12 + 24]  // width
   2683     sub        edi, esi
   2684     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
   2685     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
   2686 
   2687  convertloop:
   2688     READYUV422
   2689     YUVTORGB(ebx)
   2690     STORERGB24
   2691 
   2692     sub        ecx, 8
   2693     jg         convertloop
   2694 
   2695     pop        ebx
   2696     pop        edi
   2697     pop        esi
   2698     ret
   2699   }
   2700 }
   2701 
   2702 // 8 pixels.
   2703 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
   2704 __declspec(naked)
   2705 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
   2706                            const uint8* u_buf,
   2707                            const uint8* v_buf,
   2708                            uint8* rgb565_buf,
   2709                            const struct YuvConstants* yuvconstants,
   2710                            int width) {
   2711   __asm {
   2712     push       esi
   2713     push       edi
   2714     push       ebx
   2715     mov        eax, [esp + 12 + 4]   // Y
   2716     mov        esi, [esp + 12 + 8]   // U
   2717     mov        edi, [esp + 12 + 12]  // V
   2718     mov        edx, [esp + 12 + 16]  // rgb565
   2719     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2720     mov        ecx, [esp + 12 + 24]  // width
   2721     sub        edi, esi
   2722     pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
   2723     psrld      xmm5, 27
   2724     pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
   2725     psrld      xmm6, 26
   2726     pslld      xmm6, 5
   2727     pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
   2728     pslld      xmm7, 11
   2729 
   2730  convertloop:
   2731     READYUV422
   2732     YUVTORGB(ebx)
   2733     STORERGB565
   2734 
   2735     sub        ecx, 8
   2736     jg         convertloop
   2737 
   2738     pop        ebx
   2739     pop        edi
   2740     pop        esi
   2741     ret
   2742   }
   2743 }
   2744 
   2745 // 8 pixels.
   2746 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2747 __declspec(naked)
   2748 void I422ToARGBRow_SSSE3(const uint8* y_buf,
   2749                          const uint8* u_buf,
   2750                          const uint8* v_buf,
   2751                          uint8* dst_argb,
   2752                          const struct YuvConstants* yuvconstants,
   2753                          int width) {
   2754   __asm {
   2755     push       esi
   2756     push       edi
   2757     push       ebx
   2758     mov        eax, [esp + 12 + 4]   // Y
   2759     mov        esi, [esp + 12 + 8]   // U
   2760     mov        edi, [esp + 12 + 12]  // V
   2761     mov        edx, [esp + 12 + 16]  // argb
   2762     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2763     mov        ecx, [esp + 12 + 24]  // width
   2764     sub        edi, esi
   2765     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2766 
   2767  convertloop:
   2768     READYUV422
   2769     YUVTORGB(ebx)
   2770     STOREARGB
   2771 
   2772     sub        ecx, 8
   2773     jg         convertloop
   2774 
   2775     pop        ebx
   2776     pop        edi
   2777     pop        esi
   2778     ret
   2779   }
   2780 }
   2781 
   2782 // 8 pixels.
   2783 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
   2784 __declspec(naked)
   2785 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
   2786                               const uint8* u_buf,
   2787                               const uint8* v_buf,
   2788                               const uint8* a_buf,
   2789                               uint8* dst_argb,
   2790                               const struct YuvConstants* yuvconstants,
   2791                               int width) {
   2792   __asm {
   2793     push       esi
   2794     push       edi
   2795     push       ebx
   2796     push       ebp
   2797     mov        eax, [esp + 16 + 4]   // Y
   2798     mov        esi, [esp + 16 + 8]   // U
   2799     mov        edi, [esp + 16 + 12]  // V
   2800     mov        ebp, [esp + 16 + 16]  // A
   2801     mov        edx, [esp + 16 + 20]  // argb
   2802     mov        ebx, [esp + 16 + 24]  // yuvconstants
   2803     mov        ecx, [esp + 16 + 28]  // width
   2804     sub        edi, esi
   2805 
   2806  convertloop:
   2807     READYUVA422
   2808     YUVTORGB(ebx)
   2809     STOREARGB
   2810 
   2811     sub        ecx, 8
   2812     jg         convertloop
   2813 
   2814     pop        ebp
   2815     pop        ebx
   2816     pop        edi
   2817     pop        esi
   2818     ret
   2819   }
   2820 }
   2821 
   2822 // 8 pixels.
   2823 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2824 // Similar to I422 but duplicates UV once more.
   2825 __declspec(naked)
   2826 void I411ToARGBRow_SSSE3(const uint8* y_buf,
   2827                          const uint8* u_buf,
   2828                          const uint8* v_buf,
   2829                          uint8* dst_argb,
   2830                          const struct YuvConstants* yuvconstants,
   2831                          int width) {
   2832   __asm {
   2833     push       esi
   2834     push       edi
   2835     push       ebx
   2836     push       ebp
   2837     mov        eax, [esp + 16 + 4]   // Y
   2838     mov        esi, [esp + 16 + 8]   // U
   2839     mov        edi, [esp + 16 + 12]  // V
   2840     mov        edx, [esp + 16 + 16]  // argb
   2841     mov        ebp, [esp + 16 + 20]  // yuvconstants
   2842     mov        ecx, [esp + 16 + 24]  // width
   2843     sub        edi, esi
   2844     pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
   2845 
   2846  convertloop:
   2847     READYUV411_EBX
   2848     YUVTORGB(ebp)
   2849     STOREARGB
   2850 
   2851     sub        ecx, 8
   2852     jg         convertloop
   2853 
   2854     pop        ebp
   2855     pop        ebx
   2856     pop        edi
   2857     pop        esi
   2858     ret
   2859   }
   2860 }
   2861 
   2862 // 8 pixels.
   2863 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2864 __declspec(naked)
   2865 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
   2866                          const uint8* uv_buf,
   2867                          uint8* dst_argb,
   2868                          const struct YuvConstants* yuvconstants,
   2869                          int width) {
   2870   __asm {
   2871     push       esi
   2872     push       ebx
   2873     mov        eax, [esp + 8 + 4]   // Y
   2874     mov        esi, [esp + 8 + 8]   // UV
   2875     mov        edx, [esp + 8 + 12]  // argb
   2876     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2877     mov        ecx, [esp + 8 + 20]  // width
   2878     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2879 
   2880  convertloop:
   2881     READNV12
   2882     YUVTORGB(ebx)
   2883     STOREARGB
   2884 
   2885     sub        ecx, 8
   2886     jg         convertloop
   2887 
   2888     pop        ebx
   2889     pop        esi
   2890     ret
   2891   }
   2892 }
   2893 
   2894 // 8 pixels.
   2895 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2896 __declspec(naked)
   2897 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
   2898                          const uint8* vu_buf,
   2899                          uint8* dst_argb,
   2900                          const struct YuvConstants* yuvconstants,
   2901                          int width) {
   2902   __asm {
   2903     push       esi
   2904     push       ebx
   2905     mov        eax, [esp + 8 + 4]   // Y
   2906     mov        esi, [esp + 8 + 8]   // VU
   2907     mov        edx, [esp + 8 + 12]  // argb
   2908     mov        ebx, [esp + 8 + 16]  // yuvconstants
   2909     mov        ecx, [esp + 8 + 20]  // width
   2910     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2911 
   2912  convertloop:
   2913     READNV21
   2914     YUVTORGB(ebx)
   2915     STOREARGB
   2916 
   2917     sub        ecx, 8
   2918     jg         convertloop
   2919 
   2920     pop        ebx
   2921     pop        esi
   2922     ret
   2923   }
   2924 }
   2925 
   2926 // 8 pixels.
   2927 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
   2928 __declspec(naked)
   2929 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
   2930                          uint8* dst_argb,
   2931                          const struct YuvConstants* yuvconstants,
   2932                          int width) {
   2933   __asm {
   2934     push       ebx
   2935     mov        eax, [esp + 4 + 4]   // yuy2
   2936     mov        edx, [esp + 4 + 8]   // argb
   2937     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2938     mov        ecx, [esp + 4 + 16]  // width
   2939     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2940 
   2941  convertloop:
   2942     READYUY2
   2943     YUVTORGB(ebx)
   2944     STOREARGB
   2945 
   2946     sub        ecx, 8
   2947     jg         convertloop
   2948 
   2949     pop        ebx
   2950     ret
   2951   }
   2952 }
   2953 
   2954 // 8 pixels.
   2955 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
   2956 __declspec(naked)
   2957 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
   2958                          uint8* dst_argb,
   2959                          const struct YuvConstants* yuvconstants,
   2960                          int width) {
   2961   __asm {
   2962     push       ebx
   2963     mov        eax, [esp + 4 + 4]   // uyvy
   2964     mov        edx, [esp + 4 + 8]   // argb
   2965     mov        ebx, [esp + 4 + 12]  // yuvconstants
   2966     mov        ecx, [esp + 4 + 16]  // width
   2967     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2968 
   2969  convertloop:
   2970     READUYVY
   2971     YUVTORGB(ebx)
   2972     STOREARGB
   2973 
   2974     sub        ecx, 8
   2975     jg         convertloop
   2976 
   2977     pop        ebx
   2978     ret
   2979   }
   2980 }
   2981 
   2982 __declspec(naked)
   2983 void I422ToRGBARow_SSSE3(const uint8* y_buf,
   2984                          const uint8* u_buf,
   2985                          const uint8* v_buf,
   2986                          uint8* dst_rgba,
   2987                          const struct YuvConstants* yuvconstants,
   2988                          int width) {
   2989   __asm {
   2990     push       esi
   2991     push       edi
   2992     push       ebx
   2993     mov        eax, [esp + 12 + 4]   // Y
   2994     mov        esi, [esp + 12 + 8]   // U
   2995     mov        edi, [esp + 12 + 12]  // V
   2996     mov        edx, [esp + 12 + 16]  // rgba
   2997     mov        ebx, [esp + 12 + 20]  // yuvconstants
   2998     mov        ecx, [esp + 12 + 24]  // width
   2999     sub        edi, esi
   3000 
   3001  convertloop:
   3002     READYUV422
   3003     YUVTORGB(ebx)
   3004     STORERGBA
   3005 
   3006     sub        ecx, 8
   3007     jg         convertloop
   3008 
   3009     pop        ebx
   3010     pop        edi
   3011     pop        esi
   3012     ret
   3013   }
   3014 }
   3015 #endif  // HAS_I422TOARGBROW_SSSE3
   3016 
   3017 #ifdef HAS_I400TOARGBROW_SSE2
   3018 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
   3019 __declspec(naked)
   3020 void I400ToARGBRow_SSE2(const uint8* y_buf,
   3021                         uint8* rgb_buf,
   3022                         int width) {
   3023   __asm {
   3024     mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
   3025     movd       xmm2, eax
   3026     pshufd     xmm2, xmm2, 0
   3027     mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16) - 32 so the final shift rounds
   3028     movd       xmm3, eax
   3029     pshufd     xmm3, xmm3, 0
   3030     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
   3031     pslld      xmm4, 24
   3032 
   3033     mov        eax, [esp + 4]       // Y
   3034     mov        edx, [esp + 8]       // rgb
   3035     mov        ecx, [esp + 12]      // width
   3036 
   3037  convertloop:
   3038     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   3039     movq       xmm0, qword ptr [eax]
   3040     lea        eax, [eax + 8]
   3041     punpcklbw  xmm0, xmm0           // Y.Y
   3042     pmulhuw    xmm0, xmm2
   3043     psubusw    xmm0, xmm3
   3044     psrlw      xmm0, 6
   3045     packuswb   xmm0, xmm0           // G
   3046 
   3047     // Step 2: Weave into ARGB
   3048     punpcklbw  xmm0, xmm0           // GG
   3049     movdqa     xmm1, xmm0
   3050     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
   3051     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
   3052     por        xmm0, xmm4
   3053     por        xmm1, xmm4
   3054     movdqu     [edx], xmm0
   3055     movdqu     [edx + 16], xmm1
   3056     lea        edx,  [edx + 32]
   3057     sub        ecx, 8
   3058     jg         convertloop
   3059     ret
   3060   }
   3061 }
   3062 #endif  // HAS_I400TOARGBROW_SSE2
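
// A scalar model of the fixed point math in I400ToARGBRow above (sketch only;
// I400ToARGBPixel_C is a hypothetical name, not part of the original source):
static __inline uint32 I400ToARGBPixel_C(uint8 y) {
  uint32 yy = y * 257u;                // punpcklbw duplicates the byte: Y.Y
  uint32 g = (yy * 18997u) >> 16;      // pmulhuw by 0x4a35
  g = (g > 1160u) ? (g - 1160u) : 0u;  // psubusw removes the bias, saturating
  g >>= 6;                             // psrlw 6
  if (g > 255u) g = 255u;              // packuswb saturates
  return 0xff000000u | (g << 16) | (g << 8) | g;  // opaque gray ARGB pixel
}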
   3063 
   3064 #ifdef HAS_I400TOARGBROW_AVX2
   3065 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
   3066 // note: vpunpcklbw mutates and vpackuswb unmutates.
   3067 __declspec(naked)
   3068 void I400ToARGBRow_AVX2(const uint8* y_buf,
   3069                         uint8* rgb_buf,
   3070                         int width) {
   3071   __asm {
   3072     mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
   3073     vmovd      xmm2, eax
   3074     vbroadcastss ymm2, xmm2
   3075     mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16) - 32 so the final shift rounds
   3076     vmovd      xmm3, eax
   3077     vbroadcastss ymm3, xmm3
   3078     vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
   3079     vpslld     ymm4, ymm4, 24
   3080 
   3081     mov        eax, [esp + 4]       // Y
   3082     mov        edx, [esp + 8]       // rgb
   3083     mov        ecx, [esp + 12]      // width
   3084 
   3085  convertloop:
   3086     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
   3087     vmovdqu    xmm0, [eax]
   3088     lea        eax, [eax + 16]
   3089     vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
   3090     vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
   3091     vpmulhuw   ymm0, ymm0, ymm2
   3092     vpsubusw   ymm0, ymm0, ymm3
   3093     vpsrlw     ymm0, ymm0, 6
   3094     vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
   3095 
   3096     // TODO(fbarchard): Weave alpha with unpack.
   3097     // Step 2: Weave into ARGB
   3098     vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
   3099     vpermq     ymm1, ymm1, 0xd8
   3100     vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
   3101     vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
   3102     vpor       ymm0, ymm0, ymm4
   3103     vpor       ymm1, ymm1, ymm4
   3104     vmovdqu    [edx], ymm0
   3105     vmovdqu    [edx + 32], ymm1
   3106     lea        edx,  [edx + 64]
   3107     sub        ecx, 16
   3108     jg         convertloop
   3109     vzeroupper
   3110     ret
   3111   }
   3112 }
   3113 #endif  // HAS_I400TOARGBROW_AVX2
   3114 
   3115 #ifdef HAS_MIRRORROW_SSSE3
   3116 // Shuffle table for reversing the bytes.
   3117 static const uvec8 kShuffleMirror = {
   3118   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   3119 };
   3120 
   3121 // TODO(fbarchard): Replace lea with -16 offset.
   3122 __declspec(naked)
   3123 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   3124   __asm {
   3125     mov       eax, [esp + 4]   // src
   3126     mov       edx, [esp + 8]   // dst
   3127     mov       ecx, [esp + 12]  // width
   3128     movdqa    xmm5, xmmword ptr kShuffleMirror
   3129 
   3130  convertloop:
   3131     movdqu    xmm0, [eax - 16 + ecx]
   3132     pshufb    xmm0, xmm5
   3133     movdqu    [edx], xmm0
   3134     lea       edx, [edx + 16]
   3135     sub       ecx, 16
   3136     jg        convertloop
   3137     ret
   3138   }
   3139 }
   3140 #endif  // HAS_MIRRORROW_SSSE3
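
// Scalar equivalent of the mirror above (sketch; libyuv's C fallback plays
// the same role, this hypothetical version just shows the indexing):
static void MirrorRowScalar(const uint8* src, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];  // walk src from the tail, like [eax - 16 + ecx]
  }
}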
   3141 
   3142 #ifdef HAS_MIRRORROW_AVX2
   3143 __declspec(naked)
   3144 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   3145   __asm {
   3146     mov       eax, [esp + 4]   // src
   3147     mov       edx, [esp + 8]   // dst
   3148     mov       ecx, [esp + 12]  // width
   3149     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
   3150 
   3151  convertloop:
   3152     vmovdqu   ymm0, [eax - 32 + ecx]
   3153     vpshufb   ymm0, ymm0, ymm5
   3154     vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
   3155     vmovdqu   [edx], ymm0
   3156     lea       edx, [edx + 32]
   3157     sub       ecx, 32
   3158     jg        convertloop
   3159     vzeroupper
   3160     ret
   3161   }
   3162 }
   3163 #endif  // HAS_MIRRORROW_AVX2
   3164 
   3165 #ifdef HAS_MIRRORUVROW_SSSE3
   3166 // Shuffle table for reversing the bytes of UV channels.
   3167 static const uvec8 kShuffleMirrorUV = {
   3168   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
   3169 };
   3170 
   3171 __declspec(naked)
   3172 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
   3173                        int width) {
   3174   __asm {
   3175     push      edi
   3176     mov       eax, [esp + 4 + 4]   // src
   3177     mov       edx, [esp + 4 + 8]   // dst_u
   3178     mov       edi, [esp + 4 + 12]  // dst_v
   3179     mov       ecx, [esp + 4 + 16]  // width
   3180     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
   3181     lea       eax, [eax + ecx * 2 - 16]
   3182     sub       edi, edx
   3183 
   3184  convertloop:
   3185     movdqu    xmm0, [eax]
   3186     lea       eax, [eax - 16]
   3187     pshufb    xmm0, xmm1
   3188     movlpd    qword ptr [edx], xmm0
   3189     movhpd    qword ptr [edx + edi], xmm0
   3190     lea       edx, [edx + 8]
   3191     sub       ecx, 8
   3192     jg        convertloop
   3193 
   3194     pop       edi
   3195     ret
   3196   }
   3197 }
   3198 #endif  // HAS_MIRRORUVROW_SSSE3
   3199 
   3200 #ifdef HAS_ARGBMIRRORROW_SSE2
   3201 __declspec(naked)
   3202 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   3203   __asm {
   3204     mov       eax, [esp + 4]   // src
   3205     mov       edx, [esp + 8]   // dst
   3206     mov       ecx, [esp + 12]  // width
   3207     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
   3208 
   3209  convertloop:
   3210     movdqu    xmm0, [eax]
   3211     lea       eax, [eax - 16]
   3212     pshufd    xmm0, xmm0, 0x1b
   3213     movdqu    [edx], xmm0
   3214     lea       edx, [edx + 16]
   3215     sub       ecx, 4
   3216     jg        convertloop
   3217     ret
   3218   }
   3219 }
   3220 #endif  // HAS_ARGBMIRRORROW_SSE2
   3221 
   3222 #ifdef HAS_ARGBMIRRORROW_AVX2
   3223 // Shuffle table for reversing the pixels (dwords).
   3224 static const ulvec32 kARGBShuffleMirror_AVX2 = {
   3225   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   3226 };
   3227 
   3228 __declspec(naked)
   3229 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   3230   __asm {
   3231     mov       eax, [esp + 4]   // src
   3232     mov       edx, [esp + 8]   // dst
   3233     mov       ecx, [esp + 12]  // width
   3234     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
   3235 
   3236  convertloop:
   3237     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
   3238     vmovdqu   [edx], ymm0
   3239     lea       edx, [edx + 32]
   3240     sub       ecx, 8
   3241     jg        convertloop
   3242     vzeroupper
   3243     ret
   3244   }
   3245 }
   3246 #endif  // HAS_ARGBMIRRORROW_AVX2
   3247 
   3248 #ifdef HAS_SPLITUVROW_SSE2
   3249 __declspec(naked)
   3250 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
   3251                      int width) {
   3252   __asm {
   3253     push       edi
   3254     mov        eax, [esp + 4 + 4]    // src_uv
   3255     mov        edx, [esp + 4 + 8]    // dst_u
   3256     mov        edi, [esp + 4 + 12]   // dst_v
   3257     mov        ecx, [esp + 4 + 16]   // width
   3258     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3259     psrlw      xmm5, 8
   3260     sub        edi, edx
   3261 
   3262   convertloop:
   3263     movdqu     xmm0, [eax]
   3264     movdqu     xmm1, [eax + 16]
   3265     lea        eax,  [eax + 32]
   3266     movdqa     xmm2, xmm0
   3267     movdqa     xmm3, xmm1
   3268     pand       xmm0, xmm5   // even bytes
   3269     pand       xmm1, xmm5
   3270     packuswb   xmm0, xmm1
   3271     psrlw      xmm2, 8      // odd bytes
   3272     psrlw      xmm3, 8
   3273     packuswb   xmm2, xmm3
   3274     movdqu     [edx], xmm0
   3275     movdqu     [edx + edi], xmm2
   3276     lea        edx, [edx + 16]
   3277     sub        ecx, 16
   3278     jg         convertloop
   3279 
   3280     pop        edi
   3281     ret
   3282   }
   3283 }
   3284 
   3285 #endif  // HAS_SPLITUVROW_SSE2
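
// Scalar sketch of SplitUVRow (illustrative; hypothetical name).  The SIMD
// version isolates even bytes with the 0x00ff00ff mask and odd bytes with a
// word shift; in scalar form that is just de-interleaving:
static void SplitUVRowScalar(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_u[i] = src_uv[i * 2 + 0];  // even bytes (pand xmm5)
    dst_v[i] = src_uv[i * 2 + 1];  // odd bytes (psrlw 8)
  }
}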
   3286 
   3287 #ifdef HAS_SPLITUVROW_AVX2
   3288 __declspec(naked)
   3289 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
   3290                      int width) {
   3291   __asm {
   3292     push       edi
   3293     mov        eax, [esp + 4 + 4]    // src_uv
   3294     mov        edx, [esp + 4 + 8]    // dst_u
   3295     mov        edi, [esp + 4 + 12]   // dst_v
   3296     mov        ecx, [esp + 4 + 16]   // width
   3297     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3298     vpsrlw     ymm5, ymm5, 8
   3299     sub        edi, edx
   3300 
   3301   convertloop:
   3302     vmovdqu    ymm0, [eax]
   3303     vmovdqu    ymm1, [eax + 32]
   3304     lea        eax,  [eax + 64]
   3305     vpsrlw     ymm2, ymm0, 8      // odd bytes
   3306     vpsrlw     ymm3, ymm1, 8
   3307     vpand      ymm0, ymm0, ymm5   // even bytes
   3308     vpand      ymm1, ymm1, ymm5
   3309     vpackuswb  ymm0, ymm0, ymm1
   3310     vpackuswb  ymm2, ymm2, ymm3
   3311     vpermq     ymm0, ymm0, 0xd8
   3312     vpermq     ymm2, ymm2, 0xd8
   3313     vmovdqu    [edx], ymm0
   3314     vmovdqu    [edx + edi], ymm2
   3315     lea        edx, [edx + 32]
   3316     sub        ecx, 32
   3317     jg         convertloop
   3318 
   3319     pop        edi
   3320     vzeroupper
   3321     ret
   3322   }
   3323 }
   3324 #endif  // HAS_SPLITUVROW_AVX2
   3325 
   3326 #ifdef HAS_MERGEUVROW_SSE2
   3327 __declspec(naked)
   3328 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   3329                      int width) {
   3330   __asm {
   3331     push       edi
   3332     mov        eax, [esp + 4 + 4]    // src_u
   3333     mov        edx, [esp + 4 + 8]    // src_v
   3334     mov        edi, [esp + 4 + 12]   // dst_uv
   3335     mov        ecx, [esp + 4 + 16]   // width
   3336     sub        edx, eax
   3337 
   3338   convertloop:
   3339     movdqu     xmm0, [eax]      // read 16 U's
   3340     movdqu     xmm1, [eax + edx]  // and 16 V's
   3341     lea        eax,  [eax + 16]
   3342     movdqa     xmm2, xmm0
   3343     punpcklbw  xmm0, xmm1       // first 8 UV pairs
   3344     punpckhbw  xmm2, xmm1       // next 8 UV pairs
   3345     movdqu     [edi], xmm0
   3346     movdqu     [edi + 16], xmm2
   3347     lea        edi, [edi + 32]
   3348     sub        ecx, 16
   3349     jg         convertloop
   3350 
   3351     pop        edi
   3352     ret
   3353   }
   3354 }
   3355 #endif  //  HAS_MERGEUVROW_SSE2
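
// The matching scalar interleave for MergeUVRow (sketch; hypothetical name):
static void MergeUVRowScalar(const uint8* src_u, const uint8* src_v,
                             uint8* dst_uv, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[i * 2 + 0] = src_u[i];  // punpcklbw/punpckhbw weave the UV pairs
    dst_uv[i * 2 + 1] = src_v[i];
  }
}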
   3356 
   3357 #ifdef HAS_MERGEUVROW_AVX2
   3358 __declspec(naked)
   3359 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   3360                      int width) {
   3361   __asm {
   3362     push       edi
   3363     mov        eax, [esp + 4 + 4]    // src_u
   3364     mov        edx, [esp + 4 + 8]    // src_v
   3365     mov        edi, [esp + 4 + 12]   // dst_uv
   3366     mov        ecx, [esp + 4 + 16]   // width
   3367     sub        edx, eax
   3368 
   3369   convertloop:
   3370     vmovdqu    ymm0, [eax]           // read 32 U's
   3371     vmovdqu    ymm1, [eax + edx]     // and 32 V's
   3372     lea        eax,  [eax + 32]
   3373     vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
   3374     vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
   3375     vextractf128 [edi], ymm2, 0       // bytes 0..15
   3376     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
   3377     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
   3378     vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
   3379     lea        edi, [edi + 64]
   3380     sub        ecx, 32
   3381     jg         convertloop
   3382 
   3383     pop        edi
   3384     vzeroupper
   3385     ret
   3386   }
   3387 }
   3388 #endif  //  HAS_MERGEUVROW_AVX2
   3389 
   3390 #ifdef HAS_COPYROW_SSE2
   3391 // CopyRow copies 'count' bytes using 16 byte load/stores, 32 bytes at a time.
   3392 __declspec(naked)
   3393 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   3394   __asm {
   3395     mov        eax, [esp + 4]   // src
   3396     mov        edx, [esp + 8]   // dst
   3397     mov        ecx, [esp + 12]  // count
   3398     test       eax, 15
   3399     jne        convertloopu
   3400     test       edx, 15
   3401     jne        convertloopu
   3402 
   3403   convertloopa:
   3404     movdqa     xmm0, [eax]
   3405     movdqa     xmm1, [eax + 16]
   3406     lea        eax, [eax + 32]
   3407     movdqa     [edx], xmm0
   3408     movdqa     [edx + 16], xmm1
   3409     lea        edx, [edx + 32]
   3410     sub        ecx, 32
   3411     jg         convertloopa
   3412     ret
   3413 
   3414   convertloopu:
   3415     movdqu     xmm0, [eax]
   3416     movdqu     xmm1, [eax + 16]
   3417     lea        eax, [eax + 32]
   3418     movdqu     [edx], xmm0
   3419     movdqu     [edx + 16], xmm1
   3420     lea        edx, [edx + 32]
   3421     sub        ecx, 32
   3422     jg         convertloopu
   3423     ret
   3424   }
   3425 }
   3426 #endif  // HAS_COPYROW_SSE2
   3427 
   3428 #ifdef HAS_COPYROW_AVX
   3429 // CopyRow copies 'count' bytes using 32 byte load/stores, 64 bytes at a time.
   3430 __declspec(naked)
   3431 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
   3432   __asm {
   3433     mov        eax, [esp + 4]   // src
   3434     mov        edx, [esp + 8]   // dst
   3435     mov        ecx, [esp + 12]  // count
   3436 
   3437   convertloop:
   3438     vmovdqu    ymm0, [eax]
   3439     vmovdqu    ymm1, [eax + 32]
   3440     lea        eax, [eax + 64]
   3441     vmovdqu    [edx], ymm0
   3442     vmovdqu    [edx + 32], ymm1
   3443     lea        edx, [edx + 64]
   3444     sub        ecx, 64
   3445     jg         convertloop
   3446 
   3447     vzeroupper
   3448     ret
   3449   }
   3450 }
   3451 #endif  // HAS_COPYROW_AVX
   3452 
   3453 // CopyRow copies 'count' bytes (any multiple of 1) using rep movsb.
   3454 __declspec(naked)
   3455 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   3456   __asm {
   3457     mov        eax, esi         // save esi
   3458     mov        edx, edi         // save edi
   3459     mov        esi, [esp + 4]   // src
   3460     mov        edi, [esp + 8]   // dst
   3461     mov        ecx, [esp + 12]  // count
   3462     rep movsb
   3463     mov        edi, edx         // restore edi
   3464     mov        esi, eax         // restore esi
   3465     ret
   3466   }
   3467 }
   3468 
   3469 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
   3470 // width in pixels
   3471 __declspec(naked)
   3472 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   3473   __asm {
   3474     mov        eax, [esp + 4]   // src
   3475     mov        edx, [esp + 8]   // dst
   3476     mov        ecx, [esp + 12]  // width
   3477     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
   3478     pslld      xmm0, 24
   3479     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
   3480     psrld      xmm1, 8
   3481 
   3482   convertloop:
   3483     movdqu     xmm2, [eax]
   3484     movdqu     xmm3, [eax + 16]
   3485     lea        eax, [eax + 32]
   3486     movdqu     xmm4, [edx]
   3487     movdqu     xmm5, [edx + 16]
   3488     pand       xmm2, xmm0
   3489     pand       xmm3, xmm0
   3490     pand       xmm4, xmm1
   3491     pand       xmm5, xmm1
   3492     por        xmm2, xmm4
   3493     por        xmm3, xmm5
   3494     movdqu     [edx], xmm2
   3495     movdqu     [edx + 16], xmm3
   3496     lea        edx, [edx + 32]
   3497     sub        ecx, 8
   3498     jg         convertloop
   3499 
   3500     ret
   3501   }
   3502 }
   3503 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
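
// In scalar terms the alpha copy above keeps the destination's BGR (mask
// 0x00ffffff) and takes the source's A (mask 0xff000000).  Sketch only;
// hypothetical name, not part of the original source:
static void ARGBCopyAlphaRowScalar(const uint8* src, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i * 4 + 3] = src[i * 4 + 3];  // byte 3 of each BGRA dword is alpha
  }
}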
   3504 
   3505 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
   3506 // width in pixels
   3507 __declspec(naked)
   3508 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   3509   __asm {
   3510     mov        eax, [esp + 4]   // src
   3511     mov        edx, [esp + 8]   // dst
   3512     mov        ecx, [esp + 12]  // width
   3513     vpcmpeqb   ymm0, ymm0, ymm0
   3514     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
   3515 
   3516   convertloop:
   3517     vmovdqu    ymm1, [eax]
   3518     vmovdqu    ymm2, [eax + 32]
   3519     lea        eax, [eax + 64]
   3520     vpblendvb  ymm1, ymm1, [edx], ymm0
   3521     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3522     vmovdqu    [edx], ymm1
   3523     vmovdqu    [edx + 32], ymm2
   3524     lea        edx, [edx + 64]
   3525     sub        ecx, 16
   3526     jg         convertloop
   3527 
   3528     vzeroupper
   3529     ret
   3530   }
   3531 }
   3532 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
   3533 
   3534 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
   3535 // width in pixels
   3536 __declspec(naked)
   3537 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
   3538   __asm {
   3539     mov        eax, [esp + 4]   // src_argb
   3540     mov        edx, [esp + 8]   // dst_a
   3541     mov        ecx, [esp + 12]  // width
   3542 
   3543   extractloop:
   3544     movdqu     xmm0, [eax]
   3545     movdqu     xmm1, [eax + 16]
   3546     lea        eax, [eax + 32]
   3547     psrld      xmm0, 24
   3548     psrld      xmm1, 24
   3549     packssdw   xmm0, xmm1
   3550     packuswb   xmm0, xmm0
   3551     movq       qword ptr [edx], xmm0
   3552     lea        edx, [edx + 8]
   3553     sub        ecx, 8
   3554     jg         extractloop
   3555 
   3556     ret
   3557   }
   3558 }
   3559 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
   3560 
   3561 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
   3562 // width in pixels
   3563 __declspec(naked)
   3564 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   3565   __asm {
   3566     mov        eax, [esp + 4]   // src
   3567     mov        edx, [esp + 8]   // dst
   3568     mov        ecx, [esp + 12]  // width
   3569     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
   3570     pslld      xmm0, 24
   3571     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
   3572     psrld      xmm1, 8
   3573 
   3574   convertloop:
   3575     movq       xmm2, qword ptr [eax]  // 8 Y's
   3576     lea        eax, [eax + 8]
   3577     punpcklbw  xmm2, xmm2             // YY
   3578     punpckhwd  xmm3, xmm2             // Y's 4..7 in top bytes; rest masked below
   3579     punpcklwd  xmm2, xmm2             // Y's 0..3 in top bytes
   3580     movdqu     xmm4, [edx]
   3581     movdqu     xmm5, [edx + 16]
   3582     pand       xmm2, xmm0
   3583     pand       xmm3, xmm0
   3584     pand       xmm4, xmm1
   3585     pand       xmm5, xmm1
   3586     por        xmm2, xmm4
   3587     por        xmm3, xmm5
   3588     movdqu     [edx], xmm2
   3589     movdqu     [edx + 16], xmm3
   3590     lea        edx, [edx + 32]
   3591     sub        ecx, 8
   3592     jg         convertloop
   3593 
   3594     ret
   3595   }
   3596 }
   3597 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
   3598 
   3599 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
   3600 // width in pixels
   3601 __declspec(naked)
   3602 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   3603   __asm {
   3604     mov        eax, [esp + 4]   // src
   3605     mov        edx, [esp + 8]   // dst
   3606     mov        ecx, [esp + 12]  // width
   3607     vpcmpeqb   ymm0, ymm0, ymm0
   3608     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
   3609 
   3610   convertloop:
   3611     vpmovzxbd  ymm1, qword ptr [eax]
   3612     vpmovzxbd  ymm2, qword ptr [eax + 8]
   3613     lea        eax, [eax + 16]
   3614     vpslld     ymm1, ymm1, 24
   3615     vpslld     ymm2, ymm2, 24
   3616     vpblendvb  ymm1, ymm1, [edx], ymm0
   3617     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3618     vmovdqu    [edx], ymm1
   3619     vmovdqu    [edx + 32], ymm2
   3620     lea        edx, [edx + 64]
   3621     sub        ecx, 16
   3622     jg         convertloop
   3623 
   3624     vzeroupper
   3625     ret
   3626   }
   3627 }
   3628 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
   3629 
   3630 #ifdef HAS_SETROW_X86
   3631 // Write 'count' bytes using an 8 bit value repeated.
   3632 // Count should be multiple of 4.
   3633 __declspec(naked)
   3634 void SetRow_X86(uint8* dst, uint8 v8, int count) {
   3635   __asm {
   3636     movzx      eax, byte ptr [esp + 8]    // v8
   3637     mov        edx, 0x01010101  // Duplicate byte to all bytes.
   3638     mul        edx              // overwrites edx with upper part of result.
   3639     mov        edx, edi
   3640     mov        edi, [esp + 4]   // dst
   3641     mov        ecx, [esp + 12]  // count
   3642     shr        ecx, 2
   3643     rep stosd
   3644     mov        edi, edx
   3645     ret
   3646   }
   3647 }
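
// SetRow_X86 splats the byte with a multiply: v8 * 0x01010101 copies it into
// all four bytes of a dword, which rep stosd then stores.  Scalar sketch
// (hypothetical name; like the original it assumes count is a multiple of 4):
static void SetRowScalar(uint8* dst, uint8 v8, int count) {
  uint32 v32 = v8 * 0x01010101u;  // duplicate the byte into all four lanes
  int i;
  for (i = 0; i < count / 4; ++i) {
    ((uint32*)dst)[i] = v32;      // one rep stosd step (assumes aligned dst)
  }
}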
   3648 
   3649 // Write 'count' bytes using an 8 bit value repeated.
   3650 __declspec(naked)
   3651 void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
   3652   __asm {
   3653     mov        edx, edi
   3654     mov        edi, [esp + 4]   // dst
   3655     mov        eax, [esp + 8]   // v8
   3656     mov        ecx, [esp + 12]  // count
   3657     rep stosb
   3658     mov        edi, edx
   3659     ret
   3660   }
   3661 }
   3662 
   3663 // Write 'count' 32 bit values.
   3664 __declspec(naked)
   3665 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
   3666   __asm {
   3667     mov        edx, edi
   3668     mov        edi, [esp + 4]   // dst
   3669     mov        eax, [esp + 8]   // v32
   3670     mov        ecx, [esp + 12]  // count
   3671     rep stosd
   3672     mov        edi, edx
   3673     ret
   3674   }
   3675 }
   3676 #endif  // HAS_SETROW_X86
   3677 
   3678 #ifdef HAS_YUY2TOYROW_AVX2
   3679 __declspec(naked)
   3680 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
   3681   __asm {
   3682     mov        eax, [esp + 4]    // src_yuy2
   3683     mov        edx, [esp + 8]    // dst_y
   3684     mov        ecx, [esp + 12]   // width
   3685     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3686     vpsrlw     ymm5, ymm5, 8
   3687 
   3688   convertloop:
   3689     vmovdqu    ymm0, [eax]
   3690     vmovdqu    ymm1, [eax + 32]
   3691     lea        eax,  [eax + 64]
   3692     vpand      ymm0, ymm0, ymm5   // even bytes are Y
   3693     vpand      ymm1, ymm1, ymm5
   3694     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3695     vpermq     ymm0, ymm0, 0xd8
   3696     vmovdqu    [edx], ymm0
   3697     lea        edx, [edx + 32]
   3698     sub        ecx, 32
   3699     jg         convertloop
   3700     vzeroupper
   3701     ret
   3702   }
   3703 }
   3704 
   3705 __declspec(naked)
   3706 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
   3707                       uint8* dst_u, uint8* dst_v, int width) {
   3708   __asm {
   3709     push       esi
   3710     push       edi
   3711     mov        eax, [esp + 8 + 4]    // src_yuy2
   3712     mov        esi, [esp + 8 + 8]    // stride_yuy2
   3713     mov        edx, [esp + 8 + 12]   // dst_u
   3714     mov        edi, [esp + 8 + 16]   // dst_v
   3715     mov        ecx, [esp + 8 + 20]   // width
   3716     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3717     vpsrlw     ymm5, ymm5, 8
   3718     sub        edi, edx
   3719 
   3720   convertloop:
   3721     vmovdqu    ymm0, [eax]
   3722     vmovdqu    ymm1, [eax + 32]
   3723     vpavgb     ymm0, ymm0, [eax + esi]
   3724     vpavgb     ymm1, ymm1, [eax + esi + 32]
   3725     lea        eax,  [eax + 64]
   3726     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
   3727     vpsrlw     ymm1, ymm1, 8
   3728     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3729     vpermq     ymm0, ymm0, 0xd8
   3730     vpand      ymm1, ymm0, ymm5  // U
   3731     vpsrlw     ymm0, ymm0, 8     // V
   3732     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3733     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3734     vpermq     ymm1, ymm1, 0xd8
   3735     vpermq     ymm0, ymm0, 0xd8
   3736     vextractf128 [edx], ymm1, 0  // U
   3737     vextractf128 [edx + edi], ymm0, 0 // V
   3738     lea        edx, [edx + 16]
   3739     sub        ecx, 32
   3740     jg         convertloop
   3741 
   3742     pop        edi
   3743     pop        esi
   3744     vzeroupper
   3745     ret
   3746   }
   3747 }
   3748 
   3749 __declspec(naked)
   3750 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
   3751                          uint8* dst_u, uint8* dst_v, int width) {
   3752   __asm {
   3753     push       edi
   3754     mov        eax, [esp + 4 + 4]    // src_yuy2
   3755     mov        edx, [esp + 4 + 8]    // dst_u
   3756     mov        edi, [esp + 4 + 12]   // dst_v
   3757     mov        ecx, [esp + 4 + 16]   // width
   3758     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3759     vpsrlw     ymm5, ymm5, 8
   3760     sub        edi, edx
   3761 
   3762   convertloop:
   3763     vmovdqu    ymm0, [eax]
   3764     vmovdqu    ymm1, [eax + 32]
   3765     lea        eax,  [eax + 64]
   3766     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
   3767     vpsrlw     ymm1, ymm1, 8
   3768     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3769     vpermq     ymm0, ymm0, 0xd8
   3770     vpand      ymm1, ymm0, ymm5  // U
   3771     vpsrlw     ymm0, ymm0, 8     // V
   3772     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3773     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3774     vpermq     ymm1, ymm1, 0xd8
   3775     vpermq     ymm0, ymm0, 0xd8
   3776     vextractf128 [edx], ymm1, 0  // U
   3777     vextractf128 [edx + edi], ymm0, 0 // V
   3778     lea        edx, [edx + 16]
   3779     sub        ecx, 32
   3780     jg         convertloop
   3781 
   3782     pop        edi
   3783     vzeroupper
   3784     ret
   3785   }
   3786 }
   3787 
   3788 __declspec(naked)
   3789 void UYVYToYRow_AVX2(const uint8* src_uyvy,
   3790                      uint8* dst_y, int width) {
   3791   __asm {
   3792     mov        eax, [esp + 4]    // src_uyvy
   3793     mov        edx, [esp + 8]    // dst_y
   3794     mov        ecx, [esp + 12]   // width
   3795 
   3796   convertloop:
   3797     vmovdqu    ymm0, [eax]
   3798     vmovdqu    ymm1, [eax + 32]
   3799     lea        eax,  [eax + 64]
   3800     vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
   3801     vpsrlw     ymm1, ymm1, 8
   3802     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3803     vpermq     ymm0, ymm0, 0xd8
   3804     vmovdqu    [edx], ymm0
   3805     lea        edx, [edx + 32]
   3806     sub        ecx, 32
   3807     jg         convertloop
   3808     vzeroupper
   3809     ret
   3810   }
   3811 }
   3812 
   3813 __declspec(naked)
   3814 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
   3815                       uint8* dst_u, uint8* dst_v, int width) {
   3816   __asm {
   3817     push       esi
   3818     push       edi
   3819     mov        eax, [esp + 8 + 4]    // src_uyvy
   3820     mov        esi, [esp + 8 + 8]    // stride_uyvy
   3821     mov        edx, [esp + 8 + 12]   // dst_u
   3822     mov        edi, [esp + 8 + 16]   // dst_v
   3823     mov        ecx, [esp + 8 + 20]   // width
   3824     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3825     vpsrlw     ymm5, ymm5, 8
   3826     sub        edi, edx
   3827 
   3828   convertloop:
   3829     vmovdqu    ymm0, [eax]
   3830     vmovdqu    ymm1, [eax + 32]
   3831     vpavgb     ymm0, ymm0, [eax + esi]
   3832     vpavgb     ymm1, ymm1, [eax + esi + 32]
   3833     lea        eax,  [eax + 64]
   3834     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
   3835     vpand      ymm1, ymm1, ymm5
   3836     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3837     vpermq     ymm0, ymm0, 0xd8
   3838     vpand      ymm1, ymm0, ymm5  // U
   3839     vpsrlw     ymm0, ymm0, 8     // V
   3840     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3841     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3842     vpermq     ymm1, ymm1, 0xd8
   3843     vpermq     ymm0, ymm0, 0xd8
   3844     vextractf128 [edx], ymm1, 0  // U
   3845     vextractf128 [edx + edi], ymm0, 0 // V
   3846     lea        edx, [edx + 16]
   3847     sub        ecx, 32
   3848     jg         convertloop
   3849 
   3850     pop        edi
   3851     pop        esi
   3852     vzeroupper
   3853     ret
   3854   }
   3855 }
   3856 
   3857 __declspec(naked)
   3858 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
   3859                          uint8* dst_u, uint8* dst_v, int width) {
   3860   __asm {
   3861     push       edi
   3862     mov        eax, [esp + 4 + 4]    // src_uyvy
   3863     mov        edx, [esp + 4 + 8]    // dst_u
   3864     mov        edi, [esp + 4 + 12]   // dst_v
   3865     mov        ecx, [esp + 4 + 16]   // width
   3866     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3867     vpsrlw     ymm5, ymm5, 8
   3868     sub        edi, edx
   3869 
   3870   convertloop:
   3871     vmovdqu    ymm0, [eax]
   3872     vmovdqu    ymm1, [eax + 32]
   3873     lea        eax,  [eax + 64]
   3874     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
   3875     vpand      ymm1, ymm1, ymm5
   3876     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3877     vpermq     ymm0, ymm0, 0xd8
   3878     vpand      ymm1, ymm0, ymm5  // U
   3879     vpsrlw     ymm0, ymm0, 8     // V
   3880     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3881     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3882     vpermq     ymm1, ymm1, 0xd8
   3883     vpermq     ymm0, ymm0, 0xd8
   3884     vextractf128 [edx], ymm1, 0  // U
   3885     vextractf128 [edx + edi], ymm0, 0 // V
   3886     lea        edx, [edx + 16]
   3887     sub        ecx, 32
   3888     jg         convertloop
   3889 
   3890     pop        edi
   3891     vzeroupper
   3892     ret
   3893   }
   3894 }
   3895 #endif  // HAS_YUY2TOYROW_AVX2
   3896 
   3897 #ifdef HAS_YUY2TOYROW_SSE2
   3898 __declspec(naked)
   3899 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   3900                      uint8* dst_y, int width) {
   3901   __asm {
   3902     mov        eax, [esp + 4]    // src_yuy2
   3903     mov        edx, [esp + 8]    // dst_y
   3904     mov        ecx, [esp + 12]   // width
   3905     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
   3906     psrlw      xmm5, 8
   3907 
   3908   convertloop:
   3909     movdqu     xmm0, [eax]
   3910     movdqu     xmm1, [eax + 16]
   3911     lea        eax,  [eax + 32]
   3912     pand       xmm0, xmm5   // even bytes are Y
   3913     pand       xmm1, xmm5
   3914     packuswb   xmm0, xmm1
   3915     movdqu     [edx], xmm0
   3916     lea        edx, [edx + 16]
   3917     sub        ecx, 16
   3918     jg         convertloop
   3919     ret
   3920   }
   3921 }
   3922 
   3923 __declspec(naked)
   3924 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   3925                       uint8* dst_u, uint8* dst_v, int width) {
   3926   __asm {
   3927     push       esi
   3928     push       edi
   3929     mov        eax, [esp + 8 + 4]    // src_yuy2
   3930     mov        esi, [esp + 8 + 8]    // stride_yuy2
   3931     mov        edx, [esp + 8 + 12]   // dst_u
   3932     mov        edi, [esp + 8 + 16]   // dst_v
   3933     mov        ecx, [esp + 8 + 20]   // width
   3934     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3935     psrlw      xmm5, 8
   3936     sub        edi, edx
   3937 
   3938   convertloop:
   3939     movdqu     xmm0, [eax]
   3940     movdqu     xmm1, [eax + 16]
   3941     movdqu     xmm2, [eax + esi]
   3942     movdqu     xmm3, [eax + esi + 16]
   3943     lea        eax,  [eax + 32]
   3944     pavgb      xmm0, xmm2
   3945     pavgb      xmm1, xmm3
   3946     psrlw      xmm0, 8      // YUYV -> UVUV
   3947     psrlw      xmm1, 8
   3948     packuswb   xmm0, xmm1
   3949     movdqa     xmm1, xmm0
   3950     pand       xmm0, xmm5  // U
   3951     packuswb   xmm0, xmm0
   3952     psrlw      xmm1, 8     // V
   3953     packuswb   xmm1, xmm1
   3954     movq       qword ptr [edx], xmm0
   3955     movq       qword ptr [edx + edi], xmm1
   3956     lea        edx, [edx + 8]
   3957     sub        ecx, 16
   3958     jg         convertloop
   3959 
   3960     pop        edi
   3961     pop        esi
   3962     ret
   3963   }
   3964 }
   3965 
   3966 __declspec(naked)
   3967 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   3968                          uint8* dst_u, uint8* dst_v, int width) {
   3969   __asm {
   3970     push       edi
   3971     mov        eax, [esp + 4 + 4]    // src_yuy2
   3972     mov        edx, [esp + 4 + 8]    // dst_u
   3973     mov        edi, [esp + 4 + 12]   // dst_v
   3974     mov        ecx, [esp + 4 + 16]   // width
   3975     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3976     psrlw      xmm5, 8
   3977     sub        edi, edx
   3978 
   3979   convertloop:
   3980     movdqu     xmm0, [eax]
   3981     movdqu     xmm1, [eax + 16]
   3982     lea        eax,  [eax + 32]
   3983     psrlw      xmm0, 8      // YUYV -> UVUV
   3984     psrlw      xmm1, 8
   3985     packuswb   xmm0, xmm1
   3986     movdqa     xmm1, xmm0
   3987     pand       xmm0, xmm5  // U
   3988     packuswb   xmm0, xmm0
   3989     psrlw      xmm1, 8     // V
   3990     packuswb   xmm1, xmm1
   3991     movq       qword ptr [edx], xmm0
   3992     movq       qword ptr [edx + edi], xmm1
   3993     lea        edx, [edx + 8]
   3994     sub        ecx, 16
   3995     jg         convertloop
   3996 
   3997     pop        edi
   3998     ret
   3999   }
   4000 }
   4001 
   4002 __declspec(naked)
   4003 void UYVYToYRow_SSE2(const uint8* src_uyvy,
   4004                      uint8* dst_y, int width) {
   4005   __asm {
   4006     mov        eax, [esp + 4]    // src_uyvy
   4007     mov        edx, [esp + 8]    // dst_y
   4008     mov        ecx, [esp + 12]   // width
   4009 
   4010   convertloop:
   4011     movdqu     xmm0, [eax]
   4012     movdqu     xmm1, [eax + 16]
   4013     lea        eax,  [eax + 32]
   4014     psrlw      xmm0, 8    // odd bytes are Y
   4015     psrlw      xmm1, 8
   4016     packuswb   xmm0, xmm1
   4017     movdqu     [edx], xmm0
   4018     lea        edx, [edx + 16]
   4019     sub        ecx, 16
   4020     jg         convertloop
   4021     ret
   4022   }
   4023 }
   4024 
   4025 __declspec(naked)
   4026 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   4027                       uint8* dst_u, uint8* dst_v, int width) {
   4028   __asm {
   4029     push       esi
   4030     push       edi
   4031     mov        eax, [esp + 8 + 4]    // src_uyvy
   4032     mov        esi, [esp + 8 + 8]    // stride_uyvy
   4033     mov        edx, [esp + 8 + 12]   // dst_u
   4034     mov        edi, [esp + 8 + 16]   // dst_v
   4035     mov        ecx, [esp + 8 + 20]   // width
   4036     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4037     psrlw      xmm5, 8
   4038     sub        edi, edx
   4039 
   4040   convertloop:
   4041     movdqu     xmm0, [eax]
   4042     movdqu     xmm1, [eax + 16]
   4043     movdqu     xmm2, [eax + esi]
   4044     movdqu     xmm3, [eax + esi + 16]
   4045     lea        eax,  [eax + 32]
   4046     pavgb      xmm0, xmm2
   4047     pavgb      xmm1, xmm3
   4048     pand       xmm0, xmm5   // UYVY -> UVUV
   4049     pand       xmm1, xmm5
   4050     packuswb   xmm0, xmm1
   4051     movdqa     xmm1, xmm0
   4052     pand       xmm0, xmm5  // U
   4053     packuswb   xmm0, xmm0
   4054     psrlw      xmm1, 8     // V
   4055     packuswb   xmm1, xmm1
   4056     movq       qword ptr [edx], xmm0
   4057     movq       qword ptr [edx + edi], xmm1
   4058     lea        edx, [edx + 8]
   4059     sub        ecx, 16
   4060     jg         convertloop
   4061 
   4062     pop        edi
   4063     pop        esi
   4064     ret
   4065   }
   4066 }
   4067 
   4068 __declspec(naked)
   4069 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
   4070                          uint8* dst_u, uint8* dst_v, int width) {
   4071   __asm {
   4072     push       edi
   4073     mov        eax, [esp + 4 + 4]    // src_uyvy
   4074     mov        edx, [esp + 4 + 8]    // dst_u
   4075     mov        edi, [esp + 4 + 12]   // dst_v
   4076     mov        ecx, [esp + 4 + 16]   // width
   4077     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   4078     psrlw      xmm5, 8
   4079     sub        edi, edx
   4080 
   4081   convertloop:
   4082     movdqu     xmm0, [eax]
   4083     movdqu     xmm1, [eax + 16]
   4084     lea        eax,  [eax + 32]
   4085     pand       xmm0, xmm5   // UYVY -> UVUV
   4086     pand       xmm1, xmm5
   4087     packuswb   xmm0, xmm1
   4088     movdqa     xmm1, xmm0
   4089     pand       xmm0, xmm5  // U
   4090     packuswb   xmm0, xmm0
   4091     psrlw      xmm1, 8     // V
   4092     packuswb   xmm1, xmm1
   4093     movq       qword ptr [edx], xmm0
   4094     movq       qword ptr [edx + edi], xmm1
   4095     lea        edx, [edx + 8]
   4096     sub        ecx, 16
   4097     jg         convertloop
   4098 
   4099     pop        edi
   4100     ret
   4101   }
   4102 }
   4103 #endif  // HAS_YUY2TOYROW_SSE2
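
// YUY2 packs pixels as Y0 U0 Y1 V0 and UYVY as U0 Y0 V0 Y1.  A scalar sketch
// of the YUY2 luma extraction the loops above vectorize (hypothetical name):
static void YUY2ToYRowScalar(const uint8* src_yuy2, uint8* dst_y, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_y[i] = src_yuy2[i * 2];  // even bytes are Y (the 0x00ff00ff mask)
  }
}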
   4104 
   4105 #ifdef HAS_BLENDPLANEROW_SSSE3
   4106 // Blend 8 pixels at a time.
   4107 // unsigned version of math
   4108 // =((A2*C2)+(B2*(255-C2))+255)/256
   4109 // signed version of math
   4110 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
   4111 __declspec(naked)
   4112 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
   4113                          const uint8* alpha, uint8* dst, int width) {
   4114   __asm {
   4115     push       esi
   4116     push       edi
   4117     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   4118     psllw      xmm5, 8
   4119     mov        eax, 0x80808080  // 128 for biasing image to signed.
   4120     movd       xmm6, eax
   4121     pshufd     xmm6, xmm6, 0x00
   4122 
   4123     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
   4124     movd       xmm7, eax
   4125     pshufd     xmm7, xmm7, 0x00
   4126     mov        eax, [esp + 8 + 4]   // src0
   4127     mov        edx, [esp + 8 + 8]   // src1
   4128     mov        esi, [esp + 8 + 12]  // alpha
   4129     mov        edi, [esp + 8 + 16]  // dst
   4130     mov        ecx, [esp + 8 + 20]  // width
   4131     sub        eax, esi
   4132     sub        edx, esi
   4133     sub        edi, esi
   4134 
   4135     // 8 pixel loop.
   4136   convertloop8:
   4137     movq       xmm0, qword ptr [esi]        // alpha
   4138     punpcklbw  xmm0, xmm0
   4139     pxor       xmm0, xmm5         // a, 255-a
   4140     movq       xmm1, qword ptr [eax + esi]  // src0
   4141     movq       xmm2, qword ptr [edx + esi]  // src1
   4142     punpcklbw  xmm1, xmm2
   4143     psubb      xmm1, xmm6         // bias src0/1 - 128
   4144     pmaddubsw  xmm0, xmm1
   4145     paddw      xmm0, xmm7         // unbias result - 32768 and round.
   4146     psrlw      xmm0, 8
   4147     packuswb   xmm0, xmm0
   4148     movq       qword ptr [edi + esi], xmm0
   4149     lea        esi, [esi + 8]
   4150     sub        ecx, 8
   4151     jg         convertloop8
   4152 
   4153     pop        edi
   4154     pop        esi
   4155     ret
   4156   }
   4157 }
   4158 #endif  // HAS_BLENDPLANEROW_SSSE3
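
// Scalar equivalent of the unsigned blend formula above,
// dst = (src0 * a + src1 * (255 - a) + 255) / 256. The SSSE3 row above (and
// the AVX2 row below) bias the sources by 128 so pmaddubsw's signed operand
// stays in range, then unbias with the 0x807f add; the result matches this
// sketch (illustrative helper, not part of the library):
static void BlendPlaneRow_Sketch(const uint8* src0, const uint8* src1,
                                 const uint8* alpha, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = alpha[x];
    dst[x] = (uint8)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}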
   4159 
   4160 #ifdef HAS_BLENDPLANEROW_AVX2
   4161 // Blend 32 pixels at a time.
   4162 // unsigned version of math
   4163 // =((A2*C2)+(B2*(255-C2))+255)/256
   4164 // signed version of math
   4165 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
   4166 __declspec(naked)
   4167 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
   4168                          const uint8* alpha, uint8* dst, int width) {
   4169   __asm {
   4170     push        esi
   4171     push        edi
   4172     vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
   4173     vpsllw      ymm5, ymm5, 8
   4174     mov         eax, 0x80808080  // 128 for biasing image to signed.
   4175     vmovd       xmm6, eax
   4176     vbroadcastss ymm6, xmm6
   4177     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
   4178     vmovd       xmm7, eax
   4179     vbroadcastss ymm7, xmm7
   4180     mov         eax, [esp + 8 + 4]   // src0
   4181     mov         edx, [esp + 8 + 8]   // src1
   4182     mov         esi, [esp + 8 + 12]  // alpha
   4183     mov         edi, [esp + 8 + 16]  // dst
   4184     mov         ecx, [esp + 8 + 20]  // width
   4185     sub         eax, esi
   4186     sub         edx, esi
   4187     sub         edi, esi
   4188 
   4189     // 32 pixel loop.
   4190   convertloop32:
   4191     vmovdqu     ymm0, [esi]        // alpha
   4192     vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
   4193     vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
   4194     vpxor       ymm3, ymm3, ymm5   // a, 255-a
   4195     vpxor       ymm0, ymm0, ymm5   // a, 255-a
   4196     vmovdqu     ymm1, [eax + esi]  // src0
   4197     vmovdqu     ymm2, [edx + esi]  // src1
   4198     vpunpckhbw  ymm4, ymm1, ymm2
   4199     vpunpcklbw  ymm1, ymm1, ymm2
   4200     vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
   4201     vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
   4202     vpmaddubsw  ymm3, ymm3, ymm4
   4203     vpmaddubsw  ymm0, ymm0, ymm1
   4204     vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
   4205     vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
   4206     vpsrlw      ymm3, ymm3, 8
   4207     vpsrlw      ymm0, ymm0, 8
   4208     vpackuswb   ymm0, ymm0, ymm3
   4209     vmovdqu     [edi + esi], ymm0
   4210     lea         esi, [esi + 32]
   4211     sub         ecx, 32
   4212     jg          convertloop32
   4213 
   4214     pop         edi
   4215     pop         esi
   4216     vzeroupper
   4217     ret
   4218   }
   4219 }
   4220 #endif  // HAS_BLENDPLANEROW_AVX2
   4221 
   4222 #ifdef HAS_ARGBBLENDROW_SSSE3
   4223 // Shuffle table for isolating alpha.
   4224 static const uvec8 kShuffleAlpha = {
   4225   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
   4226   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
   4227 };
   4228 
   4229 // Blend 8 pixels at a time.
   4230 __declspec(naked)
   4231 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
   4232                         uint8* dst_argb, int width) {
   4233   __asm {
   4234     push       esi
   4235     mov        eax, [esp + 4 + 4]   // src_argb0
   4236     mov        esi, [esp + 4 + 8]   // src_argb1
   4237     mov        edx, [esp + 4 + 12]  // dst_argb
   4238     mov        ecx, [esp + 4 + 16]  // width
   4239     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
   4240     psrlw      xmm7, 15
   4241     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
   4242     psrlw      xmm6, 8
   4243     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   4244     psllw      xmm5, 8
   4245     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   4246     pslld      xmm4, 24
   4247     sub        ecx, 4
   4248     jl         convertloop4b    // less than 4 pixels?
   4249 
   4250     // 4 pixel loop.
   4251   convertloop4:
   4252     movdqu     xmm3, [eax]      // src argb
   4253     lea        eax, [eax + 16]
   4254     movdqa     xmm0, xmm3       // src argb
   4255     pxor       xmm3, xmm4       // ~alpha
   4256     movdqu     xmm2, [esi]      // _r_b
   4257     pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
   4258     pand       xmm2, xmm6       // _r_b
   4259     paddw      xmm3, xmm7       // 256 - alpha
   4260     pmullw     xmm2, xmm3       // _r_b * alpha
   4261     movdqu     xmm1, [esi]      // _a_g
   4262     lea        esi, [esi + 16]
   4263     psrlw      xmm1, 8          // _a_g
   4264     por        xmm0, xmm4       // set alpha to 255
   4265     pmullw     xmm1, xmm3       // _a_g * alpha
   4266     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4267     paddusb    xmm0, xmm2       // + src argb
   4268     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4269     paddusb    xmm0, xmm1       // + src argb
   4270     movdqu     [edx], xmm0
   4271     lea        edx, [edx + 16]
   4272     sub        ecx, 4
   4273     jge        convertloop4
   4274 
   4275   convertloop4b:
   4276     add        ecx, 4 - 1
   4277     jl         convertloop1b
   4278 
   4279     // 1 pixel loop.
   4280   convertloop1:
   4281     movd       xmm3, [eax]      // src argb
   4282     lea        eax, [eax + 4]
   4283     movdqa     xmm0, xmm3       // src argb
   4284     pxor       xmm3, xmm4       // ~alpha
   4285     movd       xmm2, [esi]      // _r_b
   4286     pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
   4287     pand       xmm2, xmm6       // _r_b
   4288     paddw      xmm3, xmm7       // 256 - alpha
   4289     pmullw     xmm2, xmm3       // _r_b * alpha
   4290     movd       xmm1, [esi]      // _a_g
   4291     lea        esi, [esi + 4]
   4292     psrlw      xmm1, 8          // _a_g
   4293     por        xmm0, xmm4       // set alpha to 255
   4294     pmullw     xmm1, xmm3       // _a_g * alpha
   4295     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4296     paddusb    xmm0, xmm2       // + src argb
   4297     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4298     paddusb    xmm0, xmm1       // + src argb
   4299     movd       [edx], xmm0
   4300     lea        edx, [edx + 4]
   4301     sub        ecx, 1
   4302     jge        convertloop1
   4303 
   4304   convertloop1b:
   4305     pop        esi
   4306     ret
   4307   }
   4308 }
   4309 #endif  // HAS_ARGBBLENDROW_SSSE3
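
// One-pixel scalar model of the blend above: the source pixel is written with
// alpha forced to 255 and the destination contributes (256 - a) / 256 of each
// channel, added with unsigned saturation. A sketch under those assumptions,
// not the library's C path:
static void ARGBBlendPixel_Sketch(const uint8* fg, const uint8* bg,
                                  uint8* dst) {
  int ia = 256 - fg[3];  // inverse alpha, 1..256, matching "paddw xmm3, xmm7".
  int i;
  for (i = 0; i < 3; ++i) {
    int v = fg[i] + ((bg[i] * ia) >> 8);
    dst[i] = (uint8)(v > 255 ? 255 : v);
  }
  dst[3] = 255;  // alpha forced opaque, matching "por xmm0, xmm4".
}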
   4310 
   4311 #ifdef HAS_ARGBATTENUATEROW_SSSE3
   4312 // Shuffle table duplicating alpha.
   4313 static const uvec8 kShuffleAlpha0 = {
   4314   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
   4315 };
   4316 static const uvec8 kShuffleAlpha1 = {
   4317   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   4318   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
   4319 };
   4320 __declspec(naked)
   4321 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   4322   __asm {
   4323     mov        eax, [esp + 4]   // src_argb0
   4324     mov        edx, [esp + 8]   // dst_argb
   4325     mov        ecx, [esp + 12]  // width
   4326     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
   4327     pslld      xmm3, 24
   4328     movdqa     xmm4, xmmword ptr kShuffleAlpha0
   4329     movdqa     xmm5, xmmword ptr kShuffleAlpha1
   4330 
   4331  convertloop:
   4332     movdqu     xmm0, [eax]      // read 4 pixels
   4333     pshufb     xmm0, xmm4       // isolate first 2 alphas
   4334     movdqu     xmm1, [eax]      // read 4 pixels
   4335     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
   4336     pmulhuw    xmm0, xmm1       // rgb * a
   4337     movdqu     xmm1, [eax]      // read 4 pixels
   4338     pshufb     xmm1, xmm5       // isolate next 2 alphas
   4339     movdqu     xmm2, [eax]      // read 4 pixels
   4340     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
   4341     pmulhuw    xmm1, xmm2       // rgb * a
   4342     movdqu     xmm2, [eax]      // mask original alpha
   4343     lea        eax, [eax + 16]
   4344     pand       xmm2, xmm3
   4345     psrlw      xmm0, 8
   4346     psrlw      xmm1, 8
   4347     packuswb   xmm0, xmm1
   4348     por        xmm0, xmm2       // copy original alpha
   4349     movdqu     [edx], xmm0
   4350     lea        edx, [edx + 16]
   4351     sub        ecx, 4
   4352     jg         convertloop
   4353 
   4354     ret
   4355   }
   4356 }
   4357 #endif  // HAS_ARGBATTENUATEROW_SSSE3
   4358 
   4359 #ifdef HAS_ARGBATTENUATEROW_AVX2
   4360 // Shuffle table duplicating alpha.
   4361 static const uvec8 kShuffleAlpha_AVX2 = {
   4362   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
   4363 };
   4364 __declspec(naked)
   4365 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
   4366   __asm {
   4367     mov        eax, [esp + 4]   // src_argb0
   4368     mov        edx, [esp + 8]   // dst_argb
   4369     mov        ecx, [esp + 12]  // width
   4370     sub        edx, eax
   4371     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
   4372     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
   4373     vpslld     ymm5, ymm5, 24
   4374 
   4375  convertloop:
   4376     vmovdqu    ymm6, [eax]       // read 8 pixels.
   4377     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   4378     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   4379     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
   4380     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
   4381     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
   4382     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
   4383     vpand      ymm6, ymm6, ymm5  // isolate alpha
   4384     vpsrlw     ymm0, ymm0, 8
   4385     vpsrlw     ymm1, ymm1, 8
   4386     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   4387     vpor       ymm0, ymm0, ymm6  // copy original alpha
   4388     vmovdqu    [eax + edx], ymm0
   4389     lea        eax, [eax + 32]
   4390     sub        ecx, 8
   4391     jg         convertloop
   4392 
   4393     vzeroupper
   4394     ret
   4395   }
   4396 }
   4397 #endif  // HAS_ARGBATTENUATEROW_AVX2
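
// Attenuation premultiplies each color channel by alpha. The SSSE3/AVX2 rows
// above approximate rgb * a / 255 with duplicated-byte words and pmulhuw; a
// plain scalar model (the exact rounding here is illustrative, not bit-exact):
static void ARGBAttenuatePixel_Sketch(const uint8* src, uint8* dst) {
  int a = src[3];
  dst[0] = (uint8)((src[0] * a) / 255);  // B
  dst[1] = (uint8)((src[1] * a) / 255);  // G
  dst[2] = (uint8)((src[2] * a) / 255);  // R
  dst[3] = (uint8)a;                     // alpha is carried through.
}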
   4398 
   4399 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
   4400 // Unattenuate 4 pixels at a time.
   4401 __declspec(naked)
   4402 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   4403                              int width) {
   4404   __asm {
   4405     push       ebx
   4406     push       esi
   4407     push       edi
   4408     mov        eax, [esp + 12 + 4]   // src_argb
   4409     mov        edx, [esp + 12 + 8]   // dst_argb
   4410     mov        ecx, [esp + 12 + 12]  // width
   4411     lea        ebx, fixed_invtbl8
   4412 
   4413  convertloop:
   4414     movdqu     xmm0, [eax]      // read 4 pixels
   4415     movzx      esi, byte ptr [eax + 3]  // first alpha
   4416     movzx      edi, byte ptr [eax + 7]  // second alpha
   4417     punpcklbw  xmm0, xmm0       // first 2
   4418     movd       xmm2, dword ptr [ebx + esi * 4]
   4419     movd       xmm3, dword ptr [ebx + edi * 4]
   4420     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
   4421     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
   4422     movlhps    xmm2, xmm3
   4423     pmulhuw    xmm0, xmm2       // rgb * a
   4424 
   4425     movdqu     xmm1, [eax]      // read 4 pixels
   4426     movzx      esi, byte ptr [eax + 11]  // third alpha
    4427     movzx      edi, byte ptr [eax + 15]  // fourth alpha
   4428     punpckhbw  xmm1, xmm1       // next 2
   4429     movd       xmm2, dword ptr [ebx + esi * 4]
   4430     movd       xmm3, dword ptr [ebx + edi * 4]
   4431     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
   4432     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
   4433     movlhps    xmm2, xmm3
   4434     pmulhuw    xmm1, xmm2       // rgb * a
   4435     lea        eax, [eax + 16]
   4436     packuswb   xmm0, xmm1
   4437     movdqu     [edx], xmm0
   4438     lea        edx, [edx + 16]
   4439     sub        ecx, 4
   4440     jg         convertloop
   4441 
   4442     pop        edi
   4443     pop        esi
   4444     pop        ebx
   4445     ret
   4446   }
   4447 }
   4448 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
   4449 
   4450 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
   4451 // Shuffle table duplicating alpha.
   4452 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
   4453   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
   4454 };
   4455 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
   4456 // USE_GATHER is not on by default, due to being a slow instruction.
   4457 #ifdef USE_GATHER
   4458 __declspec(naked)
   4459 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
   4460                              int width) {
   4461   __asm {
   4462     mov        eax, [esp + 4]   // src_argb0
   4463     mov        edx, [esp + 8]   // dst_argb
   4464     mov        ecx, [esp + 12]  // width
   4465     sub        edx, eax
   4466     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
   4467 
   4468  convertloop:
   4469     vmovdqu    ymm6, [eax]       // read 8 pixels.
   4470     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
   4471     vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
   4472     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   4473     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   4474     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
   4475     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
   4476     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
   4477     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
   4478     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
   4479     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
   4480     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
   4481     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   4482     vmovdqu    [eax + edx], ymm0
   4483     lea        eax, [eax + 32]
   4484     sub        ecx, 8
   4485     jg         convertloop
   4486 
   4487     vzeroupper
   4488     ret
   4489   }
   4490 }
   4491 #else  // USE_GATHER
   4492 __declspec(naked)
   4493 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
   4494                              int width) {
   4495   __asm {
   4496 
   4497     push       ebx
   4498     push       esi
   4499     push       edi
   4500     mov        eax, [esp + 12 + 4]   // src_argb
   4501     mov        edx, [esp + 12 + 8]   // dst_argb
   4502     mov        ecx, [esp + 12 + 12]  // width
   4503     sub        edx, eax
   4504     lea        ebx, fixed_invtbl8
   4505     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
   4506 
   4507  convertloop:
   4508     // replace VPGATHER
   4509     movzx      esi, byte ptr [eax + 3]                 // alpha0
   4510     movzx      edi, byte ptr [eax + 7]                 // alpha1
   4511     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
   4512     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
   4513     movzx      esi, byte ptr [eax + 11]                // alpha2
   4514     movzx      edi, byte ptr [eax + 15]                // alpha3
   4515     vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
   4516     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
   4517     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
   4518     movzx      esi, byte ptr [eax + 19]                // alpha4
   4519     movzx      edi, byte ptr [eax + 23]                // alpha5
   4520     vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
   4521     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
   4522     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
   4523     movzx      esi, byte ptr [eax + 27]                // alpha6
   4524     movzx      edi, byte ptr [eax + 31]                // alpha7
   4525     vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
   4526     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
   4527     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
   4528     vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
   4529     vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
   4530     vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
   4531     vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
   4532     // end of VPGATHER
   4533 
   4534     vmovdqu    ymm6, [eax]       // read 8 pixels.
   4535     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
   4536     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
   4537     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
   4538     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
   4539     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
   4540     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
   4541     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
   4542     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
   4543     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
   4544     vmovdqu    [eax + edx], ymm0
   4545     lea        eax, [eax + 32]
   4546     sub        ecx, 8
   4547     jg         convertloop
   4548 
   4549     pop        edi
   4550     pop        esi
   4551     pop        ebx
   4552     vzeroupper
   4553     ret
   4554   }
   4555 }
   4556 #endif  // USE_GATHER
    4557 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
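
// Unattenuate is the inverse operation: divide each channel by alpha to undo
// premultiplication. The SSE2/AVX2 rows above avoid a per-pixel divide by
// looking up a fixed-point reciprocal in fixed_invtbl8. A scalar model with
// saturation (illustrative only, rounding not bit-exact):
static void ARGBUnattenuatePixel_Sketch(const uint8* src, uint8* dst) {
  int a = src[3];
  int i;
  for (i = 0; i < 3; ++i) {
    int v = a ? (src[i] * 255) / a : src[i];
    dst[i] = (uint8)(v > 255 ? 255 : v);
  }
  dst[3] = (uint8)a;
}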
   4558 
   4559 #ifdef HAS_ARGBGRAYROW_SSSE3
    4560 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
   4561 __declspec(naked)
   4562 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   4563   __asm {
   4564     mov        eax, [esp + 4]   /* src_argb */
   4565     mov        edx, [esp + 8]   /* dst_argb */
   4566     mov        ecx, [esp + 12]  /* width */
   4567     movdqa     xmm4, xmmword ptr kARGBToYJ
   4568     movdqa     xmm5, xmmword ptr kAddYJ64
   4569 
   4570  convertloop:
   4571     movdqu     xmm0, [eax]  // G
   4572     movdqu     xmm1, [eax + 16]
   4573     pmaddubsw  xmm0, xmm4
   4574     pmaddubsw  xmm1, xmm4
   4575     phaddw     xmm0, xmm1
   4576     paddw      xmm0, xmm5  // Add .5 for rounding.
   4577     psrlw      xmm0, 7
   4578     packuswb   xmm0, xmm0   // 8 G bytes
   4579     movdqu     xmm2, [eax]  // A
   4580     movdqu     xmm3, [eax + 16]
   4581     lea        eax, [eax + 32]
   4582     psrld      xmm2, 24
   4583     psrld      xmm3, 24
   4584     packuswb   xmm2, xmm3
   4585     packuswb   xmm2, xmm2   // 8 A bytes
   4586     movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
   4587     punpcklbw  xmm0, xmm0   // 8 GG words
   4588     punpcklbw  xmm3, xmm2   // 8 GA words
   4589     movdqa     xmm1, xmm0
   4590     punpcklwd  xmm0, xmm3   // GGGA first 4
   4591     punpckhwd  xmm1, xmm3   // GGGA next 4
   4592     movdqu     [edx], xmm0
   4593     movdqu     [edx + 16], xmm1
   4594     lea        edx, [edx + 32]
   4595     sub        ecx, 8
   4596     jg         convertloop
   4597     ret
   4598   }
   4599 }
   4600 #endif  // HAS_ARGBGRAYROW_SSSE3
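
// The gray row computes a single luma value and replicates it into B, G and
// R, keeping the original alpha. Assuming the JPEG-range weights behind
// kARGBToYJ (B=15, G=75, R=38, summing to 128, with +64 for rounding to match
// the shift-by-7 above), one pixel in scalar form:
static void ARGBGrayPixel_Sketch(const uint8* src, uint8* dst) {
  uint8 y = (uint8)((src[0] * 15 + src[1] * 75 + src[2] * 38 + 64) >> 7);
  dst[0] = dst[1] = dst[2] = y;  // the GGGA weave written by the loop above.
  dst[3] = src[3];
}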
   4601 
   4602 #ifdef HAS_ARGBSEPIAROW_SSSE3
   4603 //    b = (r * 35 + g * 68 + b * 17) >> 7
   4604 //    g = (r * 45 + g * 88 + b * 22) >> 7
   4605 //    r = (r * 50 + g * 98 + b * 24) >> 7
   4606 // Constant for ARGB color to sepia tone.
   4607 static const vec8 kARGBToSepiaB = {
   4608   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
   4609 };
   4610 
   4611 static const vec8 kARGBToSepiaG = {
   4612   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
   4613 };
   4614 
   4615 static const vec8 kARGBToSepiaR = {
   4616   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
   4617 };
   4618 
   4619 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
   4620 __declspec(naked)
   4621 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   4622   __asm {
   4623     mov        eax, [esp + 4]   /* dst_argb */
   4624     mov        ecx, [esp + 8]   /* width */
   4625     movdqa     xmm2, xmmword ptr kARGBToSepiaB
   4626     movdqa     xmm3, xmmword ptr kARGBToSepiaG
   4627     movdqa     xmm4, xmmword ptr kARGBToSepiaR
   4628 
   4629  convertloop:
   4630     movdqu     xmm0, [eax]  // B
   4631     movdqu     xmm6, [eax + 16]
   4632     pmaddubsw  xmm0, xmm2
   4633     pmaddubsw  xmm6, xmm2
   4634     phaddw     xmm0, xmm6
   4635     psrlw      xmm0, 7
   4636     packuswb   xmm0, xmm0   // 8 B values
   4637     movdqu     xmm5, [eax]  // G
   4638     movdqu     xmm1, [eax + 16]
   4639     pmaddubsw  xmm5, xmm3
   4640     pmaddubsw  xmm1, xmm3
   4641     phaddw     xmm5, xmm1
   4642     psrlw      xmm5, 7
   4643     packuswb   xmm5, xmm5   // 8 G values
   4644     punpcklbw  xmm0, xmm5   // 8 BG values
   4645     movdqu     xmm5, [eax]  // R
   4646     movdqu     xmm1, [eax + 16]
   4647     pmaddubsw  xmm5, xmm4
   4648     pmaddubsw  xmm1, xmm4
   4649     phaddw     xmm5, xmm1
   4650     psrlw      xmm5, 7
   4651     packuswb   xmm5, xmm5   // 8 R values
   4652     movdqu     xmm6, [eax]  // A
   4653     movdqu     xmm1, [eax + 16]
   4654     psrld      xmm6, 24
   4655     psrld      xmm1, 24
   4656     packuswb   xmm6, xmm1
   4657     packuswb   xmm6, xmm6   // 8 A values
   4658     punpcklbw  xmm5, xmm6   // 8 RA values
   4659     movdqa     xmm1, xmm0   // Weave BG, RA together
   4660     punpcklwd  xmm0, xmm5   // BGRA first 4
   4661     punpckhwd  xmm1, xmm5   // BGRA next 4
   4662     movdqu     [eax], xmm0
   4663     movdqu     [eax + 16], xmm1
   4664     lea        eax, [eax + 32]
   4665     sub        ecx, 8
   4666     jg         convertloop
   4667     ret
   4668   }
   4669 }
   4670 #endif  // HAS_ARGBSEPIAROW_SSSE3
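
// Scalar model of the sepia transform, using the same coefficients as
// kARGBToSepiaB/G/R and the saturation provided by packuswb; applied in
// place like the SSSE3 row (illustrative helper):
static void ARGBSepiaPixel_Sketch(uint8* p) {
  int b = p[0], g = p[1], r = p[2];
  int nb = (r * 35 + g * 68 + b * 17) >> 7;
  int ng = (r * 45 + g * 88 + b * 22) >> 7;
  int nr = (r * 50 + g * 98 + b * 24) >> 7;
  p[0] = (uint8)(nb > 255 ? 255 : nb);
  p[1] = (uint8)(ng > 255 ? 255 : ng);
  p[2] = (uint8)(nr > 255 ? 255 : nr);
  // p[3] (alpha) is preserved, as in the SIMD version.
}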
   4671 
   4672 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
    4673 // Transform 8 ARGB pixels (32 bytes) with color matrix.
   4674 // Same as Sepia except matrix is provided.
    4675 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
   4676 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
   4677 __declspec(naked)
   4678 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
   4679                               const int8* matrix_argb, int width) {
   4680   __asm {
   4681     mov        eax, [esp + 4]   /* src_argb */
   4682     mov        edx, [esp + 8]   /* dst_argb */
   4683     mov        ecx, [esp + 12]  /* matrix_argb */
   4684     movdqu     xmm5, [ecx]
   4685     pshufd     xmm2, xmm5, 0x00
   4686     pshufd     xmm3, xmm5, 0x55
   4687     pshufd     xmm4, xmm5, 0xaa
   4688     pshufd     xmm5, xmm5, 0xff
   4689     mov        ecx, [esp + 16]  /* width */
   4690 
   4691  convertloop:
   4692     movdqu     xmm0, [eax]  // B
   4693     movdqu     xmm7, [eax + 16]
   4694     pmaddubsw  xmm0, xmm2
   4695     pmaddubsw  xmm7, xmm2
   4696     movdqu     xmm6, [eax]  // G
   4697     movdqu     xmm1, [eax + 16]
   4698     pmaddubsw  xmm6, xmm3
   4699     pmaddubsw  xmm1, xmm3
   4700     phaddsw    xmm0, xmm7   // B
   4701     phaddsw    xmm6, xmm1   // G
   4702     psraw      xmm0, 6      // B
   4703     psraw      xmm6, 6      // G
   4704     packuswb   xmm0, xmm0   // 8 B values
   4705     packuswb   xmm6, xmm6   // 8 G values
   4706     punpcklbw  xmm0, xmm6   // 8 BG values
   4707     movdqu     xmm1, [eax]  // R
   4708     movdqu     xmm7, [eax + 16]
   4709     pmaddubsw  xmm1, xmm4
   4710     pmaddubsw  xmm7, xmm4
   4711     phaddsw    xmm1, xmm7   // R
   4712     movdqu     xmm6, [eax]  // A
   4713     movdqu     xmm7, [eax + 16]
   4714     pmaddubsw  xmm6, xmm5
   4715     pmaddubsw  xmm7, xmm5
   4716     phaddsw    xmm6, xmm7   // A
   4717     psraw      xmm1, 6      // R
   4718     psraw      xmm6, 6      // A
   4719     packuswb   xmm1, xmm1   // 8 R values
   4720     packuswb   xmm6, xmm6   // 8 A values
   4721     punpcklbw  xmm1, xmm6   // 8 RA values
   4722     movdqa     xmm6, xmm0   // Weave BG, RA together
   4723     punpcklwd  xmm0, xmm1   // BGRA first 4
   4724     punpckhwd  xmm6, xmm1   // BGRA next 4
   4725     movdqu     [edx], xmm0
   4726     movdqu     [edx + 16], xmm6
   4727     lea        eax, [eax + 32]
   4728     lea        edx, [edx + 32]
   4729     sub        ecx, 8
   4730     jg         convertloop
   4731     ret
   4732   }
   4733 }
   4734 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
   4735 
   4736 #ifdef HAS_ARGBQUANTIZEROW_SSE2
   4737 // Quantize 4 ARGB pixels (16 bytes).
   4738 __declspec(naked)
   4739 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
   4740                           int interval_offset, int width) {
   4741   __asm {
   4742     mov        eax, [esp + 4]    /* dst_argb */
   4743     movd       xmm2, [esp + 8]   /* scale */
   4744     movd       xmm3, [esp + 12]  /* interval_size */
   4745     movd       xmm4, [esp + 16]  /* interval_offset */
   4746     mov        ecx, [esp + 20]   /* width */
   4747     pshuflw    xmm2, xmm2, 040h
   4748     pshufd     xmm2, xmm2, 044h
   4749     pshuflw    xmm3, xmm3, 040h
   4750     pshufd     xmm3, xmm3, 044h
   4751     pshuflw    xmm4, xmm4, 040h
   4752     pshufd     xmm4, xmm4, 044h
   4753     pxor       xmm5, xmm5  // constant 0
   4754     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
   4755     pslld      xmm6, 24
   4756 
   4757  convertloop:
   4758     movdqu     xmm0, [eax]  // read 4 pixels
   4759     punpcklbw  xmm0, xmm5   // first 2 pixels
   4760     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
   4761     movdqu     xmm1, [eax]  // read 4 pixels
   4762     punpckhbw  xmm1, xmm5   // next 2 pixels
   4763     pmulhuw    xmm1, xmm2
   4764     pmullw     xmm0, xmm3   // * interval_size
   4765     movdqu     xmm7, [eax]  // read 4 pixels
   4766     pmullw     xmm1, xmm3
   4767     pand       xmm7, xmm6   // mask alpha
    4768     paddw      xmm0, xmm4   // + interval_offset (usually interval_size / 2)
   4769     paddw      xmm1, xmm4
   4770     packuswb   xmm0, xmm1
   4771     por        xmm0, xmm7
   4772     movdqu     [eax], xmm0
   4773     lea        eax, [eax + 16]
   4774     sub        ecx, 4
   4775     jg         convertloop
   4776     ret
   4777   }
   4778 }
   4779 #endif  // HAS_ARGBQUANTIZEROW_SSE2
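
// Scalar model of the quantize step above: map each channel to a bucket index
// with the precomputed scale, then back to index * interval_size +
// interval_offset. Alpha is restored from the original pixel (illustrative):
static void ARGBQuantizePixel_Sketch(uint8* p, int scale, int interval_size,
                                     int interval_offset) {
  int i;
  for (i = 0; i < 3; ++i) {
    p[i] = (uint8)(((p[i] * scale) >> 16) * interval_size + interval_offset);
  }
  // p[3] (alpha) is left untouched, matching the pand/por above.
}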
   4780 
   4781 #ifdef HAS_ARGBSHADEROW_SSE2
   4782 // Shade 4 pixels at a time by specified value.
   4783 __declspec(naked)
   4784 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
   4785                        uint32 value) {
   4786   __asm {
   4787     mov        eax, [esp + 4]   // src_argb
   4788     mov        edx, [esp + 8]   // dst_argb
   4789     mov        ecx, [esp + 12]  // width
   4790     movd       xmm2, [esp + 16]  // value
   4791     punpcklbw  xmm2, xmm2
   4792     punpcklqdq xmm2, xmm2
   4793 
   4794  convertloop:
   4795     movdqu     xmm0, [eax]      // read 4 pixels
   4796     lea        eax, [eax + 16]
   4797     movdqa     xmm1, xmm0
   4798     punpcklbw  xmm0, xmm0       // first 2
   4799     punpckhbw  xmm1, xmm1       // next 2
   4800     pmulhuw    xmm0, xmm2       // argb * value
   4801     pmulhuw    xmm1, xmm2       // argb * value
   4802     psrlw      xmm0, 8
   4803     psrlw      xmm1, 8
   4804     packuswb   xmm0, xmm1
   4805     movdqu     [edx], xmm0
   4806     lea        edx, [edx + 16]
   4807     sub        ecx, 4
   4808     jg         convertloop
   4809 
   4810     ret
   4811   }
   4812 }
   4813 #endif  // HAS_ARGBSHADEROW_SSE2
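
// Shade multiplies every channel (B, G, R and A) by the matching byte of
// 'value'. The duplicated-byte pmulhuw sequence above computes roughly
// v * c / 255; one channel in scalar form (illustrative):
static uint8 ShadeChannel_Sketch(uint8 v, uint8 c) {
  // Both operands are widened by byte duplication (x * 257), multiplied,
  // and the top 8 bits kept: (v*257 * c*257) >> 24.
  return (uint8)(((uint32)(v * 257) * (uint32)(c * 257)) >> 24);
}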
   4814 
   4815 #ifdef HAS_ARGBMULTIPLYROW_SSE2
   4816 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
   4817 __declspec(naked)
   4818 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   4819                           uint8* dst_argb, int width) {
   4820   __asm {
   4821     push       esi
   4822     mov        eax, [esp + 4 + 4]   // src_argb0
   4823     mov        esi, [esp + 4 + 8]   // src_argb1
   4824     mov        edx, [esp + 4 + 12]  // dst_argb
   4825     mov        ecx, [esp + 4 + 16]  // width
   4826     pxor       xmm5, xmm5  // constant 0
   4827 
   4828  convertloop:
   4829     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
   4830     movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
    4831     movdqa     xmm1, xmm0
    4832     movdqa     xmm3, xmm2
   4833     punpcklbw  xmm0, xmm0         // first 2
   4834     punpckhbw  xmm1, xmm1         // next 2
   4835     punpcklbw  xmm2, xmm5         // first 2
   4836     punpckhbw  xmm3, xmm5         // next 2
   4837     pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
   4838     pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
   4839     lea        eax, [eax + 16]
   4840     lea        esi, [esi + 16]
   4841     packuswb   xmm0, xmm1
   4842     movdqu     [edx], xmm0
   4843     lea        edx, [edx + 16]
   4844     sub        ecx, 4
   4845     jg         convertloop
   4846 
   4847     pop        esi
   4848     ret
   4849   }
   4850 }
   4851 #endif  // HAS_ARGBMULTIPLYROW_SSE2
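
// The multiply rows (SSE2 above, AVX2 below) approximate s0 * s1 / 255 per
// channel: src_argb0 is widened by duplicating each byte (v * 257) and
// pmulhuw keeps the top 16 bits, i.e. (s0 * 257 * s1) >> 16, which is within
// one step of the exact result. Scalar form (illustrative):
static uint8 MultiplyChannel_Sketch(uint8 s0, uint8 s1) {
  return (uint8)((s0 * 257 * s1) >> 16);
}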
   4852 
   4853 #ifdef HAS_ARGBADDROW_SSE2
   4854 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
   4855 // TODO(fbarchard): Port this to posix, neon and other math functions.
   4856 __declspec(naked)
   4857 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   4858                      uint8* dst_argb, int width) {
   4859   __asm {
   4860     push       esi
   4861     mov        eax, [esp + 4 + 4]   // src_argb0
   4862     mov        esi, [esp + 4 + 8]   // src_argb1
   4863     mov        edx, [esp + 4 + 12]  // dst_argb
   4864     mov        ecx, [esp + 4 + 16]  // width
   4865 
   4866     sub        ecx, 4
   4867     jl         convertloop49
   4868 
   4869  convertloop4:
   4870     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
   4871     lea        eax, [eax + 16]
   4872     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
   4873     lea        esi, [esi + 16]
   4874     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
   4875     movdqu     [edx], xmm0
   4876     lea        edx, [edx + 16]
   4877     sub        ecx, 4
   4878     jge        convertloop4
   4879 
   4880  convertloop49:
   4881     add        ecx, 4 - 1
   4882     jl         convertloop19
   4883 
   4884  convertloop1:
    4885     movd       xmm0, [eax]        // read 1 pixel from src_argb0
    4886     lea        eax, [eax + 4]
    4887     movd       xmm1, [esi]        // read 1 pixel from src_argb1
   4888     lea        esi, [esi + 4]
   4889     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
   4890     movd       [edx], xmm0
   4891     lea        edx, [edx + 4]
   4892     sub        ecx, 1
   4893     jge        convertloop1
   4894 
   4895  convertloop19:
   4896     pop        esi
   4897     ret
   4898   }
   4899 }
   4900 #endif  // HAS_ARGBADDROW_SSE2
   4901 
   4902 #ifdef HAS_ARGBSUBTRACTROW_SSE2
    4903 // Subtract one row of ARGB pixels from another, 4 pixels at a time.
   4904 __declspec(naked)
   4905 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   4906                           uint8* dst_argb, int width) {
   4907   __asm {
   4908     push       esi
   4909     mov        eax, [esp + 4 + 4]   // src_argb0
   4910     mov        esi, [esp + 4 + 8]   // src_argb1
   4911     mov        edx, [esp + 4 + 12]  // dst_argb
   4912     mov        ecx, [esp + 4 + 16]  // width
   4913 
   4914  convertloop:
   4915     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
   4916     lea        eax, [eax + 16]
   4917     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
   4918     lea        esi, [esi + 16]
   4919     psubusb    xmm0, xmm1         // src_argb0 - src_argb1
   4920     movdqu     [edx], xmm0
   4921     lea        edx, [edx + 16]
   4922     sub        ecx, 4
   4923     jg         convertloop
   4924 
   4925     pop        esi
   4926     ret
   4927   }
   4928 }
   4929 #endif  // HAS_ARGBSUBTRACTROW_SSE2
   4930 
   4931 #ifdef HAS_ARGBMULTIPLYROW_AVX2
   4932 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
   4933 __declspec(naked)
   4934 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
   4935                           uint8* dst_argb, int width) {
   4936   __asm {
   4937     push       esi
   4938     mov        eax, [esp + 4 + 4]   // src_argb0
   4939     mov        esi, [esp + 4 + 8]   // src_argb1
   4940     mov        edx, [esp + 4 + 12]  // dst_argb
   4941     mov        ecx, [esp + 4 + 16]  // width
   4942     vpxor      ymm5, ymm5, ymm5     // constant 0
   4943 
   4944  convertloop:
   4945     vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
   4946     lea        eax, [eax + 32]
   4947     vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
   4948     lea        esi, [esi + 32]
   4949     vpunpcklbw ymm0, ymm1, ymm1   // low 4
   4950     vpunpckhbw ymm1, ymm1, ymm1   // high 4
   4951     vpunpcklbw ymm2, ymm3, ymm5   // low 4
   4952     vpunpckhbw ymm3, ymm3, ymm5   // high 4
   4953     vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
   4954     vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
   4955     vpackuswb  ymm0, ymm0, ymm1
   4956     vmovdqu    [edx], ymm0
   4957     lea        edx, [edx + 32]
   4958     sub        ecx, 8
   4959     jg         convertloop
   4960 
   4961     pop        esi
   4962     vzeroupper
   4963     ret
   4964   }
   4965 }
   4966 #endif  // HAS_ARGBMULTIPLYROW_AVX2
   4967 
   4968 #ifdef HAS_ARGBADDROW_AVX2
   4969 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
   4970 __declspec(naked)
   4971 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
   4972                      uint8* dst_argb, int width) {
   4973   __asm {
   4974     push       esi
   4975     mov        eax, [esp + 4 + 4]   // src_argb0
   4976     mov        esi, [esp + 4 + 8]   // src_argb1
   4977     mov        edx, [esp + 4 + 12]  // dst_argb
   4978     mov        ecx, [esp + 4 + 16]  // width
   4979 
   4980  convertloop:
   4981     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
   4982     lea        eax, [eax + 32]
   4983     vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
   4984     lea        esi, [esi + 32]
   4985     vmovdqu    [edx], ymm0
   4986     lea        edx, [edx + 32]
   4987     sub        ecx, 8
   4988     jg         convertloop
   4989 
   4990     pop        esi
   4991     vzeroupper
   4992     ret
   4993   }
   4994 }
   4995 #endif  // HAS_ARGBADDROW_AVX2
   4996 
   4997 #ifdef HAS_ARGBSUBTRACTROW_AVX2
    4998 // Subtract one row of ARGB pixels from another, 8 pixels at a time.
   4999 __declspec(naked)
   5000 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
   5001                           uint8* dst_argb, int width) {
   5002   __asm {
   5003     push       esi
   5004     mov        eax, [esp + 4 + 4]   // src_argb0
   5005     mov        esi, [esp + 4 + 8]   // src_argb1
   5006     mov        edx, [esp + 4 + 12]  // dst_argb
   5007     mov        ecx, [esp + 4 + 16]  // width
   5008 
   5009  convertloop:
   5010     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
   5011     lea        eax, [eax + 32]
   5012     vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
   5013     lea        esi, [esi + 32]
   5014     vmovdqu    [edx], ymm0
   5015     lea        edx, [edx + 32]
   5016     sub        ecx, 8
   5017     jg         convertloop
   5018 
   5019     pop        esi
   5020     vzeroupper
   5021     ret
   5022   }
   5023 }
   5024 #endif  // HAS_ARGBSUBTRACTROW_AVX2
   5025 
   5026 #ifdef HAS_SOBELXROW_SSE2
   5027 // SobelX as a matrix is
   5028 // -1  0  1
   5029 // -2  0  2
   5030 // -1  0  1
   5031 __declspec(naked)
   5032 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
   5033                     const uint8* src_y2, uint8* dst_sobelx, int width) {
   5034   __asm {
   5035     push       esi
   5036     push       edi
   5037     mov        eax, [esp + 8 + 4]   // src_y0
   5038     mov        esi, [esp + 8 + 8]   // src_y1
   5039     mov        edi, [esp + 8 + 12]  // src_y2
   5040     mov        edx, [esp + 8 + 16]  // dst_sobelx
   5041     mov        ecx, [esp + 8 + 20]  // width
   5042     sub        esi, eax
   5043     sub        edi, eax
   5044     sub        edx, eax
   5045     pxor       xmm5, xmm5  // constant 0
   5046 
   5047  convertloop:
   5048     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
   5049     movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
   5050     punpcklbw  xmm0, xmm5
   5051     punpcklbw  xmm1, xmm5
   5052     psubw      xmm0, xmm1
   5053     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
   5054     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
   5055     punpcklbw  xmm1, xmm5
   5056     punpcklbw  xmm2, xmm5
   5057     psubw      xmm1, xmm2
   5058     movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
   5059     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
   5060     punpcklbw  xmm2, xmm5
   5061     punpcklbw  xmm3, xmm5
   5062     psubw      xmm2, xmm3
   5063     paddw      xmm0, xmm2
   5064     paddw      xmm0, xmm1
   5065     paddw      xmm0, xmm1
   5066     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
   5067     psubw      xmm1, xmm0
   5068     pmaxsw     xmm0, xmm1
   5069     packuswb   xmm0, xmm0
   5070     movq       qword ptr [eax + edx], xmm0
   5071     lea        eax, [eax + 8]
   5072     sub        ecx, 8
   5073     jg         convertloop
   5074 
   5075     pop        edi
   5076     pop        esi
   5077     ret
   5078   }
   5079 }
   5080 #endif  // HAS_SOBELXROW_SSE2
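
// Scalar model of SobelX above: a horizontal difference over three rows with
// the middle row weighted twice, absolute value, then saturated to 8 bits.
// The sign convention is irrelevant because of the abs; SobelY below is the
// analogous vertical filter. Illustrative helper:
static void SobelXRow_Sketch(const uint8* y0, const uint8* y1,
                             const uint8* y2, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
            (y2[i] - y2[i + 2]);
    if (s < 0) s = -s;
    dst[i] = (uint8)(s > 255 ? 255 : s);
  }
}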
   5081 
   5082 #ifdef HAS_SOBELYROW_SSE2
   5083 // SobelY as a matrix is
   5084 // -1 -2 -1
   5085 //  0  0  0
   5086 //  1  2  1
   5087 __declspec(naked)
   5088 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
   5089                     uint8* dst_sobely, int width) {
   5090   __asm {
   5091     push       esi
   5092     mov        eax, [esp + 4 + 4]   // src_y0
   5093     mov        esi, [esp + 4 + 8]   // src_y1
   5094     mov        edx, [esp + 4 + 12]  // dst_sobely
   5095     mov        ecx, [esp + 4 + 16]  // width
   5096     sub        esi, eax
   5097     sub        edx, eax
   5098     pxor       xmm5, xmm5  // constant 0
   5099 
   5100  convertloop:
   5101     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
   5102     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
   5103     punpcklbw  xmm0, xmm5
   5104     punpcklbw  xmm1, xmm5
   5105     psubw      xmm0, xmm1
   5106     movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
   5107     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
   5108     punpcklbw  xmm1, xmm5
   5109     punpcklbw  xmm2, xmm5
   5110     psubw      xmm1, xmm2
   5111     movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
   5112     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
   5113     punpcklbw  xmm2, xmm5
   5114     punpcklbw  xmm3, xmm5
   5115     psubw      xmm2, xmm3
   5116     paddw      xmm0, xmm2
   5117     paddw      xmm0, xmm1
   5118     paddw      xmm0, xmm1
   5119     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
   5120     psubw      xmm1, xmm0
   5121     pmaxsw     xmm0, xmm1
   5122     packuswb   xmm0, xmm0
   5123     movq       qword ptr [eax + edx], xmm0
   5124     lea        eax, [eax + 8]
   5125     sub        ecx, 8
   5126     jg         convertloop
   5127 
   5128     pop        esi
   5129     ret
   5130   }
   5131 }
   5132 #endif  // HAS_SOBELYROW_SSE2
   5133 
   5134 #ifdef HAS_SOBELROW_SSE2
   5135 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
   5136 // A = 255
   5137 // R = Sobel
   5138 // G = Sobel
   5139 // B = Sobel
   5140 __declspec(naked)
   5141 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
   5142                    uint8* dst_argb, int width) {
   5143   __asm {
   5144     push       esi
   5145     mov        eax, [esp + 4 + 4]   // src_sobelx
   5146     mov        esi, [esp + 4 + 8]   // src_sobely
   5147     mov        edx, [esp + 4 + 12]  // dst_argb
   5148     mov        ecx, [esp + 4 + 16]  // width
   5149     sub        esi, eax
   5150     pcmpeqb    xmm5, xmm5           // alpha 255
   5151     pslld      xmm5, 24             // 0xff000000
   5152 
   5153  convertloop:
   5154     movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
   5155     movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
   5156     lea        eax, [eax + 16]
   5157     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
   5158     movdqa     xmm2, xmm0             // GG
   5159     punpcklbw  xmm2, xmm0             // First 8
   5160     punpckhbw  xmm0, xmm0             // Next 8
   5161     movdqa     xmm1, xmm2             // GGGG
   5162     punpcklwd  xmm1, xmm2             // First 4
   5163     punpckhwd  xmm2, xmm2             // Next 4
   5164     por        xmm1, xmm5             // GGGA
   5165     por        xmm2, xmm5
   5166     movdqa     xmm3, xmm0             // GGGG
   5167     punpcklwd  xmm3, xmm0             // Next 4
   5168     punpckhwd  xmm0, xmm0             // Last 4
   5169     por        xmm3, xmm5             // GGGA
   5170     por        xmm0, xmm5
   5171     movdqu     [edx], xmm1
   5172     movdqu     [edx + 16], xmm2
   5173     movdqu     [edx + 32], xmm3
   5174     movdqu     [edx + 48], xmm0
   5175     lea        edx, [edx + 64]
   5176     sub        ecx, 16
   5177     jg         convertloop
   5178 
   5179     pop        esi
   5180     ret
   5181   }
   5182 }
   5183 #endif  // HAS_SOBELROW_SSE2
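
// The ARGB packing above replicates the combined Sobel value into B, G and R
// with opaque alpha; one pixel in scalar form (illustrative):
static uint32 SobelPixel_Sketch(uint8 sobelx, uint8 sobely) {
  int s = sobelx + sobely;
  uint8 g = (uint8)(s > 255 ? 255 : s);  // paddusb saturates the same way.
  return 0xff000000u | ((uint32)g << 16) | ((uint32)g << 8) | g;
}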
   5184 
   5185 #ifdef HAS_SOBELTOPLANEROW_SSE2
   5186 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
   5187 __declspec(naked)
   5188 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
   5189                           uint8* dst_y, int width) {
   5190   __asm {
   5191     push       esi
   5192     mov        eax, [esp + 4 + 4]   // src_sobelx
   5193     mov        esi, [esp + 4 + 8]   // src_sobely
    5194     mov        edx, [esp + 4 + 12]  // dst_y
   5195     mov        ecx, [esp + 4 + 16]  // width
   5196     sub        esi, eax
   5197 
   5198  convertloop:
   5199     movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
   5200     movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
   5201     lea        eax, [eax + 16]
   5202     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
   5203     movdqu     [edx], xmm0
   5204     lea        edx, [edx + 16]
   5205     sub        ecx, 16
   5206     jg         convertloop
   5207 
   5208     pop        esi
   5209     ret
   5210   }
   5211 }
   5212 #endif  // HAS_SOBELTOPLANEROW_SSE2
   5213 
   5214 #ifdef HAS_SOBELXYROW_SSE2
   5215 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
   5216 // A = 255
   5217 // R = Sobel X
   5218 // G = Sobel
   5219 // B = Sobel Y
   5220 __declspec(naked)
   5221 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
   5222                      uint8* dst_argb, int width) {
   5223   __asm {
   5224     push       esi
   5225     mov        eax, [esp + 4 + 4]   // src_sobelx
   5226     mov        esi, [esp + 4 + 8]   // src_sobely
   5227     mov        edx, [esp + 4 + 12]  // dst_argb
   5228     mov        ecx, [esp + 4 + 16]  // width
   5229     sub        esi, eax
   5230     pcmpeqb    xmm5, xmm5           // alpha 255
   5231 
   5232  convertloop:
   5233     movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
   5234     movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
   5235     lea        eax, [eax + 16]
   5236     movdqa     xmm2, xmm0
   5237     paddusb    xmm2, xmm1             // sobel = sobelx + sobely
   5238     movdqa     xmm3, xmm0             // XA
   5239     punpcklbw  xmm3, xmm5
   5240     punpckhbw  xmm0, xmm5
   5241     movdqa     xmm4, xmm1             // YS
   5242     punpcklbw  xmm4, xmm2
   5243     punpckhbw  xmm1, xmm2
   5244     movdqa     xmm6, xmm4             // YSXA
   5245     punpcklwd  xmm6, xmm3             // First 4
   5246     punpckhwd  xmm4, xmm3             // Next 4
   5247     movdqa     xmm7, xmm1             // YSXA
   5248     punpcklwd  xmm7, xmm0             // Next 4
   5249     punpckhwd  xmm1, xmm0             // Last 4
   5250     movdqu     [edx], xmm6
   5251     movdqu     [edx + 16], xmm4
   5252     movdqu     [edx + 32], xmm7
   5253     movdqu     [edx + 48], xmm1
   5254     lea        edx, [edx + 64]
   5255     sub        ecx, 16
   5256     jg         convertloop
   5257 
   5258     pop        esi
   5259     ret
   5260   }
   5261 }
   5262 #endif  // HAS_SOBELXYROW_SSE2
   5263 
   5264 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
   5265 // Consider float CumulativeSum.
    5266 // Consider calling CumulativeSum one row at a time as needed.
   5267 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
   5268 // Convert cumulative sum for an area to an average for 1 pixel.
   5269 // topleft is pointer to top left of CumulativeSum buffer for area.
   5270 // botleft is pointer to bottom left of CumulativeSum buffer.
   5271 // width is offset from left to right of area in CumulativeSum buffer measured
   5272 //   in number of ints.
   5273 // area is the number of pixels in the area being averaged.
   5274 // dst points to pixel to store result to.
   5275 // count is number of averaged pixels to produce.
   5276 // Does 4 pixels at a time.
   5277 // This function requires alignment on accumulation buffer pointers.
   5278 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
   5279                                     int width, int area, uint8* dst,
   5280                                     int count) {
   5281   __asm {
   5282     mov        eax, topleft  // eax topleft
   5283     mov        esi, botleft  // esi botleft
   5284     mov        edx, width
   5285     movd       xmm5, area
   5286     mov        edi, dst
   5287     mov        ecx, count
   5288     cvtdq2ps   xmm5, xmm5
   5289     rcpss      xmm4, xmm5  // 1.0f / area
   5290     pshufd     xmm4, xmm4, 0
   5291     sub        ecx, 4
   5292     jl         l4b
   5293 
   5294     cmp        area, 128  // 128 pixels will not overflow 15 bits.
   5295     ja         l4
   5296 
   5297     pshufd     xmm5, xmm5, 0        // area
   5298     pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
   5299     psrld      xmm6, 16
   5300     cvtdq2ps   xmm6, xmm6
   5301     addps      xmm5, xmm6           // (65536.0 + area - 1)
   5302     mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
   5303     cvtps2dq   xmm5, xmm5           // 0.16 fixed point
   5304     packssdw   xmm5, xmm5           // 16 bit shorts
   5305 
   5306     // 4 pixel loop small blocks.
   5307   s4:
   5308     // top left
   5309     movdqu     xmm0, [eax]
   5310     movdqu     xmm1, [eax + 16]
   5311     movdqu     xmm2, [eax + 32]
   5312     movdqu     xmm3, [eax + 48]
   5313 
   5314     // - top right
   5315     psubd      xmm0, [eax + edx * 4]
   5316     psubd      xmm1, [eax + edx * 4 + 16]
   5317     psubd      xmm2, [eax + edx * 4 + 32]
   5318     psubd      xmm3, [eax + edx * 4 + 48]
   5319     lea        eax, [eax + 64]
   5320 
   5321     // - bottom left
   5322     psubd      xmm0, [esi]
   5323     psubd      xmm1, [esi + 16]
   5324     psubd      xmm2, [esi + 32]
   5325     psubd      xmm3, [esi + 48]
   5326 
   5327     // + bottom right
   5328     paddd      xmm0, [esi + edx * 4]
   5329     paddd      xmm1, [esi + edx * 4 + 16]
   5330     paddd      xmm2, [esi + edx * 4 + 32]
   5331     paddd      xmm3, [esi + edx * 4 + 48]
   5332     lea        esi, [esi + 64]
   5333 
   5334     packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
   5335     packssdw   xmm2, xmm3
   5336 
   5337     pmulhuw    xmm0, xmm5
   5338     pmulhuw    xmm2, xmm5
   5339 
   5340     packuswb   xmm0, xmm2
   5341     movdqu     [edi], xmm0
   5342     lea        edi, [edi + 16]
   5343     sub        ecx, 4
   5344     jge        s4
   5345 
   5346     jmp        l4b
   5347 
   5348     // 4 pixel loop
   5349   l4:
   5350     // top left
   5351     movdqu     xmm0, [eax]
   5352     movdqu     xmm1, [eax + 16]
   5353     movdqu     xmm2, [eax + 32]
   5354     movdqu     xmm3, [eax + 48]
   5355 
   5356     // - top right
   5357     psubd      xmm0, [eax + edx * 4]
   5358     psubd      xmm1, [eax + edx * 4 + 16]
   5359     psubd      xmm2, [eax + edx * 4 + 32]
   5360     psubd      xmm3, [eax + edx * 4 + 48]
   5361     lea        eax, [eax + 64]
   5362 
   5363     // - bottom left
   5364     psubd      xmm0, [esi]
   5365     psubd      xmm1, [esi + 16]
   5366     psubd      xmm2, [esi + 32]
   5367     psubd      xmm3, [esi + 48]
   5368 
   5369     // + bottom right
   5370     paddd      xmm0, [esi + edx * 4]
   5371     paddd      xmm1, [esi + edx * 4 + 16]
   5372     paddd      xmm2, [esi + edx * 4 + 32]
   5373     paddd      xmm3, [esi + edx * 4 + 48]
   5374     lea        esi, [esi + 64]
   5375 
   5376     cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
   5377     cvtdq2ps   xmm1, xmm1
   5378     mulps      xmm0, xmm4
   5379     mulps      xmm1, xmm4
   5380     cvtdq2ps   xmm2, xmm2
   5381     cvtdq2ps   xmm3, xmm3
   5382     mulps      xmm2, xmm4
   5383     mulps      xmm3, xmm4
   5384     cvtps2dq   xmm0, xmm0
   5385     cvtps2dq   xmm1, xmm1
   5386     cvtps2dq   xmm2, xmm2
   5387     cvtps2dq   xmm3, xmm3
   5388     packssdw   xmm0, xmm1
   5389     packssdw   xmm2, xmm3
   5390     packuswb   xmm0, xmm2
   5391     movdqu     [edi], xmm0
   5392     lea        edi, [edi + 16]
   5393     sub        ecx, 4
   5394     jge        l4
   5395 
   5396   l4b:
   5397     add        ecx, 4 - 1
   5398     jl         l1b
   5399 
   5400     // 1 pixel loop
   5401   l1:
   5402     movdqu     xmm0, [eax]
   5403     psubd      xmm0, [eax + edx * 4]
   5404     lea        eax, [eax + 16]
   5405     psubd      xmm0, [esi]
   5406     paddd      xmm0, [esi + edx * 4]
   5407     lea        esi, [esi + 16]
   5408     cvtdq2ps   xmm0, xmm0
   5409     mulps      xmm0, xmm4
   5410     cvtps2dq   xmm0, xmm0
   5411     packssdw   xmm0, xmm0
   5412     packuswb   xmm0, xmm0
   5413     movd       dword ptr [edi], xmm0
   5414     lea        edi, [edi + 4]
   5415     sub        ecx, 1
   5416     jge        l1
   5417   l1b:
   5418   }
   5419 }
   5420 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
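
// The averaging loops above rely on the integral-image identity: with the
// pointer/width convention documented above, the sum over a rectangle is
// topleft - topright - botleft + botright, scaled by 1 / area. One output
// channel in scalar form, with 'width' in int32 units; the float rounding of
// the SIMD path is replaced by integer truncation here (illustrative):
static uint8 AreaAverage_Sketch(const int32* topleft, const int32* botleft,
                                int width, int area) {
  int32 sum = topleft[0] - topleft[width] - botleft[0] + botleft[width];
  return (uint8)(sum / area);
}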
   5421 
   5422 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
   5423 // Creates a table of cumulative sums where each value is a sum of all values
   5424 // above and to the left of the value.
   5425 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
   5426                                   const int32* previous_cumsum, int width) {
   5427   __asm {
   5428     mov        eax, row
   5429     mov        edx, cumsum
   5430     mov        esi, previous_cumsum
   5431     mov        ecx, width
   5432     pxor       xmm0, xmm0
   5433     pxor       xmm1, xmm1
   5434 
   5435     sub        ecx, 4
   5436     jl         l4b
   5437     test       edx, 15
   5438     jne        l4b
   5439 
   5440     // 4 pixel loop
   5441   l4:
   5442     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
   5443     lea        eax, [eax + 16]
   5444     movdqa     xmm4, xmm2
   5445 
   5446     punpcklbw  xmm2, xmm1
   5447     movdqa     xmm3, xmm2
   5448     punpcklwd  xmm2, xmm1
   5449     punpckhwd  xmm3, xmm1
   5450 
   5451     punpckhbw  xmm4, xmm1
   5452     movdqa     xmm5, xmm4
   5453     punpcklwd  xmm4, xmm1
   5454     punpckhwd  xmm5, xmm1
   5455 
   5456     paddd      xmm0, xmm2
   5457     movdqu     xmm2, [esi]  // previous row above.
   5458     paddd      xmm2, xmm0
   5459 
   5460     paddd      xmm0, xmm3
   5461     movdqu     xmm3, [esi + 16]
   5462     paddd      xmm3, xmm0
   5463 
   5464     paddd      xmm0, xmm4
   5465     movdqu     xmm4, [esi + 32]
   5466     paddd      xmm4, xmm0
   5467 
   5468     paddd      xmm0, xmm5
   5469     movdqu     xmm5, [esi + 48]
   5470     lea        esi, [esi + 64]
   5471     paddd      xmm5, xmm0
   5472 
   5473     movdqu     [edx], xmm2
   5474     movdqu     [edx + 16], xmm3
   5475     movdqu     [edx + 32], xmm4
   5476     movdqu     [edx + 48], xmm5
   5477 
   5478     lea        edx, [edx + 64]
   5479     sub        ecx, 4
   5480     jge        l4
   5481 
   5482   l4b:
   5483     add        ecx, 4 - 1
   5484     jl         l1b
   5485 
   5486     // 1 pixel loop
   5487   l1:
   5488     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
   5489     lea        eax, [eax + 4]
   5490     punpcklbw  xmm2, xmm1
   5491     punpcklwd  xmm2, xmm1
   5492     paddd      xmm0, xmm2
   5493     movdqu     xmm2, [esi]
   5494     lea        esi, [esi + 16]
   5495     paddd      xmm2, xmm0
   5496     movdqu     [edx], xmm2
   5497     lea        edx, [edx + 16]
   5498     sub        ecx, 1
   5499     jge        l1
   5500 
   5501  l1b:
   5502   }
   5503 }
   5504 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
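
// Scalar recurrence for the cumulative-sum row above, per ARGB channel:
// cumsum[x] = previous_cumsum[x] + sum of row[0..x]. Illustrative helper
// matching the behavior of the SSE2 loop:
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, i;
  for (x = 0; x < width; ++x) {
    for (i = 0; i < 4; ++i) {
      sum[i] += row[x * 4 + i];
      cumsum[x * 4 + i] = previous_cumsum[x * 4 + i] + sum[i];
    }
  }
}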
   5505 
   5506 #ifdef HAS_ARGBAFFINEROW_SSE2
   5507 // Copy ARGB pixels from source image with slope to a row of destination.
   5508 __declspec(naked)
   5509 LIBYUV_API
   5510 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
   5511                         uint8* dst_argb, const float* uv_dudv, int width) {
   5512   __asm {
   5513     push       esi
   5514     push       edi
   5515     mov        eax, [esp + 12]  // src_argb
   5516     mov        esi, [esp + 16]  // stride
   5517     mov        edx, [esp + 20]  // dst_argb
   5518     mov        ecx, [esp + 24]  // pointer to uv_dudv
   5519     movq       xmm2, qword ptr [ecx]  // uv
   5520     movq       xmm7, qword ptr [ecx + 8]  // dudv
   5521     mov        ecx, [esp + 28]  // width
    5522     shl        esi, 16          // esi = stride << 16
    5523     add        esi, 4           // esi = (stride << 16) | 4, for pmaddwd
   5524     movd       xmm5, esi
   5525     sub        ecx, 4
   5526     jl         l4b
   5527 
   5528     // setup for 4 pixel loop
   5529     pshufd     xmm7, xmm7, 0x44  // dup dudv
   5530     pshufd     xmm5, xmm5, 0  // dup 4, stride
   5531     movdqa     xmm0, xmm2    // x0, y0, x1, y1
   5532     addps      xmm0, xmm7
   5533     movlhps    xmm2, xmm0
   5534     movdqa     xmm4, xmm7
   5535     addps      xmm4, xmm4    // dudv *= 2
   5536     movdqa     xmm3, xmm2    // x2, y2, x3, y3
   5537     addps      xmm3, xmm4
   5538     addps      xmm4, xmm4    // dudv *= 4
   5539 
   5540     // 4 pixel loop
   5541   l4:
   5542     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
   5543     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
   5544     packssdw   xmm0, xmm1    // x, y as 8 shorts
   5545     pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
   5546     movd       esi, xmm0
   5547     pshufd     xmm0, xmm0, 0x39  // shift right
   5548     movd       edi, xmm0
   5549     pshufd     xmm0, xmm0, 0x39  // shift right
   5550     movd       xmm1, [eax + esi]  // read pixel 0
   5551     movd       xmm6, [eax + edi]  // read pixel 1
   5552     punpckldq  xmm1, xmm6     // combine pixel 0 and 1
   5553     addps      xmm2, xmm4    // x, y += dx, dy first 2
   5554     movq       qword ptr [edx], xmm1
   5555     movd       esi, xmm0
   5556     pshufd     xmm0, xmm0, 0x39  // shift right
   5557     movd       edi, xmm0
   5558     movd       xmm6, [eax + esi]  // read pixel 2
   5559     movd       xmm0, [eax + edi]  // read pixel 3
   5560     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
   5561     addps      xmm3, xmm4    // x, y += dx, dy next 2
    5562     movq       qword ptr [edx + 8], xmm6
   5563     lea        edx, [edx + 16]
   5564     sub        ecx, 4
   5565     jge        l4
   5566 
   5567   l4b:
   5568     add        ecx, 4 - 1
   5569     jl         l1b
   5570 
   5571     // 1 pixel loop
   5572   l1:
   5573     cvttps2dq  xmm0, xmm2    // x, y float to int
   5574     packssdw   xmm0, xmm0    // x, y as shorts
   5575     pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
   5576     addps      xmm2, xmm7    // x, y += dx, dy
   5577     movd       esi, xmm0
   5578     movd       xmm0, [eax + esi]  // copy a pixel
   5579     movd       [edx], xmm0
   5580     lea        edx, [edx + 4]
   5581     sub        ecx, 1
   5582     jge        l1
   5583   l1b:
   5584     pop        edi
   5585     pop        esi
   5586     ret
   5587   }
   5588 }
   5589 #endif  // HAS_ARGBAFFINEROW_SSE2
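
// Editorial sketch (not part of the original file): the scalar equivalent of
// ARGBAffineRow_SSE2, assuming libyuv typedefs; compare ARGBAffineRow_C in
// row_common.cc. uv_dudv holds the starting source coordinate (u, v) and the
// per-pixel step (du, dv); each destination pixel is fetched at the truncated
// coordinate, matching the cvttps2dq + pmaddwd (offset = x * 4 + y * stride)
// sequence above.
static void ARGBAffineRow_C_Sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // truncate toward zero, like cvttps2dq.
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + x * 4 + y * src_argb_stride);
    u += uv_dudv[2];  // du
    v += uv_dudv[3];  // dv
  }
}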
   5590 
   5591 #ifdef HAS_INTERPOLATEROW_AVX2
   5592 // Bilinear filter 32x2 -> 32x1
   5593 __declspec(naked)
   5594 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
   5595                          ptrdiff_t src_stride, int dst_width,
   5596                          int source_y_fraction) {
   5597   __asm {
   5598     push       esi
   5599     push       edi
   5600     mov        edi, [esp + 8 + 4]   // dst_ptr
   5601     mov        esi, [esp + 8 + 8]   // src_ptr
   5602     mov        edx, [esp + 8 + 12]  // src_stride
   5603     mov        ecx, [esp + 8 + 16]  // dst_width
   5604     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   5605     // Dispatch to specialized filters if applicable.
   5606     cmp        eax, 0
   5607     je         xloop100  // 0 / 256.  Blend 100 / 0.
   5608     sub        edi, esi
   5609     cmp        eax, 128
    5610     je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
   5611 
   5612     vmovd      xmm0, eax  // high fraction 0..255
   5613     neg        eax
   5614     add        eax, 256
    5615     vmovd      xmm5, eax  // low fraction 255..1
   5616     vpunpcklbw xmm5, xmm5, xmm0
   5617     vpunpcklwd xmm5, xmm5, xmm5
   5618     vbroadcastss ymm5, xmm5
   5619 
    5620     mov        eax, 0x80808080  // 128 in each byte, for bias and rounding.
   5621     vmovd      xmm4, eax
   5622     vbroadcastss ymm4, xmm4
   5623 
   5624   xloop:
   5625     vmovdqu    ymm0, [esi]
   5626     vmovdqu    ymm2, [esi + edx]
   5627     vpunpckhbw ymm1, ymm0, ymm2  // mutates
   5628     vpunpcklbw ymm0, ymm0, ymm2
   5629     vpsubb     ymm1, ymm1, ymm4  // bias to signed image
   5630     vpsubb     ymm0, ymm0, ymm4
   5631     vpmaddubsw ymm1, ymm5, ymm1
   5632     vpmaddubsw ymm0, ymm5, ymm0
   5633     vpaddw     ymm1, ymm1, ymm4  // unbias and round
   5634     vpaddw     ymm0, ymm0, ymm4
   5635     vpsrlw     ymm1, ymm1, 8
   5636     vpsrlw     ymm0, ymm0, 8
   5637     vpackuswb  ymm0, ymm0, ymm1  // unmutates
   5638     vmovdqu    [esi + edi], ymm0
   5639     lea        esi, [esi + 32]
   5640     sub        ecx, 32
   5641     jg         xloop
   5642     jmp        xloop99
   5643 
   5644    // Blend 50 / 50.
   5645  xloop50:
   5646    vmovdqu    ymm0, [esi]
   5647    vpavgb     ymm0, ymm0, [esi + edx]
   5648    vmovdqu    [esi + edi], ymm0
   5649    lea        esi, [esi + 32]
   5650    sub        ecx, 32
   5651    jg         xloop50
   5652    jmp        xloop99
   5653 
   5654    // Blend 100 / 0 - Copy row unchanged.
   5655  xloop100:
   5656    rep movsb
   5657 
   5658   xloop99:
   5659     pop        edi
   5660     pop        esi
   5661     vzeroupper
   5662     ret
   5663   }
   5664 }
   5665 #endif  // HAS_INTERPOLATEROW_AVX2
   5666 
   5667 // Bilinear filter 16x2 -> 16x1
   5668 // TODO(fbarchard): Consider allowing 256 using memcpy.
   5669 __declspec(naked)
   5670 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   5671                           ptrdiff_t src_stride, int dst_width,
   5672                           int source_y_fraction) {
   5673   __asm {
   5674     push       esi
   5675     push       edi
   5676 
   5677     mov        edi, [esp + 8 + 4]   // dst_ptr
   5678     mov        esi, [esp + 8 + 8]   // src_ptr
   5679     mov        edx, [esp + 8 + 12]  // src_stride
   5680     mov        ecx, [esp + 8 + 16]  // dst_width
   5681     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   5682     sub        edi, esi
   5683     // Dispatch to specialized filters if applicable.
   5684     cmp        eax, 0
    5685     je         xloop100  // 0 / 256.  Blend 100 / 0.
   5686     cmp        eax, 128
   5687     je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
   5688 
   5689     movd       xmm0, eax  // high fraction 0..255
   5690     neg        eax
   5691     add        eax, 256
   5692     movd       xmm5, eax  // low fraction 255..1
   5693     punpcklbw  xmm5, xmm0
   5694     punpcklwd  xmm5, xmm5
   5695     pshufd     xmm5, xmm5, 0
   5696     mov        eax, 0x80808080  // 128 for biasing image to signed.
   5697     movd       xmm4, eax
   5698     pshufd     xmm4, xmm4, 0x00
   5699 
   5700   xloop:
   5701     movdqu     xmm0, [esi]
   5702     movdqu     xmm2, [esi + edx]
    5703     movdqa     xmm1, xmm0
   5704     punpcklbw  xmm0, xmm2
   5705     punpckhbw  xmm1, xmm2
   5706     psubb      xmm0, xmm4  // bias image by -128
   5707     psubb      xmm1, xmm4
   5708     movdqa     xmm2, xmm5
   5709     movdqa     xmm3, xmm5
   5710     pmaddubsw  xmm2, xmm0
   5711     pmaddubsw  xmm3, xmm1
   5712     paddw      xmm2, xmm4
   5713     paddw      xmm3, xmm4
   5714     psrlw      xmm2, 8
   5715     psrlw      xmm3, 8
   5716     packuswb   xmm2, xmm3
   5717     movdqu     [esi + edi], xmm2
   5718     lea        esi, [esi + 16]
   5719     sub        ecx, 16
   5720     jg         xloop
   5721     jmp        xloop99
   5722 
   5723     // Blend 50 / 50.
   5724   xloop50:
   5725     movdqu     xmm0, [esi]
   5726     movdqu     xmm1, [esi + edx]
   5727     pavgb      xmm0, xmm1
   5728     movdqu     [esi + edi], xmm0
   5729     lea        esi, [esi + 16]
   5730     sub        ecx, 16
   5731     jg         xloop50
   5732     jmp        xloop99
   5733 
   5734     // Blend 100 / 0 - Copy row unchanged.
   5735   xloop100:
   5736     movdqu     xmm0, [esi]
   5737     movdqu     [esi + edi], xmm0
   5738     lea        esi, [esi + 16]
   5739     sub        ecx, 16
   5740     jg         xloop100
   5741 
   5742   xloop99:
   5743     pop        edi
   5744     pop        esi
   5745     ret
   5746   }
   5747 }
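
// Editorial sketch (not part of the original file): a scalar model of the two
// row interpolators above, assuming libyuv typedefs; compare InterpolateRow_C
// in row_common.cc. The bias/unbias trick (psubb by 0x80, pmaddubsw, paddw
// 0x8080) works out to exactly this rounded per-byte blend.
static void InterpolateRow_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int width,
                                    int source_y_fraction) {
  int y1_fraction = source_y_fraction;  // weight of the row below.
  int y0_fraction = 256 - y1_fraction;  // weight of this row.
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8)(
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8);
  }
}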
   5748 
   5749 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
   5750 __declspec(naked)
   5751 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
   5752                           const uint8* shuffler, int width) {
   5753   __asm {
   5754     mov        eax, [esp + 4]    // src_argb
   5755     mov        edx, [esp + 8]    // dst_argb
   5756     mov        ecx, [esp + 12]   // shuffler
   5757     movdqu     xmm5, [ecx]
   5758     mov        ecx, [esp + 16]   // width
   5759 
   5760   wloop:
   5761     movdqu     xmm0, [eax]
   5762     movdqu     xmm1, [eax + 16]
   5763     lea        eax, [eax + 32]
   5764     pshufb     xmm0, xmm5
   5765     pshufb     xmm1, xmm5
   5766     movdqu     [edx], xmm0
   5767     movdqu     [edx + 16], xmm1
   5768     lea        edx, [edx + 32]
   5769     sub        ecx, 8
   5770     jg         wloop
   5771     ret
   5772   }
   5773 }
   5774 
   5775 #ifdef HAS_ARGBSHUFFLEROW_AVX2
   5776 __declspec(naked)
   5777 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
   5778                          const uint8* shuffler, int width) {
   5779   __asm {
   5780     mov        eax, [esp + 4]     // src_argb
   5781     mov        edx, [esp + 8]     // dst_argb
   5782     mov        ecx, [esp + 12]    // shuffler
   5783     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
   5784     mov        ecx, [esp + 16]    // width
   5785 
   5786   wloop:
   5787     vmovdqu    ymm0, [eax]
   5788     vmovdqu    ymm1, [eax + 32]
   5789     lea        eax, [eax + 64]
   5790     vpshufb    ymm0, ymm0, ymm5
   5791     vpshufb    ymm1, ymm1, ymm5
   5792     vmovdqu    [edx], ymm0
   5793     vmovdqu    [edx + 32], ymm1
   5794     lea        edx, [edx + 64]
   5795     sub        ecx, 16
   5796     jg         wloop
   5797 
   5798     vzeroupper
   5799     ret
   5800   }
   5801 }
   5802 #endif  // HAS_ARGBSHUFFLEROW_AVX2
   5803 
   5804 __declspec(naked)
   5805 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   5806                          const uint8* shuffler, int width) {
   5807   __asm {
   5808     push       ebx
   5809     push       esi
   5810     mov        eax, [esp + 8 + 4]    // src_argb
   5811     mov        edx, [esp + 8 + 8]    // dst_argb
   5812     mov        esi, [esp + 8 + 12]   // shuffler
   5813     mov        ecx, [esp + 8 + 16]   // width
   5814     pxor       xmm5, xmm5
   5815 
   5816     mov        ebx, [esi]   // shuffler
   5817     cmp        ebx, 0x03000102
   5818     je         shuf_3012
   5819     cmp        ebx, 0x00010203
   5820     je         shuf_0123
   5821     cmp        ebx, 0x00030201
   5822     je         shuf_0321
   5823     cmp        ebx, 0x02010003
   5824     je         shuf_2103
   5825 
   5826   // TODO(fbarchard): Use one source pointer and 3 offsets.
   5827   shuf_any1:
   5828     movzx      ebx, byte ptr [esi]
   5829     movzx      ebx, byte ptr [eax + ebx]
   5830     mov        [edx], bl
   5831     movzx      ebx, byte ptr [esi + 1]
   5832     movzx      ebx, byte ptr [eax + ebx]
   5833     mov        [edx + 1], bl
   5834     movzx      ebx, byte ptr [esi + 2]
   5835     movzx      ebx, byte ptr [eax + ebx]
   5836     mov        [edx + 2], bl
   5837     movzx      ebx, byte ptr [esi + 3]
   5838     movzx      ebx, byte ptr [eax + ebx]
   5839     mov        [edx + 3], bl
   5840     lea        eax, [eax + 4]
   5841     lea        edx, [edx + 4]
   5842     sub        ecx, 1
   5843     jg         shuf_any1
   5844     jmp        shuf99
   5845 
   5846   shuf_0123:
   5847     movdqu     xmm0, [eax]
   5848     lea        eax, [eax + 16]
   5849     movdqa     xmm1, xmm0
   5850     punpcklbw  xmm0, xmm5
   5851     punpckhbw  xmm1, xmm5
   5852     pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
   5853     pshuflw    xmm0, xmm0, 01Bh
   5854     pshufhw    xmm1, xmm1, 01Bh
   5855     pshuflw    xmm1, xmm1, 01Bh
   5856     packuswb   xmm0, xmm1
   5857     movdqu     [edx], xmm0
   5858     lea        edx, [edx + 16]
   5859     sub        ecx, 4
   5860     jg         shuf_0123
   5861     jmp        shuf99
   5862 
   5863   shuf_0321:
   5864     movdqu     xmm0, [eax]
   5865     lea        eax, [eax + 16]
   5866     movdqa     xmm1, xmm0
   5867     punpcklbw  xmm0, xmm5
   5868     punpckhbw  xmm1, xmm5
   5869     pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
   5870     pshuflw    xmm0, xmm0, 039h
   5871     pshufhw    xmm1, xmm1, 039h
   5872     pshuflw    xmm1, xmm1, 039h
   5873     packuswb   xmm0, xmm1
   5874     movdqu     [edx], xmm0
   5875     lea        edx, [edx + 16]
   5876     sub        ecx, 4
   5877     jg         shuf_0321
   5878     jmp        shuf99
   5879 
   5880   shuf_2103:
   5881     movdqu     xmm0, [eax]
   5882     lea        eax, [eax + 16]
   5883     movdqa     xmm1, xmm0
   5884     punpcklbw  xmm0, xmm5
   5885     punpckhbw  xmm1, xmm5
   5886     pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
   5887     pshuflw    xmm0, xmm0, 093h
   5888     pshufhw    xmm1, xmm1, 093h
   5889     pshuflw    xmm1, xmm1, 093h
   5890     packuswb   xmm0, xmm1
   5891     movdqu     [edx], xmm0
   5892     lea        edx, [edx + 16]
   5893     sub        ecx, 4
   5894     jg         shuf_2103
   5895     jmp        shuf99
   5896 
   5897   shuf_3012:
   5898     movdqu     xmm0, [eax]
   5899     lea        eax, [eax + 16]
   5900     movdqa     xmm1, xmm0
   5901     punpcklbw  xmm0, xmm5
   5902     punpckhbw  xmm1, xmm5
   5903     pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
   5904     pshuflw    xmm0, xmm0, 0C6h
   5905     pshufhw    xmm1, xmm1, 0C6h
   5906     pshuflw    xmm1, xmm1, 0C6h
   5907     packuswb   xmm0, xmm1
   5908     movdqu     [edx], xmm0
   5909     lea        edx, [edx + 16]
   5910     sub        ecx, 4
   5911     jg         shuf_3012
   5912 
   5913   shuf99:
   5914     pop        esi
   5915     pop        ebx
   5916     ret
   5917   }
   5918 }
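
// Editorial sketch (not part of the original file): the scalar shuffle that
// all three ARGBShuffleRow variants above implement, assuming libyuv
// typedefs; compare ARGBShuffleRow_C in row_common.cc. shuffler[0..3] names,
// for each output channel, the source channel within the 4-byte pixel; the
// SSE2 version merely special-cases the four common masks it checks for.
static void ARGBShuffleRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[x * 4 + 0] = src_argb[x * 4 + shuffler[0]];
    dst_argb[x * 4 + 1] = src_argb[x * 4 + shuffler[1]];
    dst_argb[x * 4 + 2] = src_argb[x * 4 + shuffler[2]];
    dst_argb[x * 4 + 3] = src_argb[x * 4 + shuffler[3]];
  }
}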
   5919 
    5920 // YUY2 - Macro-pixel = 2 image pixels
    5921 // Y0 U0 Y1 V0    Y2 U2 Y3 V2    Y4 U4 Y5 V4
    5922 
    5923 // UYVY - Macro-pixel = 2 image pixels
    5924 // U0 Y0 V0 Y1
   5925 
   5926 __declspec(naked)
   5927 void I422ToYUY2Row_SSE2(const uint8* src_y,
   5928                         const uint8* src_u,
   5929                         const uint8* src_v,
   5930                         uint8* dst_frame, int width) {
   5931   __asm {
   5932     push       esi
   5933     push       edi
   5934     mov        eax, [esp + 8 + 4]    // src_y
   5935     mov        esi, [esp + 8 + 8]    // src_u
   5936     mov        edx, [esp + 8 + 12]   // src_v
   5937     mov        edi, [esp + 8 + 16]   // dst_frame
   5938     mov        ecx, [esp + 8 + 20]   // width
   5939     sub        edx, esi
   5940 
   5941   convertloop:
   5942     movq       xmm2, qword ptr [esi] // U
   5943     movq       xmm3, qword ptr [esi + edx] // V
   5944     lea        esi, [esi + 8]
   5945     punpcklbw  xmm2, xmm3 // UV
   5946     movdqu     xmm0, [eax] // Y
   5947     lea        eax, [eax + 16]
   5948     movdqa     xmm1, xmm0
   5949     punpcklbw  xmm0, xmm2 // YUYV
   5950     punpckhbw  xmm1, xmm2
   5951     movdqu     [edi], xmm0
   5952     movdqu     [edi + 16], xmm1
   5953     lea        edi, [edi + 32]
   5954     sub        ecx, 16
   5955     jg         convertloop
   5956 
   5957     pop        edi
   5958     pop        esi
   5959     ret
   5960   }
   5961 }
   5962 
   5963 __declspec(naked)
   5964 void I422ToUYVYRow_SSE2(const uint8* src_y,
   5965                         const uint8* src_u,
   5966                         const uint8* src_v,
   5967                         uint8* dst_frame, int width) {
   5968   __asm {
   5969     push       esi
   5970     push       edi
   5971     mov        eax, [esp + 8 + 4]    // src_y
   5972     mov        esi, [esp + 8 + 8]    // src_u
   5973     mov        edx, [esp + 8 + 12]   // src_v
   5974     mov        edi, [esp + 8 + 16]   // dst_frame
   5975     mov        ecx, [esp + 8 + 20]   // width
   5976     sub        edx, esi
   5977 
   5978   convertloop:
   5979     movq       xmm2, qword ptr [esi] // U
   5980     movq       xmm3, qword ptr [esi + edx] // V
   5981     lea        esi, [esi + 8]
   5982     punpcklbw  xmm2, xmm3 // UV
   5983     movdqu     xmm0, [eax] // Y
   5984     movdqa     xmm1, xmm2
   5985     lea        eax, [eax + 16]
   5986     punpcklbw  xmm1, xmm0 // UYVY
   5987     punpckhbw  xmm2, xmm0
   5988     movdqu     [edi], xmm1
   5989     movdqu     [edi + 16], xmm2
   5990     lea        edi, [edi + 32]
   5991     sub        ecx, 16
   5992     jg         convertloop
   5993 
   5994     pop        edi
   5995     pop        esi
   5996     ret
   5997   }
   5998 }
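
// Editorial sketch (not part of the original file): scalar packing for the
// two converters above, assuming libyuv typedefs and an even width as the
// 16-pixel SSE2 loops do; compare I422ToYUY2Row_C and I422ToUYVYRow_C in
// row_common.cc. Two Y samples share one U and one V; YUY2 interleaves as
// Y0 U Y1 V, while UYVY orders the same bytes as U Y0 V Y1.
static void I422ToYUY2Row_C_Sketch(const uint8* src_y, const uint8* src_u,
                                   const uint8* src_v, uint8* dst_frame,
                                   int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}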
   5999 
   6000 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
   6001 __declspec(naked)
   6002 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
   6003                             uint8* dst_argb, const float* poly,
   6004                             int width) {
   6005   __asm {
   6006     push       esi
   6007     mov        eax, [esp + 4 + 4]   /* src_argb */
   6008     mov        edx, [esp + 4 + 8]   /* dst_argb */
   6009     mov        esi, [esp + 4 + 12]  /* poly */
   6010     mov        ecx, [esp + 4 + 16]  /* width */
   6011     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
   6012 
   6013     // 2 pixel loop.
   6014  convertloop:
    6015     // With SSE4.1, the load + zero-extend below could instead be:
    6016     //   pmovzxbd xmm0, dword ptr [eax]      and  pmovzxbd xmm4, dword ptr [eax + 4]
   6017     movq       xmm0, qword ptr [eax]  // BGRABGRA
   6018     lea        eax, [eax + 8]
   6019     punpcklbw  xmm0, xmm3
   6020     movdqa     xmm4, xmm0
   6021     punpcklwd  xmm0, xmm3  // pixel 0
   6022     punpckhwd  xmm4, xmm3  // pixel 1
   6023     cvtdq2ps   xmm0, xmm0  // 4 floats
   6024     cvtdq2ps   xmm4, xmm4
   6025     movdqa     xmm1, xmm0  // X
   6026     movdqa     xmm5, xmm4
   6027     mulps      xmm0, [esi + 16]  // C1 * X
   6028     mulps      xmm4, [esi + 16]
   6029     addps      xmm0, [esi]  // result = C0 + C1 * X
   6030     addps      xmm4, [esi]
   6031     movdqa     xmm2, xmm1
   6032     movdqa     xmm6, xmm5
   6033     mulps      xmm2, xmm1  // X * X
   6034     mulps      xmm6, xmm5
   6035     mulps      xmm1, xmm2  // X * X * X
   6036     mulps      xmm5, xmm6
   6037     mulps      xmm2, [esi + 32]  // C2 * X * X
   6038     mulps      xmm6, [esi + 32]
   6039     mulps      xmm1, [esi + 48]  // C3 * X * X * X
   6040     mulps      xmm5, [esi + 48]
   6041     addps      xmm0, xmm2  // result += C2 * X * X
   6042     addps      xmm4, xmm6
   6043     addps      xmm0, xmm1  // result += C3 * X * X * X
   6044     addps      xmm4, xmm5
   6045     cvttps2dq  xmm0, xmm0
   6046     cvttps2dq  xmm4, xmm4
   6047     packuswb   xmm0, xmm4
   6048     packuswb   xmm0, xmm0
   6049     movq       qword ptr [edx], xmm0
   6050     lea        edx, [edx + 8]
   6051     sub        ecx, 2
   6052     jg         convertloop
   6053     pop        esi
   6054     ret
   6055   }
   6056 }
   6057 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
   6058 
   6059 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
   6060 __declspec(naked)
   6061 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
   6062                             uint8* dst_argb, const float* poly,
   6063                             int width) {
   6064   __asm {
   6065     mov        eax, [esp + 4]   /* src_argb */
   6066     mov        edx, [esp + 8]   /* dst_argb */
   6067     mov        ecx, [esp + 12]   /* poly */
   6068     vbroadcastf128 ymm4, [ecx]       // C0
   6069     vbroadcastf128 ymm5, [ecx + 16]  // C1
   6070     vbroadcastf128 ymm6, [ecx + 32]  // C2
   6071     vbroadcastf128 ymm7, [ecx + 48]  // C3
   6072     mov        ecx, [esp + 16]  /* width */
   6073 
   6074     // 2 pixel loop.
   6075  convertloop:
   6076     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
   6077     lea         eax, [eax + 8]
   6078     vcvtdq2ps   ymm0, ymm0        // X 8 floats
   6079     vmulps      ymm2, ymm0, ymm0  // X * X
   6080     vmulps      ymm3, ymm0, ymm7  // C3 * X
   6081     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
   6082     vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
   6083     vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
   6084     vcvttps2dq  ymm0, ymm0
   6085     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
   6086     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
   6087     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
   6088     vmovq       qword ptr [edx], xmm0
   6089     lea         edx, [edx + 8]
   6090     sub         ecx, 2
   6091     jg          convertloop
   6092     vzeroupper
   6093     ret
   6094   }
   6095 }
   6096 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
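
// Editorial sketch (not part of the original file): the per-channel cubic
// that both polynomial rows above evaluate, assuming libyuv typedefs;
// compare ARGBPolynomialRow_C in row_common.cc. poly holds four 4-float
// coefficient vectors C0..C3, one float per channel; results are truncated
// and saturated to bytes, as cvttps2dq + packuswb do above.
static void ARGBPolynomialRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                       const float* poly, int width) {
  for (int i = 0; i < width * 4; ++i) {
    float x = (float)src_argb[i];
    int c = i & 3;  // channel index within the BGRA pixel.
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    if (v < 0.f) v = 0.f;
    if (v > 255.f) v = 255.f;
    dst_argb[i] = (uint8)v;
  }
}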
   6097 
   6098 #ifdef HAS_ARGBCOLORTABLEROW_X86
    6099 // Transform ARGB pixels with color table.
   6100 __declspec(naked)
   6101 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
   6102                            int width) {
   6103   __asm {
   6104     push       esi
   6105     mov        eax, [esp + 4 + 4]   /* dst_argb */
   6106     mov        esi, [esp + 4 + 8]   /* table_argb */
   6107     mov        ecx, [esp + 4 + 12]  /* width */
   6108 
   6109     // 1 pixel loop.
   6110   convertloop:
   6111     movzx      edx, byte ptr [eax]
   6112     lea        eax, [eax + 4]
   6113     movzx      edx, byte ptr [esi + edx * 4]
   6114     mov        byte ptr [eax - 4], dl
   6115     movzx      edx, byte ptr [eax - 4 + 1]
   6116     movzx      edx, byte ptr [esi + edx * 4 + 1]
   6117     mov        byte ptr [eax - 4 + 1], dl
   6118     movzx      edx, byte ptr [eax - 4 + 2]
   6119     movzx      edx, byte ptr [esi + edx * 4 + 2]
   6120     mov        byte ptr [eax - 4 + 2], dl
   6121     movzx      edx, byte ptr [eax - 4 + 3]
   6122     movzx      edx, byte ptr [esi + edx * 4 + 3]
   6123     mov        byte ptr [eax - 4 + 3], dl
   6124     dec        ecx
   6125     jg         convertloop
   6126     pop        esi
   6127     ret
   6128   }
   6129 }
   6130 #endif  // HAS_ARGBCOLORTABLEROW_X86
   6131 
   6132 #ifdef HAS_RGBCOLORTABLEROW_X86
    6133 // Transform RGB pixels with color table.
   6134 __declspec(naked)
   6135 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
   6136   __asm {
   6137     push       esi
   6138     mov        eax, [esp + 4 + 4]   /* dst_argb */
   6139     mov        esi, [esp + 4 + 8]   /* table_argb */
   6140     mov        ecx, [esp + 4 + 12]  /* width */
   6141 
   6142     // 1 pixel loop.
   6143   convertloop:
   6144     movzx      edx, byte ptr [eax]
   6145     lea        eax, [eax + 4]
   6146     movzx      edx, byte ptr [esi + edx * 4]
   6147     mov        byte ptr [eax - 4], dl
   6148     movzx      edx, byte ptr [eax - 4 + 1]
   6149     movzx      edx, byte ptr [esi + edx * 4 + 1]
   6150     mov        byte ptr [eax - 4 + 1], dl
   6151     movzx      edx, byte ptr [eax - 4 + 2]
   6152     movzx      edx, byte ptr [esi + edx * 4 + 2]
   6153     mov        byte ptr [eax - 4 + 2], dl
   6154     dec        ecx
   6155     jg         convertloop
   6156 
   6157     pop        esi
   6158     ret
   6159   }
   6160 }
   6161 #endif  // HAS_RGBCOLORTABLEROW_X86
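
// Editorial sketch (not part of the original file): the in-place lookup that
// both color-table rows above perform, assuming libyuv typedefs; compare
// ARGBColorTableRow_C in row_common.cc. table_argb is 256 entries of 4
// bytes, so each channel indexes its own column; the RGB variant simply
// skips the alpha lookup.
static void ARGBColorTableRow_C_Sketch(uint8* dst_argb,
                                       const uint8* table_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8* p = dst_argb + x * 4;
    p[0] = table_argb[p[0] * 4 + 0];
    p[1] = table_argb[p[1] * 4 + 1];
    p[2] = table_argb[p[2] * 4 + 2];
    p[3] = table_argb[p[3] * 4 + 3];
  }
}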
   6162 
   6163 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
    6164 // Transform RGB pixels with luma table.
   6165 __declspec(naked)
   6166 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
   6167                                  int width,
   6168                                  const uint8* luma, uint32 lumacoeff) {
   6169   __asm {
   6170     push       esi
   6171     push       edi
   6172     mov        eax, [esp + 8 + 4]   /* src_argb */
   6173     mov        edi, [esp + 8 + 8]   /* dst_argb */
   6174     mov        ecx, [esp + 8 + 12]  /* width */
   6175     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
   6176     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
   6177     pshufd     xmm2, xmm2, 0
   6178     pshufd     xmm3, xmm3, 0
   6179     pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
   6180     psllw      xmm4, 8
   6181     pxor       xmm5, xmm5
   6182 
   6183     // 4 pixel loop.
   6184   convertloop:
    6185     movdqu     xmm0, xmmword ptr [eax]      // load 4 ARGB pixels to build luma table ptrs
   6186     pmaddubsw  xmm0, xmm3
   6187     phaddw     xmm0, xmm0
   6188     pand       xmm0, xmm4  // mask out low bits
   6189     punpcklwd  xmm0, xmm5
   6190     paddd      xmm0, xmm2  // add table base
   6191     movd       esi, xmm0
   6192     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   6193 
   6194     movzx      edx, byte ptr [eax]
   6195     movzx      edx, byte ptr [esi + edx]
   6196     mov        byte ptr [edi], dl
   6197     movzx      edx, byte ptr [eax + 1]
   6198     movzx      edx, byte ptr [esi + edx]
   6199     mov        byte ptr [edi + 1], dl
   6200     movzx      edx, byte ptr [eax + 2]
   6201     movzx      edx, byte ptr [esi + edx]
   6202     mov        byte ptr [edi + 2], dl
   6203     movzx      edx, byte ptr [eax + 3]  // copy alpha.
   6204     mov        byte ptr [edi + 3], dl
   6205 
   6206     movd       esi, xmm0
   6207     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   6208 
   6209     movzx      edx, byte ptr [eax + 4]
   6210     movzx      edx, byte ptr [esi + edx]
   6211     mov        byte ptr [edi + 4], dl
   6212     movzx      edx, byte ptr [eax + 5]
   6213     movzx      edx, byte ptr [esi + edx]
   6214     mov        byte ptr [edi + 5], dl
   6215     movzx      edx, byte ptr [eax + 6]
   6216     movzx      edx, byte ptr [esi + edx]
   6217     mov        byte ptr [edi + 6], dl
   6218     movzx      edx, byte ptr [eax + 7]  // copy alpha.
   6219     mov        byte ptr [edi + 7], dl
   6220 
   6221     movd       esi, xmm0
   6222     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
   6223 
   6224     movzx      edx, byte ptr [eax + 8]
   6225     movzx      edx, byte ptr [esi + edx]
   6226     mov        byte ptr [edi + 8], dl
   6227     movzx      edx, byte ptr [eax + 9]
   6228     movzx      edx, byte ptr [esi + edx]
   6229     mov        byte ptr [edi + 9], dl
   6230     movzx      edx, byte ptr [eax + 10]
   6231     movzx      edx, byte ptr [esi + edx]
   6232     mov        byte ptr [edi + 10], dl
   6233     movzx      edx, byte ptr [eax + 11]  // copy alpha.
   6234     mov        byte ptr [edi + 11], dl
   6235 
   6236     movd       esi, xmm0
   6237 
   6238     movzx      edx, byte ptr [eax + 12]
   6239     movzx      edx, byte ptr [esi + edx]
   6240     mov        byte ptr [edi + 12], dl
   6241     movzx      edx, byte ptr [eax + 13]
   6242     movzx      edx, byte ptr [esi + edx]
   6243     mov        byte ptr [edi + 13], dl
   6244     movzx      edx, byte ptr [eax + 14]
   6245     movzx      edx, byte ptr [esi + edx]
   6246     mov        byte ptr [edi + 14], dl
   6247     movzx      edx, byte ptr [eax + 15]  // copy alpha.
   6248     mov        byte ptr [edi + 15], dl
   6249 
   6250     lea        eax, [eax + 16]
   6251     lea        edi, [edi + 16]
   6252     sub        ecx, 4
   6253     jg         convertloop
   6254 
   6255     pop        edi
   6256     pop        esi
   6257     ret
   6258   }
   6259 }
   6260 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
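
// Editorial sketch (not part of the original file): a scalar model of the
// luma table row above, assuming libyuv typedefs; compare
// ARGBLumaColorTableRow_C in row_common.cc. lumacoeff packs byte weights for
// B, G and R (alpha weight zero); the weighted sum, masked to a multiple of
// 256 as the pand above does, selects one 256-byte table inside luma, through
// which B, G and R are remapped while alpha is copied. (The SIMD version's
// pmaddubsw saturates, capping the index at 0x7F00.)
static void ARGBLumaColorTableRow_C_Sketch(const uint8* src_argb,
                                           uint8* dst_argb, int width,
                                           const uint8* luma,
                                           uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  for (int x = 0; x < width; ++x) {
    const uint8* luma0 =
        luma + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                0xFF00u);
    dst_argb[0] = luma0[src_argb[0]];
    dst_argb[1] = luma0[src_argb[1]];
    dst_argb[2] = luma0[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha passes through unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}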
   6261 
    6262 #endif  // defined(_M_IX86)
   6263 
   6264 #ifdef __cplusplus
   6265 }  // extern "C"
   6266 }  // namespace libyuv
   6267 #endif
   6268 
    6269 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
   6270