/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
    defined(_MSC_VER) && !defined(__clang__)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C.
#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
    defined(_MSC_VER) && !defined(__clang__)

struct YuvConstants {
  lvec8 kUVToB;     // 0
  lvec8 kUVToG;     // 32
  lvec8 kUVToR;     // 64
  lvec16 kUVBiasB;  // 96
  lvec16 kUVBiasG;  // 128
  lvec16 kUVBiasR;  // 160
  lvec16 kYToRgb;   // 192
};

// BT.601 YUV to RGB reference
//  R = (Y - 16) * 1.164              - V * -1.596
//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
//  B = (Y - 16) * 1.164 - U * -2.018

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)

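// A minimal scalar sketch of how the constants above combine, for
// reference only. LIBYUV_REFERENCE_SKETCH is a hypothetical guard (not a
// libyuv macro); the helpers are not used by the SIMD paths below, which
// compute the same expression per lane with pmaddubsw (uv terms),
// pmulhuw (y term) and a final arithmetic shift by 6.
#ifdef LIBYUV_REFERENCE_SKETCH
static __inline uint8 SketchClamp(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void SketchYuvPixel(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r) {
  // High 16 bits of y replicated into both bytes, times YG.
  int y1 = (int)(((uint32)y * 0x0101 * YG) >> 16);
  *b = SketchClamp((y1 + BB - u * UB) >> 6);            // UB is negative.
  *g = SketchClamp((y1 + BG - (u * UG + v * VG)) >> 6);
  *r = SketchClamp((y1 + BR - v * VR) >> 6);            // VR is negative.
}
#endif  // LIBYUV_REFERENCE_SKETCH
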
// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

// BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR

// JPEG YUV to RGB reference
//  R = Y                - V * -1.40200
//  G = Y - U *  0.34414 - V *  0.71414
//  B = Y - U * -1.77200

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32  /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */

// Bias values to round, and to subtract 128 from U and V.  JPEG Y is
// full range, so no 16 is subtracted.
#define BBJ (UBJ * 128             + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ             (VRJ * 128 + YGBJ)

// JPEG constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};

#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ

// 64 bit
#if defined(_M_X64)
#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm3;
  const __m128i xmm5 = _mm_set1_epi8(-1);  // 0xff for opaque alpha.
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

  while (width > 0) {
    // Load 4 U and 4 V values, interleave to u,v pairs, then widen each
    // pair so it covers 2 output pixels.
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    xmm1 = xmm0;
    xmm2 = xmm0;
    // Per-channel UV contributions, subtracted from the combined biases.
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
    // Scale 8 Y values and add them to each channel.
    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
    xmm0 = _mm_adds_epi16(xmm0, xmm3);
    xmm1 = _mm_adds_epi16(xmm1, xmm3);
    xmm2 = _mm_adds_epi16(xmm2, xmm3);
    // Shift off the 6 fraction bits, clamp to bytes and interleave the
    // channels into 8 ARGB pixels.
    xmm0 = _mm_srai_epi16(xmm0, 6);
    xmm1 = _mm_srai_epi16(xmm1, 6);
    xmm2 = _mm_srai_epi16(xmm2, 6);
    xmm0 = _mm_packus_epi16(xmm0, xmm0);
    xmm1 = _mm_packus_epi16(xmm1, xmm1);
    xmm2 = _mm_packus_epi16(xmm2, xmm2);
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);  // BG
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);  // RA
    xmm1 = xmm0;
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);

    _mm_storeu_si128((__m128i*)dst_argb, xmm0);
    _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1);

    y_buf += 8;
    u_buf += 4;
    dst_argb += 32;
    width -= 8;
  }
}
#endif
// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24 for I422ToRGB24.  First 8 bytes,
// then next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW with the same first 8 + next 4
// split.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};

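// Illustrative sketch of how the shuffle tables above are applied.
// LIBYUV_REFERENCE_SKETCH is a hypothetical guard (not a libyuv macro)
// and SSSE3 intrinsics headers are assumed; the 32-bit paths below do
// the same thing with inline-asm pshufb. Table entries of 128u have the
// pshufb high bit set, so those destination bytes come out as zero.
#ifdef LIBYUV_REFERENCE_SKETCH
static void SketchRGB24ToARGB4Pixels(const uint8* src_rgb24,
                                     uint8* dst_argb) {
  const __m128i alpha = _mm_set1_epi32((int)0xff000000);
  // 16 source bytes cover the 12 RGB bytes of 4 pixels.
  __m128i rgb = _mm_loadu_si128((const __m128i*)src_rgb24);
  __m128i argb = _mm_shuffle_epi8(
      rgb, _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB));
  argb = _mm_or_si128(argb, alpha);            // Force opaque alpha.
  _mm_storeu_si128((__m128i*)dst_argb, argb);  // 4 ARGB pixels out.
}
#endif  // LIBYUV_REFERENCE_SKETCH
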
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]        // src_y
    mov        edx, [esp + 8]        // dst_argb
    mov        ecx, [esp + 12]       // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
    pslld      xmm5, 24

  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov         eax, [esp + 4]        // src_y
    mov         edx, [esp + 8]        // dst_argb
    mov         ecx, [esp + 12]       // pix
    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
    vpslld      ymm5, ymm5, 24

  convertloop:
    vmovdqu     xmm0, [eax]
    lea         eax,  [eax + 16]
    vpermq      ymm0, ymm0, 0xd8
    vpunpcklbw  ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhwd  ymm1, ymm0, ymm0
    vpunpcklwd  ymm0, ymm0, ymm0
    vpor        ymm0, ymm0, ymm5
    vpor        ymm1, ymm1, ymm5
    vmovdqu     [edx], ymm0
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgb24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRGB24ToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm1[8:15] xmm3[0:7] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm0[12:15] xmm1[0:11] }
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRAWToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm1[8:15] xmm3[0:7] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm0[12:15] xmm1[0:11] }
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3    // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    pand      xmm0, xmm4    // G in middle 6 bits
    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
    por       xmm0, xmm7    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

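// Worked scalar example of the pmul bit replication above (illustrative
// only; LIBYUV_REFERENCE_SKETCH is a hypothetical guard, not a libyuv
// macro). pmulhuw keeps the high 16 bits of the product, so a field
// aligned to the top of a 16-bit lane times (256 + 8) yields
// (v << 3) | (v >> 2), and the green multiplier folds in its shift.
#ifdef LIBYUV_REFERENCE_SKETCH
static uint8 SketchExpand5To8(uint8 v5) {
  // (v5 << 3) | (v5 >> 2), computed the pmulhuw way: high 16 bits of
  // (v5 << 11) * 0x0108.
  return (uint8)((((uint32)v5 << 11) * 0x0108u) >> 16);
}
static uint8 SketchExpand6To8(uint8 v6) {
  // (v6 << 2) | (v6 >> 4): green is masked in place as (v6 << 5), and
  // 0x2080 combines the remaining shift with the replication.
  return (uint8)((((uint32)v6 << 5) * 0x2080u) >> 16);
}
#endif  // LIBYUV_REFERENCE_SKETCH
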
#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
    vpsllw     ymm4, ymm4, 10
    vpsrlw     ymm4, ymm4, 5
    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

 convertloop:
    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2    // RB
    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
    vpor       ymm0, ymm0, ymm7    // AG
    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea        eax, [eax + 32]
    sub        ecx, 16
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax,  [esp + 4]   // src_argb1555
    mov        edx,  [esp + 8]   // dst_argb
    mov        ecx,  [esp + 12]  // pix
    sub        edx,  eax
    sub        edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
    vpand      ymm1, ymm1, ymm3
    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2    // RB
    vpsraw     ymm2, ymm0, 8       // A
    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
    vpand      ymm2, ymm2, ymm7
    vpor       ymm0, ymm0, ymm2    // AG
    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea        eax, [eax + 32]
    sub        ecx, 16
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd     xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
    mov       eax,  [esp + 4]   // src_argb4444
    mov       edx,  [esp + 8]   // dst_argb
    mov       ecx,  [esp + 12]  // pix
    sub       edx,  eax
    sub       edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
    vpand      ymm2, ymm0, ymm5    // mask high nibbles
    vpand      ymm0, ymm0, ymm4    // mask low nibbles
    vpsrlw     ymm3, ymm2, 4
    vpsllw     ymm1, ymm0, 4
    vpor       ymm2, ymm2, ymm3
    vpor       ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq     ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea        eax, [eax + 32]
    sub        ecx, 16
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_argb1555
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1       // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // G in middle 5 bits
    psraw     xmm2, 8       // A
    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]   // src_argb4444
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // mask low nibbles
    pand      xmm2, xmm5    // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

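// Scalar view of the 4444 nibble replication above (illustrative only;
// LIBYUV_REFERENCE_SKETCH is a hypothetical guard): a 4-bit field
// expands to 8 bits as (n << 4) | n, i.e. n * 17, which is what the
// mask/shift/por sequence computes for all lanes at once.
#ifdef LIBYUV_REFERENCE_SKETCH
static uint8 SketchExpand4To8(uint8 n4) {
  return (uint8)(n4 * 0x11);  // 0x0 -> 0x00, 0x8 -> 0x88, 0xf -> 0xff.
}
#endif  // LIBYUV_REFERENCE_SKETCH
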
__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRGB24

 convertloop:
    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1   // store 1
    movdqu    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRAW

 convertloop:
    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1   // store 1
    movdqu    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// Converts 4 ARGB pixels to 4 RGB565 pixels per loop.
__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

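// Scalar equivalent of the RGB565 packing above (illustrative only;
// LIBYUV_REFERENCE_SKETCH is a hypothetical guard): truncate the 8-bit
// channels to 5/6/5 bits and concatenate, blue in the low bits.
#ifdef LIBYUV_REFERENCE_SKETCH
static uint16 SketchPackRGB565(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
#endif  // LIBYUV_REFERENCE_SKETCH
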
// Converts 4 ARGB pixels to 4 RGB565 pixels per loop, adding a 4 byte
// dither pattern first.
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    movd      xmm6, [esp + 12] // dither4
    mov       ecx, [esp + 16]  // pix
    punpcklbw xmm6, xmm6       // make dither 16 bytes
    movdqa    xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    paddusb   xmm0, xmm6    // add dither
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

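// Scalar view of the dithered variant above (illustrative only;
// LIBYUV_REFERENCE_SKETCH is a hypothetical guard): each pixel gets one
// byte of the 4-byte dither pattern added to all channels with unsigned
// saturation (paddusb) before the 5/6/5 truncation.
#ifdef LIBYUV_REFERENCE_SKETCH
static uint16 SketchPackRGB565Dither(uint8 b, uint8 g, uint8 r, uint8 d) {
  int bd = (b + d > 255) ? 255 : (b + d);  // paddusb saturates per byte.
  int gd = (g + d > 255) ? 255 : (g + d);
  int rd = (r + d > 255) ? 255 : (r + d);
  return (uint16)((bd >> 3) | ((gd >> 2) << 5) | ((rd >> 3) << 11));
}
#endif  // LIBYUV_REFERENCE_SKETCH
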
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov        ecx, [esp + 16]     // pix
    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
    vpermq     ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpaddusb   ymm0, ymm0, ymm6    // add dither
    vpsrld     ymm2, ymm0, 5       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrld     ymm0, ymm0, 8       // R
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4       // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4       // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    pslld     xmm7, 15

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    movdqa    xmm3, xmm0    // R
    psrad     xmm0, 16      // A
    psrld     xmm1, 3       // B
    psrld     xmm2, 6       // G
    psrld     xmm3, 9       // R
    pand      xmm0, xmm7    // A
    pand      xmm1, xmm4    // B
    pand      xmm2, xmm5    // G
    pand      xmm3, xmm6    // R
    por       xmm0, xmm1    // BA
    por       xmm2, xmm3    // GR
    por       xmm0, xmm2    // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    psrlw     xmm3, 8

 convertloop:
    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3    // low nibble
    pand      xmm1, xmm4    // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // pix
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrld     ymm0, ymm0, 8       // R
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // pix
    vpcmpeqb   ymm4, ymm4, ymm4
    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
    vpslld     ymm7, ymm7, 15

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm3, ymm0, 9       // R
    vpsrld     ymm2, ymm0, 6       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrad     ymm0, ymm0, 16      // A
    vpand      ymm3, ymm3, ymm6    // R
    vpand      ymm2, ymm2, ymm5    // G
    vpand      ymm1, ymm1, ymm4    // B
    vpand      ymm0, ymm0, ymm7    // A
    vpor       ymm0, ymm0, ymm1    // BA
    vpor       ymm2, ymm2, ymm3    // GR
    vpor       ymm0, ymm0, ymm2    // BGRA
    vpackssdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
    vpsllw     ymm4, ymm4, 12
    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpand      ymm1, ymm0, ymm4    // high nibble
    vpand      ymm0, ymm0, ymm3    // low nibble
    vpsrld     ymm1, ymm1, 8
    vpsrld     ymm0, ymm0, 4
    vpor       ymm0, ymm0, ymm1
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToY
    movdqa     xmm5, kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

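// Scalar equivalent of the pmaddubsw/phaddw pipeline above (illustrative
// only; LIBYUV_REFERENCE_SKETCH is a hypothetical guard). kARGBToY holds
// {B,G,R,A} weights in 7-bit fixed point; kAddY16 adds the video-range
// offset after the shift.
#ifdef LIBYUV_REFERENCE_SKETCH
static uint8 SketchARGBPixelToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
#endif  // LIBYUV_REFERENCE_SKETCH
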
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow, but with JPEG coefficients, no +16 bias, and
// rounding added before the shift.
__declspec(naked)
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5  // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd to undo the lane mutation of vphaddw + vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqu    ymm6, kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqu    ymm6, kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYJROW_AVX2

__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kBGRAToY
    movdqa     xmm5, kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kABGRToY
    movdqa     xmm5, kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked)
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kRGBAToY
    movdqa     xmm5, kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, kAddUV128
    movdqa     xmm6, kARGBToV
    movdqa     xmm7, kARGBToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
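
// The UV rows above implement, in SSSE3, a 2x2 box filter followed by a
// fixed-point dot product.  A minimal scalar sketch of the same math is
// below; it is illustrative only (not built), and assumes the BT.601
// coefficients used by libyuv's C reference (112, -74, -38 for U and
// 112, -94, -18 for V), which is what kARGBToU/kARGBToV encode.
#if 0
static void ARGBToUVRow_Sketch(const uint8* src_argb0, int src_stride_argb,
                               uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;  // second row
  int x;
  for (x = 0; x < width; x += 2) {
    // step 1 - average a 2x2 block per channel.  The asm uses pavgb twice,
    // which rounds at each stage; this single rounded sum is a close model.
    int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
    int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
    int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
    // step 2 - 8.8 fixed-point dot product (pmaddubsw/phaddw/psraw), then
    // bias by 128 to make the signed result unsigned (paddb kAddUV128).
    *dst_u++ = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
    src_argb0 += 8;  // advance 2 ARGB pixels
    src_argb1 += 8;
  }
}
#endif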

__declspec(naked)
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, kAddUVJ128
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm7, kARGBToUJ
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
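
// The only differences from ARGBToUVRow_SSSE3 above are the JPEG (full
// range) coefficients and the explicit "+.5" rounding: kAddUVJ128 holds
// 0x8080 per 16-bit lane, added *before* the arithmetic shift.  A scalar
// model of that step (illustrative only, not built):
#if 0
static uint8 RoundedBiasedChroma(int dot) {
  // dot is the signed 8.8 fixed-point U or V dot product.  Adding 0x0080
  // rounds half up at the >> 8, and the 0x8000 part lands as the +128
  // chroma bias after the shift, so this equals round(dot / 256) + 128.
  return (uint8)((dot + 0x8080) >> 8);
}
#endif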

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked)
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax,  [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb     ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0 // U
    vextractf128 [edx + edi], ymm0, 1 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2
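
// Most 256-bit AVX2 pack/unpack/hadd instructions operate per 128-bit lane,
// so their results come out lane-interleaved ("mutates" in the comments
// above).  vpermq with immediate 0xd8 (quadword order 0,2,1,3) puts the
// lanes back in linear order.  A scalar model (illustrative only, not
// built):
#if 0
static void Vpermq0xd8(uint64 v[4]) {
  // imm8 0xd8 = 0b11011000 selects source quadwords 0, 2, 1, 3,
  // which is simply a swap of the two middle 64-bit elements.
  uint64 t = v[1];
  v[1] = v[2];
  v[2] = t;
}
#endif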

__declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm5, kAddUV128
    movdqa     xmm6, kARGBToV
    movdqa     xmm7, kARGBToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* convert to U and V */
    movdqu     xmm0, [eax]          // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0

    movdqu     xmm0, [eax]          // V
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax,  [eax + 64]
    movdqu     [edx + edi], xmm0
    lea        edx,  [edx + 16]
    sub        ecx,  16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked)
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm5, kAddUV128
    movdqa     xmm6, kARGBToV
    movdqa     xmm7, kARGBToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 16x1 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, kAddUV128
    movdqa     xmm6, kBGRAToV
    movdqa     xmm7, kBGRAToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, kAddUV128
    movdqa     xmm6, kABGRToV
    movdqa     xmm7, kABGRToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked)
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, kAddUV128
    movdqa     xmm6, kRGBAToV
    movdqa     xmm7, kRGBAToU
    sub        edi, edx             // stride from u to v

 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

// Read 16 UV from 444.
#define READYUV444_AVX2 __asm {                                                \
    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \
    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpermq     ymm1, ymm1, 0xd8                                          \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
  }

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 __asm {                                                \
    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
  }

// Read 4 UV from 411, upsample to 16 UV.
#define READYUV411_AVX2 __asm {                                                \
    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \
    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \
    __asm lea        esi,  [esi + 4]                                           \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
  }

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 __asm {                                                  \
    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
  }

// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \
    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \
    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \
    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \
    __asm vpsubw     ymm2, ymm3, ymm2                                          \
    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \
    __asm vpsubw     ymm1, ymm3, ymm1                                          \
    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \
    __asm vpsubw     ymm0, ymm3, ymm0                                          \
    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \
    __asm lea        eax, [eax + 16]                                           \
    __asm vpermq     ymm3, ymm3, 0xd8                                          \
    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \
    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
    __asm vpsraw     ymm0, ymm0, 6                                             \
    __asm vpsraw     ymm1, ymm1, 6                                             \
    __asm vpsraw     ymm2, ymm2, 6                                             \
    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
  }
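
// Worked example of the fixed-point math above, using the BT.601 constants
// defined at the top of this file (YG = 18997, YGB = -1160).  For white,
// Y = 235 and U = V = 128, the UV terms cancel against the kUVBias values,
// leaving only the Y path.  A scalar model (illustrative only, not built):
#if 0
static int YToGray(int y) {
  // vpunpcklbw ymm3,ymm3 duplicates Y into a 16-bit lane (y * 0x0101);
  // vpmulhuw keeps the high 16 bits of the product with YG.
  int y16 = (y * 0x0101 * 18997) >> 16;    // y = 235 -> y16 = 17507
  int v = (y16 - 1160) >> 6;               // + YGB, then vpsraw 6 -> 255
  return v < 0 ? 0 : (v > 255 ? 255 : v);  // vpackuswb saturation
}
#endif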

// Store 16 ARGB values.
#define STOREARGB_AVX2 __asm {                                                 \
    /* Step 3: Weave into ARGB */                                              \
    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
    __asm vmovdqu    0[edx], ymm1                                              \
    __asm vmovdqu    32[edx], ymm0                                             \
    __asm lea        edx,  [edx + 64]                                          \
  }

#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TOARGBROW_AVX2
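
// Row functions convert exactly one row per call; callers in libyuv's
// planar conversion layer walk the planes a row at a time.  A minimal
// caller sketch (illustrative only, not built; assumes width is a multiple
// of 16 -- any-width handling is done elsewhere in the library):
#if 0
static void I422ToARGBPlane(const uint8* y, int y_stride,
                            const uint8* u, const uint8* v, int uv_stride,
                            uint8* argb, int argb_stride,
                            int width, int height) {
  int row;
  for (row = 0; row < height; ++row) {
    I422ToARGBRow_AVX2(y, u, v, argb, width);
    y += y_stride;
    u += uv_stride;  // 422: chroma is subsampled horizontally only,
    v += uv_stride;  // so U and V advance every row.
    argb += argb_stride;
  }
}
#endif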

#ifdef HAS_J422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void J422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvJConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_J422TOARGBROW_AVX2

#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void I444ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I444TOARGBROW_AVX2

#ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void I411ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV411_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I411TOARGBROW_AVX2

#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void NV12ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2

#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // VU
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(kYvuConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2

#ifdef HAS_I422TOBGRAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
__declspec(naked)
void I422ToBGRARow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvConstants)

    // Step 3: Weave into BGRA
    vpunpcklbw ymm1, ymm1, ymm0           // GB
    vpermq     ymm1, ymm1, 0xd8
    vpunpcklbw ymm2, ymm5, ymm2           // AR
    vpermq     ymm2, ymm2, 0xd8
    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels
    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm2
    lea        edx,  [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TOBGRAROW_AVX2

#ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
__declspec(naked)
void I422ToRGBARow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvConstants)

    // Step 3: Weave into RGBA
    vpunpcklbw ymm1, ymm1, ymm2           // GR
    vpermq     ymm1, ymm1, 0xd8
    vpunpcklbw ymm2, ymm5, ymm0           // AB
    vpermq     ymm2, ymm2, 0xd8
    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels
    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx,  [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TORGBAROW_AVX2

#ifdef HAS_I422TOABGRROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
__declspec(naked)
void I422ToABGRRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvConstants)

    // Step 3: Weave into ABGR
    vpunpcklbw ymm1, ymm2, ymm1           // RG
    vpermq     ymm1, ymm1, 0xd8
    vpunpcklbw ymm2, ymm0, ymm5           // BA
    vpermq     ymm2, ymm2, 0xd8
    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels
    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx,  [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TOABGRROW_AVX2

#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

// Read 8 UV from 444.
#define READYUV444 __asm {                                                     \
    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  }

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm {                                                     \
    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
    __asm movd       xmm0, ebx                                                 \
    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
    __asm movd       xmm1, ebx                                                 \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
  }

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) __asm {                                         \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm movdqa     xmm3, xmm0                                                \
    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
    __asm psubw      xmm0, xmm1                                                \
    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
    __asm psubw      xmm1, xmm2                                                \
    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
    __asm psubw      xmm2, xmm3                                                \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm3                                                \
    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }

// Store 8 ARGB values.
#define STOREARGB __asm {                                                      \
    /* Step 3: Weave into ARGB */                                              \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
    __asm movdqu     0[edx], xmm0                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }

// Store 8 BGRA values.
#define STOREBGRA __asm {                                                      \
    /* Step 3: Weave into BGRA */                                              \
    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }

// Store 8 ABGR values.
#define STOREABGR __asm {                                                      \
    /* Step 3: Weave into ABGR */                                              \
    __asm punpcklbw  xmm2, xmm1           /* RG */                             \
    __asm punpcklbw  xmm0, xmm5           /* BA */                             \
    __asm movdqa     xmm1, xmm2                                                \
    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
    __asm movdqu     0[edx], xmm2                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }

// Store 8 RGBA values.
#define STORERGBA __asm {                                                      \
    /* Step 3: Weave into RGBA */                                              \
    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }

// Store 8 RGB24 values.
#define STORERGB24 __asm {                                                     \
    /* Step 3: Weave into RRGB */                                              \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
    /* Step 4: RRGB -> RGB24 */                                                \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
    __asm lea        edx,  [edx + 24]                                          \
  }

// Store 8 RAW values.
#define STORERAW __asm {                                                       \
    /* Step 3: Weave into RRGB */                                              \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
    /* Step 4: RRGB -> RAW */                                                  \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
    __asm lea        edx,  [edx + 24]                                          \
  }

// Store 8 RGB565 values.
#define STORERGB565 __asm {                                                    \
    /* Step 3: Weave into RRGB */                                              \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
    /* Step 4: RRGB -> RGB565 */                                               \
    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
    __asm movdqa     xmm2, xmm0    /* G */                                     \
    __asm pslld      xmm0, 8       /* R */                                     \
    __asm psrld      xmm3, 3       /* B */                                     \
    __asm psrld      xmm2, 5       /* G */                                     \
    __asm psrad      xmm0, 16      /* R */                                     \
    __asm pand       xmm3, xmm5    /* B */                                     \
    __asm pand       xmm2, xmm6    /* G */                                     \
    __asm pand       xmm0, xmm7    /* R */                                     \
    __asm por        xmm3, xmm2    /* BG */                                    \
    __asm por        xmm0, xmm3    /* BGR */                                   \
    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
    __asm movdqa     xmm2, xmm1    /* G */                                     \
    __asm pslld      xmm1, 8       /* R */                                     \
    __asm psrld      xmm3, 3       /* B */                                     \
    __asm psrld      xmm2, 5       /* G */                                     \
    __asm psrad      xmm1, 16      /* R */                                     \
    __asm pand       xmm3, xmm5    /* B */                                     \
    __asm pand       xmm2, xmm6    /* G */                                     \
    __asm pand       xmm1, xmm7    /* R */                                     \
    __asm por        xmm3, xmm2    /* BG */                                    \
    __asm por        xmm1, xmm3    /* BGR */                                   \
    __asm packssdw   xmm0, xmm1                                                \
    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
    __asm lea        edx, [edx + 16]                                           \
  }
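
// The RGB565 store keeps the top 5/6/5 bits of B/G/R; the shift-and-mask
// sequence above is the vector form of the usual scalar packing
// (illustrative only, not built):
#if 0
static uint16 PackRGB565(uint8 r, uint8 g, uint8 b) {
  // Matches the psrld/pslld + pand masks above: 5 bits B, 6 bits G, 5 bits R.
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
#endif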

// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked)
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READYUV444
    YUVTORGB(kYuvConstants)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked)
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgb24,
                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgb24
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    movdqa     xmm5, kShuffleMaskARGBToRGB24_0
    movdqa     xmm6, kShuffleMaskARGBToRGB24

 convertloop:
    READYUV422
    YUVTORGB(kYuvConstants)
    STORERGB24

    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
__declspec(naked)
void I422ToRAWRow_SSSE3(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_raw,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // raw
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    movdqa     xmm5, kShuffleMaskARGBToRAW_0
    movdqa     xmm6, kShuffleMaskARGBToRAW

 convertloop:
    READYUV422
    YUVTORGB(kYuvConstants)
    STORERAW

    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked)
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* rgb565_buf,
                           int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgb565
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
    psrld      xmm5, 27
    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
    psrld      xmm6, 26
    pslld      xmm6, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
    pslld      xmm7, 11

 convertloop:
    READYUV422
    YUVTORGB(kYuvConstants)
    STORERGB565

    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
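
// Note how the three RGB565 masks above are built without memory loads:
// pcmpeqb sets a register to all ones, then the dword shifts carve out
// each field:
//   0xffffffff >> 27        = 0x0000001f  (B)
//   (0xffffffff >> 26) << 5 = 0x000007e0  (G)
//   0xffffffff << 11        = 0xfffff800  (R)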
   2684 
   2685 // 8 pixels.
   2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2687 __declspec(naked)
   2688 void I422ToARGBRow_SSSE3(const uint8* y_buf,
   2689                          const uint8* u_buf,
   2690                          const uint8* v_buf,
   2691                          uint8* dst_argb,
   2692                          int width) {
   2693   __asm {
   2694     push       esi
   2695     push       edi
   2696     mov        eax, [esp + 8 + 4]   // Y
   2697     mov        esi, [esp + 8 + 8]   // U
   2698     mov        edi, [esp + 8 + 12]  // V
   2699     mov        edx, [esp + 8 + 16]  // argb
   2700     mov        ecx, [esp + 8 + 20]  // width
   2701     sub        edi, esi
   2702     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2703 
   2704  convertloop:
   2705     READYUV422
   2706     YUVTORGB(kYuvConstants)
   2707     STOREARGB
   2708 
   2709     sub        ecx, 8
   2710     jg         convertloop
   2711 
   2712     pop        edi
   2713     pop        esi
   2714     ret
   2715   }
   2716 }
   2717 
   2718 // 8 pixels.
   2719 // JPEG color space version of I422ToARGB.
   2720 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2721 __declspec(naked)
   2722 void J422ToARGBRow_SSSE3(const uint8* y_buf,
   2723                          const uint8* u_buf,
   2724                          const uint8* v_buf,
   2725                          uint8* dst_argb,
   2726                          int width) {
   2727   __asm {
   2728     push       esi
   2729     push       edi
   2730     mov        eax, [esp + 8 + 4]   // Y
   2731     mov        esi, [esp + 8 + 8]   // U
   2732     mov        edi, [esp + 8 + 12]  // V
   2733     mov        edx, [esp + 8 + 16]  // argb
   2734     mov        ecx, [esp + 8 + 20]  // width
   2735     sub        edi, esi
   2736     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2737 
   2738  convertloop:
   2739     READYUV422
   2740     YUVTORGB(kYuvJConstants)
   2741     STOREARGB
   2742 
   2743     sub        ecx, 8
   2744     jg         convertloop
   2745 
   2746     pop        edi
   2747     pop        esi
   2748     ret
   2749   }
   2750 }
   2751 
   2752 // 8 pixels.
   2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2754 // Similar to I420 but duplicates UV once more.
   2755 __declspec(naked)
   2756 void I411ToARGBRow_SSSE3(const uint8* y_buf,
   2757                          const uint8* u_buf,
   2758                          const uint8* v_buf,
   2759                          uint8* dst_argb,
   2760                          int width) {
   2761   __asm {
   2762     push       ebx
   2763     push       esi
   2764     push       edi
   2765     mov        eax, [esp + 12 + 4]   // Y
   2766     mov        esi, [esp + 12 + 8]   // U
   2767     mov        edi, [esp + 12 + 12]  // V
   2768     mov        edx, [esp + 12 + 16]  // argb
   2769     mov        ecx, [esp + 12 + 20]  // width
   2770     sub        edi, esi
   2771     pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
   2772 
   2773  convertloop:
   2774     READYUV411  // modifies EBX
   2775     YUVTORGB(kYuvConstants)
   2776     STOREARGB
   2777 
   2778     sub        ecx, 8
   2779     jg         convertloop
   2780 
   2781     pop        edi
   2782     pop        esi
   2783     pop        ebx
   2784     ret
   2785   }
   2786 }
   2787 
   2788 // 8 pixels.
   2789 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   2790 __declspec(naked)
   2791 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
   2792                          const uint8* uv_buf,
   2793                          uint8* dst_argb,
   2794                          int width) {
   2795   __asm {
   2796     push       esi
   2797     mov        eax, [esp + 4 + 4]   // Y
   2798     mov        esi, [esp + 4 + 8]   // UV
   2799     mov        edx, [esp + 4 + 12]  // argb
   2800     mov        ecx, [esp + 4 + 16]  // width
   2801     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2802 
   2803  convertloop:
   2804     READNV12
   2805     YUVTORGB(kYuvConstants)
   2806     STOREARGB
   2807 
   2808     sub        ecx, 8
   2809     jg         convertloop
   2810 
   2811     pop        esi
   2812     ret
   2813   }
   2814 }
   2815 
   2816 // 8 pixels.
   2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
   2818 __declspec(naked)
   2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
   2820                          const uint8* uv_buf,
   2821                          uint8* dst_argb,
   2822                          int width) {
   2823   __asm {
   2824     push       esi
   2825     mov        eax, [esp + 4 + 4]   // Y
   2826     mov        esi, [esp + 4 + 8]   // UV
   2827     mov        edx, [esp + 4 + 12]  // argb
   2828     mov        ecx, [esp + 4 + 16]  // width
   2829     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2830 
   2831  convertloop:
   2832     READNV12
   2833     YUVTORGB(kYvuConstants)
   2834     STOREARGB
   2835 
   2836     sub        ecx, 8
   2837     jg         convertloop
   2838 
   2839     pop        esi
   2840     ret
   2841   }
   2842 }
   2843 
   2844 __declspec(naked)
   2845 void I422ToBGRARow_SSSE3(const uint8* y_buf,
   2846                          const uint8* u_buf,
   2847                          const uint8* v_buf,
   2848                          uint8* dst_bgra,
   2849                          int width) {
   2850   __asm {
   2851     push       esi
   2852     push       edi
   2853     mov        eax, [esp + 8 + 4]   // Y
   2854     mov        esi, [esp + 8 + 8]   // U
   2855     mov        edi, [esp + 8 + 12]  // V
   2856     mov        edx, [esp + 8 + 16]  // bgra
   2857     mov        ecx, [esp + 8 + 20]  // width
   2858     sub        edi, esi
   2859 
   2860  convertloop:
   2861     READYUV422
   2862     YUVTORGB(kYuvConstants)
   2863     STOREBGRA
   2864 
   2865     sub        ecx, 8
   2866     jg         convertloop
   2867 
   2868     pop        edi
   2869     pop        esi
   2870     ret
   2871   }
   2872 }
   2873 
   2874 __declspec(naked)
   2875 void I422ToABGRRow_SSSE3(const uint8* y_buf,
   2876                          const uint8* u_buf,
   2877                          const uint8* v_buf,
   2878                          uint8* dst_abgr,
   2879                          int width) {
   2880   __asm {
   2881     push       esi
   2882     push       edi
   2883     mov        eax, [esp + 8 + 4]   // Y
   2884     mov        esi, [esp + 8 + 8]   // U
   2885     mov        edi, [esp + 8 + 12]  // V
   2886     mov        edx, [esp + 8 + 16]  // abgr
   2887     mov        ecx, [esp + 8 + 20]  // width
   2888     sub        edi, esi
   2889     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2890 
   2891  convertloop:
   2892     READYUV422
   2893     YUVTORGB(kYuvConstants)
   2894     STOREABGR
   2895 
   2896     sub        ecx, 8
   2897     jg         convertloop
   2898 
   2899     pop        edi
   2900     pop        esi
   2901     ret
   2902   }
   2903 }
   2904 
   2905 __declspec(naked)
   2906 void I422ToRGBARow_SSSE3(const uint8* y_buf,
   2907                          const uint8* u_buf,
   2908                          const uint8* v_buf,
   2909                          uint8* dst_rgba,
   2910                          int width) {
   2911   __asm {
   2912     push       esi
   2913     push       edi
   2914     mov        eax, [esp + 8 + 4]   // Y
   2915     mov        esi, [esp + 8 + 8]   // U
   2916     mov        edi, [esp + 8 + 12]  // V
   2917     mov        edx, [esp + 8 + 16]  // rgba
   2918     mov        ecx, [esp + 8 + 20]  // width
   2919     sub        edi, esi
   2920 
   2921  convertloop:
   2922     READYUV422
   2923     YUVTORGB(kYuvConstants)
   2924     STORERGBA
   2925 
   2926     sub        ecx, 8
   2927     jg         convertloop
   2928 
   2929     pop        edi
   2930     pop        esi
   2931     ret
   2932   }
   2933 }
   2934 
   2935 #endif  // HAS_I422TOARGBROW_SSSE3
   2936 
   2937 #ifdef HAS_I400TOARGBROW_SSE2
   2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
   2939 __declspec(naked)
   2940 void I400ToARGBRow_SSE2(const uint8* y_buf,
   2941                         uint8* rgb_buf,
   2942                         int width) {
   2943   __asm {
   2944     mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
   2945     movd       xmm2, eax
   2946     pshufd     xmm2, xmm2, 0
   2947     mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16 - 64 / 2)
   2948     movd       xmm3, eax
   2949     pshufd     xmm3, xmm3, 0
   2950     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
   2951     pslld      xmm4, 24
   2952 
   2953     mov        eax, [esp + 4]       // Y
   2954     mov        edx, [esp + 8]       // rgb
   2955     mov        ecx, [esp + 12]      // width
   2956 
   2957  convertloop:
   2958     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   2959     movq       xmm0, qword ptr [eax]
   2960     lea        eax, [eax + 8]
   2961     punpcklbw  xmm0, xmm0           // Y.Y
   2962     pmulhuw    xmm0, xmm2
   2963     psubusw    xmm0, xmm3
   2964     psrlw      xmm0, 6
   2965     packuswb   xmm0, xmm0           // G
   2966 
   2967     // Step 2: Weave into ARGB
   2968     punpcklbw  xmm0, xmm0           // GG
   2969     movdqa     xmm1, xmm0
   2970     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
   2971     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
   2972     por        xmm0, xmm4
   2973     por        xmm1, xmm4
   2974     movdqu     [edx], xmm0
   2975     movdqu     [edx + 16], xmm1
   2976     lea        edx,  [edx + 32]
   2977     sub        ecx, 8
   2978     jg         convertloop
   2979     ret
   2980   }
   2981 }
   2982 #endif  // HAS_I400TOARGBROW_SSE2
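
// A scalar sketch of the fixed-point math in the loop above, using the same
// constants (0x4a35 = 18997 and 0x0488 = 1160). I400ScaleYSketch is a
// hypothetical helper for illustration only, not part of libyuv.
static uint8 I400ScaleYSketch(uint8 y) {
  uint32 yy = y * 0x0101u;              // punpcklbw duplicates the byte: Y.Y
  uint32 g = (yy * 18997u) >> 16;       // pmulhuw with 0x4a35
  g = g > 1160u ? g - 1160u : 0u;       // psubusw with 0x0488
  g >>= 6;                              // psrlw 6
  return (uint8)(g > 255u ? 255u : g);  // packuswb saturates to 8 bits
}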
   2983 
   2984 #ifdef HAS_I400TOARGBROW_AVX2
   2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
   2986 // note: vpunpcklbw mutates and vpackuswb unmutates.
   2987 __declspec(naked)
   2988 void I400ToARGBRow_AVX2(const uint8* y_buf,
   2989                         uint8* rgb_buf,
   2990                         int width) {
   2991   __asm {
   2992     mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
   2993     vmovd      xmm2, eax
   2994     vbroadcastss ymm2, xmm2
   2995     mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16 - 64 / 2)
   2996     vmovd      xmm3, eax
   2997     vbroadcastss ymm3, xmm3
   2998     vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
   2999     vpslld     ymm4, ymm4, 24
   3000 
   3001     mov        eax, [esp + 4]       // Y
   3002     mov        edx, [esp + 8]       // rgb
   3003     mov        ecx, [esp + 12]      // width
   3004 
   3005  convertloop:
   3006     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
   3007     vmovdqu    xmm0, [eax]
   3008     lea        eax, [eax + 16]
   3009     vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
   3010     vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
   3011     vpmulhuw   ymm0, ymm0, ymm2
   3012     vpsubusw   ymm0, ymm0, ymm3
   3013     vpsrlw     ymm0, ymm0, 6
   3014     vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
   3015 
   3016     // TODO(fbarchard): Weave alpha with unpack.
   3017     // Step 2: Weave into ARGB
   3018     vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
   3019     vpermq     ymm1, ymm1, 0xd8
   3020     vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
   3021     vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
   3022     vpor       ymm0, ymm0, ymm4
   3023     vpor       ymm1, ymm1, ymm4
   3024     vmovdqu    [edx], ymm0
   3025     vmovdqu    [edx + 32], ymm1
   3026     lea        edx,  [edx + 64]
   3027     sub        ecx, 16
   3028     jg         convertloop
   3029     vzeroupper
   3030     ret
   3031   }
   3032 }
   3033 #endif  // HAS_I400TOARGBROW_AVX2
   3034 
   3035 #ifdef HAS_MIRRORROW_SSSE3
   3036 // Shuffle table for reversing the bytes.
   3037 static const uvec8 kShuffleMirror = {
   3038   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   3039 };
   3040 
   3041 // TODO(fbarchard): Replace lea with -16 offset.
   3042 __declspec(naked)
   3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   3044   __asm {
   3045     mov       eax, [esp + 4]   // src
   3046     mov       edx, [esp + 8]   // dst
   3047     mov       ecx, [esp + 12]  // width
   3048     movdqa    xmm5, kShuffleMirror
   3049 
   3050  convertloop:
   3051     movdqu    xmm0, [eax - 16 + ecx]
   3052     pshufb    xmm0, xmm5
   3053     movdqu    [edx], xmm0
   3054     lea       edx, [edx + 16]
   3055     sub       ecx, 16
   3056     jg        convertloop
   3057     ret
   3058   }
   3059 }
   3060 #endif  // HAS_MIRRORROW_SSSE3
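
// The pshufb above reverses 16 bytes per iteration, reading from the tail of
// the row. A scalar sketch of the same operation, for illustration only
// (MirrorRowSketch is hypothetical, not part of libyuv):
static void MirrorRowSketch(const uint8* src, uint8* dst, int width) {
  int x;
  src += width - 1;    // start at the last source byte
  for (x = 0; x < width; ++x) {
    dst[x] = src[-x];  // copy bytes in reverse order
  }
}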
   3061 
   3062 #ifdef HAS_MIRRORROW_AVX2
   3063 __declspec(naked)
   3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   3065   __asm {
   3066     mov       eax, [esp + 4]   // src
   3067     mov       edx, [esp + 8]   // dst
   3068     mov       ecx, [esp + 12]  // width
   3069     vbroadcastf128 ymm5, kShuffleMirror
   3070 
   3071  convertloop:
   3072     vmovdqu   ymm0, [eax - 32 + ecx]
   3073     vpshufb   ymm0, ymm0, ymm5
   3074     vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
   3075     vmovdqu   [edx], ymm0
   3076     lea       edx, [edx + 32]
   3077     sub       ecx, 32
   3078     jg        convertloop
   3079     vzeroupper
   3080     ret
   3081   }
   3082 }
   3083 #endif  // HAS_MIRRORROW_AVX2
   3084 
   3085 #ifdef HAS_MIRRORROW_SSE2
   3086 __declspec(naked)
   3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   3088   __asm {
   3089     mov       eax, [esp + 4]   // src
   3090     mov       edx, [esp + 8]   // dst
   3091     mov       ecx, [esp + 12]  // width
   3092 
   3093  convertloop:
   3094     movdqu    xmm0, [eax - 16 + ecx]
   3095     movdqa    xmm1, xmm0        // swap bytes
   3096     psllw     xmm0, 8
   3097     psrlw     xmm1, 8
   3098     por       xmm0, xmm1
   3099     pshuflw   xmm0, xmm0, 0x1b  // swap words
   3100     pshufhw   xmm0, xmm0, 0x1b
   3101     pshufd    xmm0, xmm0, 0x4e  // swap qwords
   3102     movdqu    [edx], xmm0
   3103     lea       edx, [edx + 16]
   3104     sub       ecx, 16
   3105     jg        convertloop
   3106     ret
   3107   }
   3108 }
   3109 #endif  // HAS_MIRRORROW_SSE2
   3110 
   3111 #ifdef HAS_MIRRORROW_UV_SSSE3
   3112 // Shuffle table for reversing the bytes of UV channels.
   3113 static const uvec8 kShuffleMirrorUV = {
   3114   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
   3115 };
   3116 
   3117 __declspec(naked)
   3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
   3119                        int width) {
   3120   __asm {
   3121     push      edi
   3122     mov       eax, [esp + 4 + 4]   // src
   3123     mov       edx, [esp + 4 + 8]   // dst_u
   3124     mov       edi, [esp + 4 + 12]  // dst_v
   3125     mov       ecx, [esp + 4 + 16]  // width
   3126     movdqa    xmm1, kShuffleMirrorUV
   3127     lea       eax, [eax + ecx * 2 - 16]
   3128     sub       edi, edx
   3129 
   3130  convertloop:
   3131     movdqu    xmm0, [eax]
   3132     lea       eax, [eax - 16]
   3133     pshufb    xmm0, xmm1
   3134     movlpd    qword ptr [edx], xmm0
   3135     movhpd    qword ptr [edx + edi], xmm0
   3136     lea       edx, [edx + 8]
   3137     sub       ecx, 8
   3138     jg        convertloop
   3139 
   3140     pop       edi
   3141     ret
   3142   }
   3143 }
   3144 #endif  // HAS_MIRRORROW_UV_SSSE3
   3145 
   3146 #ifdef HAS_ARGBMIRRORROW_SSE2
   3147 __declspec(naked)
   3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   3149   __asm {
   3150     mov       eax, [esp + 4]   // src
   3151     mov       edx, [esp + 8]   // dst
   3152     mov       ecx, [esp + 12]  // width
   3153     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
   3154 
   3155  convertloop:
   3156     movdqu    xmm0, [eax]
   3157     lea       eax, [eax - 16]
   3158     pshufd    xmm0, xmm0, 0x1b
   3159     movdqu    [edx], xmm0
   3160     lea       edx, [edx + 16]
   3161     sub       ecx, 4
   3162     jg        convertloop
   3163     ret
   3164   }
   3165 }
   3166 #endif  // HAS_ARGBMIRRORROW_SSE2
   3167 
   3168 #ifdef HAS_ARGBMIRRORROW_AVX2
   3169 // Shuffle table for reversing the dwords (one ARGB pixel each).
   3170 static const ulvec32 kARGBShuffleMirror_AVX2 = {
   3171   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   3172 };
   3173 
   3174 __declspec(naked)
   3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   3176   __asm {
   3177     mov       eax, [esp + 4]   // src
   3178     mov       edx, [esp + 8]   // dst
   3179     mov       ecx, [esp + 12]  // width
   3180     vmovdqu   ymm5, kARGBShuffleMirror_AVX2
   3181 
   3182  convertloop:
   3183     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
   3184     vmovdqu   [edx], ymm0
   3185     lea       edx, [edx + 32]
   3186     sub       ecx, 8
   3187     jg        convertloop
   3188     vzeroupper
   3189     ret
   3190   }
   3191 }
   3192 #endif  // HAS_ARGBMIRRORROW_AVX2
   3193 
   3194 #ifdef HAS_SPLITUVROW_SSE2
   3195 __declspec(naked)
   3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   3197   __asm {
   3198     push       edi
   3199     mov        eax, [esp + 4 + 4]    // src_uv
   3200     mov        edx, [esp + 4 + 8]    // dst_u
   3201     mov        edi, [esp + 4 + 12]   // dst_v
   3202     mov        ecx, [esp + 4 + 16]   // pix
   3203     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3204     psrlw      xmm5, 8
   3205     sub        edi, edx
   3206 
   3207   convertloop:
   3208     movdqu     xmm0, [eax]
   3209     movdqu     xmm1, [eax + 16]
   3210     lea        eax,  [eax + 32]
   3211     movdqa     xmm2, xmm0
   3212     movdqa     xmm3, xmm1
   3213     pand       xmm0, xmm5   // even bytes
   3214     pand       xmm1, xmm5
   3215     packuswb   xmm0, xmm1
   3216     psrlw      xmm2, 8      // odd bytes
   3217     psrlw      xmm3, 8
   3218     packuswb   xmm2, xmm3
   3219     movdqu     [edx], xmm0
   3220     movdqu     [edx + edi], xmm2
   3221     lea        edx, [edx + 16]
   3222     sub        ecx, 16
   3223     jg         convertloop
   3224 
   3225     pop        edi
   3226     ret
   3227   }
   3228 }
   3229 
   3230 #endif  // HAS_SPLITUVROW_SSE2
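
// SplitUVRow deinterleaves a packed UV plane: pand with 0x00ff00ff keeps the
// even (U) bytes and psrlw 8 keeps the odd (V) bytes before packing. A scalar
// sketch, for illustration only (SplitUVRowSketch is hypothetical):
static void SplitUVRowSketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[x * 2];      // even bytes are U
    dst_v[x] = src_uv[x * 2 + 1];  // odd bytes are V
  }
}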
   3231 
   3232 #ifdef HAS_SPLITUVROW_AVX2
   3233 __declspec(naked)
   3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   3235   __asm {
   3236     push       edi
   3237     mov        eax, [esp + 4 + 4]    // src_uv
   3238     mov        edx, [esp + 4 + 8]    // dst_u
   3239     mov        edi, [esp + 4 + 12]   // dst_v
   3240     mov        ecx, [esp + 4 + 16]   // pix
   3241     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3242     vpsrlw     ymm5, ymm5, 8
   3243     sub        edi, edx
   3244 
   3245   convertloop:
   3246     vmovdqu    ymm0, [eax]
   3247     vmovdqu    ymm1, [eax + 32]
   3248     lea        eax,  [eax + 64]
   3249     vpsrlw     ymm2, ymm0, 8      // odd bytes
   3250     vpsrlw     ymm3, ymm1, 8
   3251     vpand      ymm0, ymm0, ymm5   // even bytes
   3252     vpand      ymm1, ymm1, ymm5
   3253     vpackuswb  ymm0, ymm0, ymm1
   3254     vpackuswb  ymm2, ymm2, ymm3
   3255     vpermq     ymm0, ymm0, 0xd8
   3256     vpermq     ymm2, ymm2, 0xd8
   3257     vmovdqu    [edx], ymm0
   3258     vmovdqu    [edx + edi], ymm2
   3259     lea        edx, [edx + 32]
   3260     sub        ecx, 32
   3261     jg         convertloop
   3262 
   3263     pop        edi
   3264     vzeroupper
   3265     ret
   3266   }
   3267 }
   3268 #endif  // HAS_SPLITUVROW_AVX2
   3269 
   3270 #ifdef HAS_MERGEUVROW_SSE2
   3271 __declspec(naked)
   3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   3273                      int width) {
   3274   __asm {
   3275     push       edi
   3276     mov        eax, [esp + 4 + 4]    // src_u
   3277     mov        edx, [esp + 4 + 8]    // src_v
   3278     mov        edi, [esp + 4 + 12]   // dst_uv
   3279     mov        ecx, [esp + 4 + 16]   // width
   3280     sub        edx, eax
   3281 
   3282   convertloop:
   3283     movdqu     xmm0, [eax]      // read 16 U's
   3284     movdqu     xmm1, [eax + edx]  // and 16 V's
   3285     lea        eax,  [eax + 16]
   3286     movdqa     xmm2, xmm0
   3287     punpcklbw  xmm0, xmm1       // first 8 UV pairs
   3288     punpckhbw  xmm2, xmm1       // next 8 UV pairs
   3289     movdqu     [edi], xmm0
   3290     movdqu     [edi + 16], xmm2
   3291     lea        edi, [edi + 32]
   3292     sub        ecx, 16
   3293     jg         convertloop
   3294 
   3295     pop        edi
   3296     ret
   3297   }
   3298 }
   3299 #endif  //  HAS_MERGEUVROW_SSE2
   3300 
   3301 #ifdef HAS_MERGEUVROW_AVX2
   3302 __declspec(naked)
   3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   3304                      int width) {
   3305   __asm {
   3306     push       edi
   3307     mov        eax, [esp + 4 + 4]    // src_u
   3308     mov        edx, [esp + 4 + 8]    // src_v
   3309     mov        edi, [esp + 4 + 12]   // dst_uv
   3310     mov        ecx, [esp + 4 + 16]   // width
   3311     sub        edx, eax
   3312 
   3313   convertloop:
   3314     vmovdqu    ymm0, [eax]           // read 32 U's
   3315     vmovdqu    ymm1, [eax + edx]     // and 32 V's
   3316     lea        eax,  [eax + 32]
   3317     vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
   3318     vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
   3319     vextractf128 [edi], ymm2, 0       // bytes 0..15
   3320     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
   3321     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
   3322     vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
   3323     lea        edi, [edi + 64]
   3324     sub        ecx, 32
   3325     jg         convertloop
   3326 
   3327     pop        edi
   3328     vzeroupper
   3329     ret
   3330   }
   3331 }
   3332 #endif  //  HAS_MERGEUVROW_AVX2
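
// MergeUVRow is the inverse of SplitUVRow: punpcklbw/punpckhbw interleave the
// two planes byte by byte. A scalar sketch, for illustration only
// (MergeUVRowSketch is hypothetical):
static void MergeUVRowSketch(const uint8* src_u, const uint8* src_v,
                             uint8* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x * 2] = src_u[x];      // U goes to the even bytes
    dst_uv[x * 2 + 1] = src_v[x];  // V goes to the odd bytes
  }
}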
   3333 
   3334 #ifdef HAS_COPYROW_SSE2
   3335 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
   3336 __declspec(naked)
   3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   3338   __asm {
   3339     mov        eax, [esp + 4]   // src
   3340     mov        edx, [esp + 8]   // dst
   3341     mov        ecx, [esp + 12]  // count
   3342 
   3343   convertloop:
   3344     movdqu     xmm0, [eax]
   3345     movdqu     xmm1, [eax + 16]
   3346     lea        eax, [eax + 32]
   3347     movdqu     [edx], xmm0
   3348     movdqu     [edx + 16], xmm1
   3349     lea        edx, [edx + 32]
   3350     sub        ecx, 32
   3351     jg         convertloop
   3352     ret
   3353   }
   3354 }
   3355 #endif  // HAS_COPYROW_SSE2
   3356 
   3357 #ifdef HAS_COPYROW_AVX
   3358 // CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
   3359 __declspec(naked)
   3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
   3361   __asm {
   3362     mov        eax, [esp + 4]   // src
   3363     mov        edx, [esp + 8]   // dst
   3364     mov        ecx, [esp + 12]  // count
   3365 
   3366   convertloop:
   3367     vmovdqu    ymm0, [eax]
   3368     vmovdqu    ymm1, [eax + 32]
   3369     lea        eax, [eax + 64]
   3370     vmovdqu    [edx], ymm0
   3371     vmovdqu    [edx + 32], ymm1
   3372     lea        edx, [edx + 64]
   3373     sub        ecx, 64
   3374     jg         convertloop
   3375 
   3376     vzeroupper
   3377     ret
   3378   }
   3379 }
   3380 #endif  // HAS_COPYROW_AVX
   3381 
   3382 // CopyRow copies 'count' bytes with rep movsb; count may be any multiple of 1.
   3383 __declspec(naked)
   3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   3385   __asm {
   3386     mov        eax, esi
   3387     mov        edx, edi
   3388     mov        esi, [esp + 4]   // src
   3389     mov        edi, [esp + 8]   // dst
   3390     mov        ecx, [esp + 12]  // count
   3391     rep movsb
   3392     mov        edi, edx
   3393     mov        esi, eax
   3394     ret
   3395   }
   3396 }
   3397 
   3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
   3399 // width in pixels
   3400 __declspec(naked)
   3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   3402   __asm {
   3403     mov        eax, [esp + 4]   // src
   3404     mov        edx, [esp + 8]   // dst
   3405     mov        ecx, [esp + 12]  // width
   3406     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
   3407     pslld      xmm0, 24
   3408     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
   3409     psrld      xmm1, 8
   3410 
   3411   convertloop:
   3412     movdqu     xmm2, [eax]
   3413     movdqu     xmm3, [eax + 16]
   3414     lea        eax, [eax + 32]
   3415     movdqu     xmm4, [edx]
   3416     movdqu     xmm5, [edx + 16]
   3417     pand       xmm2, xmm0
   3418     pand       xmm3, xmm0
   3419     pand       xmm4, xmm1
   3420     pand       xmm5, xmm1
   3421     por        xmm2, xmm4
   3422     por        xmm3, xmm5
   3423     movdqu     [edx], xmm2
   3424     movdqu     [edx + 16], xmm3
   3425     lea        edx, [edx + 32]
   3426     sub        ecx, 8
   3427     jg         convertloop
   3428 
   3429     ret
   3430   }
   3431 }
   3432 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
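
// Per pixel the loop above computes dst = (src & 0xff000000) | (dst &
// 0x00ffffff), i.e. only the alpha byte is copied. A scalar sketch, for
// illustration only (ARGBCopyAlphaSketch is hypothetical):
static void ARGBCopyAlphaSketch(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // byte 3 of a BGRA pixel is alpha
  }
}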
   3433 
   3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
   3435 // width in pixels
   3436 __declspec(naked)
   3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   3438   __asm {
   3439     mov        eax, [esp + 4]   // src
   3440     mov        edx, [esp + 8]   // dst
   3441     mov        ecx, [esp + 12]  // width
   3442     vpcmpeqb   ymm0, ymm0, ymm0
   3443     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
   3444 
   3445   convertloop:
   3446     vmovdqu    ymm1, [eax]
   3447     vmovdqu    ymm2, [eax + 32]
   3448     lea        eax, [eax + 64]
   3449     vpblendvb  ymm1, ymm1, [edx], ymm0
   3450     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3451     vmovdqu    [edx], ymm1
   3452     vmovdqu    [edx + 32], ymm2
   3453     lea        edx, [edx + 64]
   3454     sub        ecx, 16
   3455     jg         convertloop
   3456 
   3457     vzeroupper
   3458     ret
   3459   }
   3460 }
   3461 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
   3462 
   3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
   3464 // width in pixels
   3465 __declspec(naked)
   3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   3467   __asm {
   3468     mov        eax, [esp + 4]   // src
   3469     mov        edx, [esp + 8]   // dst
   3470     mov        ecx, [esp + 12]  // width
   3471     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
   3472     pslld      xmm0, 24
   3473     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
   3474     psrld      xmm1, 8
   3475 
   3476   convertloop:
   3477     movq       xmm2, qword ptr [eax]  // 8 Y's
   3478     lea        eax, [eax + 8]
   3479     punpcklbw  xmm2, xmm2       // 8 Y's duplicated into 8 words
   3480     punpckhwd  xmm3, xmm2       // high 4 Y's to dword tops; rest masked below
   3481     punpcklwd  xmm2, xmm2       // low 4 Y's to dword tops
   3482     movdqu     xmm4, [edx]
   3483     movdqu     xmm5, [edx + 16]
   3484     pand       xmm2, xmm0
   3485     pand       xmm3, xmm0
   3486     pand       xmm4, xmm1
   3487     pand       xmm5, xmm1
   3488     por        xmm2, xmm4
   3489     por        xmm3, xmm5
   3490     movdqu     [edx], xmm2
   3491     movdqu     [edx + 16], xmm3
   3492     lea        edx, [edx + 32]
   3493     sub        ecx, 8
   3494     jg         convertloop
   3495 
   3496     ret
   3497   }
   3498 }
   3499 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
   3500 
   3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
   3502 // width in pixels
   3503 __declspec(naked)
   3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   3505   __asm {
   3506     mov        eax, [esp + 4]   // src
   3507     mov        edx, [esp + 8]   // dst
   3508     mov        ecx, [esp + 12]  // width
   3509     vpcmpeqb   ymm0, ymm0, ymm0
   3510     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
   3511 
   3512   convertloop:
   3513     vpmovzxbd  ymm1, qword ptr [eax]
   3514     vpmovzxbd  ymm2, qword ptr [eax + 8]
   3515     lea        eax, [eax + 16]
   3516     vpslld     ymm1, ymm1, 24
   3517     vpslld     ymm2, ymm2, 24
   3518     vpblendvb  ymm1, ymm1, [edx], ymm0
   3519     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
   3520     vmovdqu    [edx], ymm1
   3521     vmovdqu    [edx + 32], ymm2
   3522     lea        edx, [edx + 64]
   3523     sub        ecx, 16
   3524     jg         convertloop
   3525 
   3526     vzeroupper
   3527     ret
   3528   }
   3529 }
   3530 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
   3531 
   3532 #ifdef HAS_SETROW_X86
   3533 // Write 'count' bytes using an 8 bit value repeated.
   3534 // Count should be multiple of 4.
   3535 __declspec(naked)
   3536 void SetRow_X86(uint8* dst, uint8 v8, int count) {
   3537   __asm {
   3538     movzx      eax, byte ptr [esp + 8]    // v8
   3539     mov        edx, 0x01010101  // Duplicate byte to all bytes.
   3540     mul        edx              // overwrites edx with upper part of result.
   3541     mov        edx, edi
   3542     mov        edi, [esp + 4]   // dst
   3543     mov        ecx, [esp + 12]  // count
   3544     shr        ecx, 2
   3545     rep stosd
   3546     mov        edi, edx
   3547     ret
   3548   }
   3549 }
   3550 
   3551 // Write 'count' bytes using an 8 bit value repeated.
   3552 __declspec(naked)
   3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
   3554   __asm {
   3555     mov        edx, edi
   3556     mov        edi, [esp + 4]   // dst
   3557     mov        eax, [esp + 8]   // v8
   3558     mov        ecx, [esp + 12]  // count
   3559     rep stosb
   3560     mov        edi, edx
   3561     ret
   3562   }
   3563 }
   3564 
   3565 // Write 'count' 32 bit values.
   3566 __declspec(naked)
   3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
   3568   __asm {
   3569     mov        edx, edi
   3570     mov        edi, [esp + 4]   // dst
   3571     mov        eax, [esp + 8]   // v32
   3572     mov        ecx, [esp + 12]  // count
   3573     rep stosd
   3574     mov        edi, edx
   3575     ret
   3576   }
   3577 }
   3578 #endif  // HAS_SETROW_X86
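
// SetRow_X86 splats the byte with a multiply: v8 * 0x01010101 copies it into
// all four bytes of eax before rep stosd. A sketch of that step in C, for
// illustration only (SplatByteSketch is hypothetical):
static uint32 SplatByteSketch(uint8 v8) {
  return (uint32)v8 * 0x01010101u;  // e.g. 0x5a becomes 0x5a5a5a5a
}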
   3579 
   3580 #ifdef HAS_YUY2TOYROW_AVX2
   3581 __declspec(naked)
   3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
   3583                      uint8* dst_y, int pix) {
   3584   __asm {
   3585     mov        eax, [esp + 4]    // src_yuy2
   3586     mov        edx, [esp + 8]    // dst_y
   3587     mov        ecx, [esp + 12]   // pix
   3588     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
   3589     vpsrlw     ymm5, ymm5, 8
   3590 
   3591   convertloop:
   3592     vmovdqu    ymm0, [eax]
   3593     vmovdqu    ymm1, [eax + 32]
   3594     lea        eax,  [eax + 64]
   3595     vpand      ymm0, ymm0, ymm5   // even bytes are Y
   3596     vpand      ymm1, ymm1, ymm5
   3597     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3598     vpermq     ymm0, ymm0, 0xd8
   3599     vmovdqu    [edx], ymm0
   3600     lea        edx, [edx + 32]
   3601     sub        ecx, 32
   3602     jg         convertloop
   3603     vzeroupper
   3604     ret
   3605   }
   3606 }
   3607 
   3608 __declspec(naked)
   3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
   3610                       uint8* dst_u, uint8* dst_v, int pix) {
   3611   __asm {
   3612     push       esi
   3613     push       edi
   3614     mov        eax, [esp + 8 + 4]    // src_yuy2
   3615     mov        esi, [esp + 8 + 8]    // stride_yuy2
   3616     mov        edx, [esp + 8 + 12]   // dst_u
   3617     mov        edi, [esp + 8 + 16]   // dst_v
   3618     mov        ecx, [esp + 8 + 20]   // pix
   3619     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3620     vpsrlw     ymm5, ymm5, 8
   3621     sub        edi, edx
   3622 
   3623   convertloop:
   3624     vmovdqu    ymm0, [eax]
   3625     vmovdqu    ymm1, [eax + 32]
   3626     vpavgb     ymm0, ymm0, [eax + esi]
   3627     vpavgb     ymm1, ymm1, [eax + esi + 32]
   3628     lea        eax,  [eax + 64]
   3629     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
   3630     vpsrlw     ymm1, ymm1, 8
   3631     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3632     vpermq     ymm0, ymm0, 0xd8
   3633     vpand      ymm1, ymm0, ymm5  // U
   3634     vpsrlw     ymm0, ymm0, 8     // V
   3635     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3636     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3637     vpermq     ymm1, ymm1, 0xd8
   3638     vpermq     ymm0, ymm0, 0xd8
   3639     vextractf128 [edx], ymm1, 0  // U
   3640     vextractf128 [edx + edi], ymm0, 0 // V
   3641     lea        edx, [edx + 16]
   3642     sub        ecx, 32
   3643     jg         convertloop
   3644 
   3645     pop        edi
   3646     pop        esi
   3647     vzeroupper
   3648     ret
   3649   }
   3650 }
   3651 
   3652 __declspec(naked)
   3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
   3654                          uint8* dst_u, uint8* dst_v, int pix) {
   3655   __asm {
   3656     push       edi
   3657     mov        eax, [esp + 4 + 4]    // src_yuy2
   3658     mov        edx, [esp + 4 + 8]    // dst_u
   3659     mov        edi, [esp + 4 + 12]   // dst_v
   3660     mov        ecx, [esp + 4 + 16]   // pix
   3661     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3662     vpsrlw     ymm5, ymm5, 8
   3663     sub        edi, edx
   3664 
   3665   convertloop:
   3666     vmovdqu    ymm0, [eax]
   3667     vmovdqu    ymm1, [eax + 32]
   3668     lea        eax,  [eax + 64]
   3669     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
   3670     vpsrlw     ymm1, ymm1, 8
   3671     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3672     vpermq     ymm0, ymm0, 0xd8
   3673     vpand      ymm1, ymm0, ymm5  // U
   3674     vpsrlw     ymm0, ymm0, 8     // V
   3675     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3676     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3677     vpermq     ymm1, ymm1, 0xd8
   3678     vpermq     ymm0, ymm0, 0xd8
   3679     vextractf128 [edx], ymm1, 0  // U
   3680     vextractf128 [edx + edi], ymm0, 0 // V
   3681     lea        edx, [edx + 16]
   3682     sub        ecx, 32
   3683     jg         convertloop
   3684 
   3685     pop        edi
   3686     vzeroupper
   3687     ret
   3688   }
   3689 }
   3690 
   3691 __declspec(naked)
   3692 void UYVYToYRow_AVX2(const uint8* src_uyvy,
   3693                      uint8* dst_y, int pix) {
   3694   __asm {
   3695     mov        eax, [esp + 4]    // src_uyvy
   3696     mov        edx, [esp + 8]    // dst_y
   3697     mov        ecx, [esp + 12]   // pix
   3698 
   3699   convertloop:
   3700     vmovdqu    ymm0, [eax]
   3701     vmovdqu    ymm1, [eax + 32]
   3702     lea        eax,  [eax + 64]
   3703     vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
   3704     vpsrlw     ymm1, ymm1, 8
   3705     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3706     vpermq     ymm0, ymm0, 0xd8
   3707     vmovdqu    [edx], ymm0
   3708     lea        edx, [edx + 32]
   3709     sub        ecx, 32
   3710     jg         convertloop
   3711     vzeroupper
   3712     ret
   3713   }
   3714 }
   3715 
   3716 __declspec(naked)
   3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
   3718                       uint8* dst_u, uint8* dst_v, int pix) {
   3719   __asm {
   3720     push       esi
   3721     push       edi
   3722     mov        eax, [esp + 8 + 4]    // src_uyvy
   3723     mov        esi, [esp + 8 + 8]    // stride_uyvy
   3724     mov        edx, [esp + 8 + 12]   // dst_u
   3725     mov        edi, [esp + 8 + 16]   // dst_v
   3726     mov        ecx, [esp + 8 + 20]   // pix
   3727     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3728     vpsrlw     ymm5, ymm5, 8
   3729     sub        edi, edx
   3730 
   3731   convertloop:
   3732     vmovdqu    ymm0, [eax]
   3733     vmovdqu    ymm1, [eax + 32]
   3734     vpavgb     ymm0, ymm0, [eax + esi]
   3735     vpavgb     ymm1, ymm1, [eax + esi + 32]
   3736     lea        eax,  [eax + 64]
   3737     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
   3738     vpand      ymm1, ymm1, ymm5
   3739     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3740     vpermq     ymm0, ymm0, 0xd8
   3741     vpand      ymm1, ymm0, ymm5  // U
   3742     vpsrlw     ymm0, ymm0, 8     // V
   3743     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3744     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3745     vpermq     ymm1, ymm1, 0xd8
   3746     vpermq     ymm0, ymm0, 0xd8
   3747     vextractf128 [edx], ymm1, 0  // U
   3748     vextractf128 [edx + edi], ymm0, 0 // V
   3749     lea        edx, [edx + 16]
   3750     sub        ecx, 32
   3751     jg         convertloop
   3752 
   3753     pop        edi
   3754     pop        esi
   3755     vzeroupper
   3756     ret
   3757   }
   3758 }
   3759 
   3760 __declspec(naked)
   3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
   3762                          uint8* dst_u, uint8* dst_v, int pix) {
   3763   __asm {
   3764     push       edi
   3765     mov        eax, [esp + 4 + 4]    // src_uyvy
   3766     mov        edx, [esp + 4 + 8]    // dst_u
   3767     mov        edi, [esp + 4 + 12]   // dst_v
   3768     mov        ecx, [esp + 4 + 16]   // pix
   3769     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
   3770     vpsrlw     ymm5, ymm5, 8
   3771     sub        edi, edx
   3772 
   3773   convertloop:
   3774     vmovdqu    ymm0, [eax]
   3775     vmovdqu    ymm1, [eax + 32]
   3776     lea        eax,  [eax + 64]
   3777     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
   3778     vpand      ymm1, ymm1, ymm5
   3779     vpackuswb  ymm0, ymm0, ymm1   // mutates.
   3780     vpermq     ymm0, ymm0, 0xd8
   3781     vpand      ymm1, ymm0, ymm5  // U
   3782     vpsrlw     ymm0, ymm0, 8     // V
   3783     vpackuswb  ymm1, ymm1, ymm1  // mutates.
   3784     vpackuswb  ymm0, ymm0, ymm0  // mutates.
   3785     vpermq     ymm1, ymm1, 0xd8
   3786     vpermq     ymm0, ymm0, 0xd8
   3787     vextractf128 [edx], ymm1, 0  // U
   3788     vextractf128 [edx + edi], ymm0, 0 // V
   3789     lea        edx, [edx + 16]
   3790     sub        ecx, 32
   3791     jg         convertloop
   3792 
   3793     pop        edi
   3794     vzeroupper
   3795     ret
   3796   }
   3797 }
   3798 #endif  // HAS_YUY2TOYROW_AVX2
   3799 
   3800 #ifdef HAS_YUY2TOYROW_SSE2
   3801 __declspec(naked)
   3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   3803                      uint8* dst_y, int pix) {
   3804   __asm {
   3805     mov        eax, [esp + 4]    // src_yuy2
   3806     mov        edx, [esp + 8]    // dst_y
   3807     mov        ecx, [esp + 12]   // pix
   3808     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
   3809     psrlw      xmm5, 8
   3810 
   3811   convertloop:
   3812     movdqu     xmm0, [eax]
   3813     movdqu     xmm1, [eax + 16]
   3814     lea        eax,  [eax + 32]
   3815     pand       xmm0, xmm5   // even bytes are Y
   3816     pand       xmm1, xmm5
   3817     packuswb   xmm0, xmm1
   3818     movdqu     [edx], xmm0
   3819     lea        edx, [edx + 16]
   3820     sub        ecx, 16
   3821     jg         convertloop
   3822     ret
   3823   }
   3824 }
   3825 
   3826 __declspec(naked)
   3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   3828                       uint8* dst_u, uint8* dst_v, int pix) {
   3829   __asm {
   3830     push       esi
   3831     push       edi
   3832     mov        eax, [esp + 8 + 4]    // src_yuy2
   3833     mov        esi, [esp + 8 + 8]    // stride_yuy2
   3834     mov        edx, [esp + 8 + 12]   // dst_u
   3835     mov        edi, [esp + 8 + 16]   // dst_v
   3836     mov        ecx, [esp + 8 + 20]   // pix
   3837     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3838     psrlw      xmm5, 8
   3839     sub        edi, edx
   3840 
   3841   convertloop:
   3842     movdqu     xmm0, [eax]
   3843     movdqu     xmm1, [eax + 16]
   3844     movdqu     xmm2, [eax + esi]
   3845     movdqu     xmm3, [eax + esi + 16]
   3846     lea        eax,  [eax + 32]
   3847     pavgb      xmm0, xmm2
   3848     pavgb      xmm1, xmm3
   3849     psrlw      xmm0, 8      // YUYV -> UVUV
   3850     psrlw      xmm1, 8
   3851     packuswb   xmm0, xmm1
   3852     movdqa     xmm1, xmm0
   3853     pand       xmm0, xmm5  // U
   3854     packuswb   xmm0, xmm0
   3855     psrlw      xmm1, 8     // V
   3856     packuswb   xmm1, xmm1
   3857     movq       qword ptr [edx], xmm0
   3858     movq       qword ptr [edx + edi], xmm1
   3859     lea        edx, [edx + 8]
   3860     sub        ecx, 16
   3861     jg         convertloop
   3862 
   3863     pop        edi
   3864     pop        esi
   3865     ret
   3866   }
   3867 }
   3868 
   3869 __declspec(naked)
   3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   3871                          uint8* dst_u, uint8* dst_v, int pix) {
   3872   __asm {
   3873     push       edi
   3874     mov        eax, [esp + 4 + 4]    // src_yuy2
   3875     mov        edx, [esp + 4 + 8]    // dst_u
   3876     mov        edi, [esp + 4 + 12]   // dst_v
   3877     mov        ecx, [esp + 4 + 16]   // pix
   3878     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3879     psrlw      xmm5, 8
   3880     sub        edi, edx
   3881 
   3882   convertloop:
   3883     movdqu     xmm0, [eax]
   3884     movdqu     xmm1, [eax + 16]
   3885     lea        eax,  [eax + 32]
   3886     psrlw      xmm0, 8      // YUYV -> UVUV
   3887     psrlw      xmm1, 8
   3888     packuswb   xmm0, xmm1
   3889     movdqa     xmm1, xmm0
   3890     pand       xmm0, xmm5  // U
   3891     packuswb   xmm0, xmm0
   3892     psrlw      xmm1, 8     // V
   3893     packuswb   xmm1, xmm1
   3894     movq       qword ptr [edx], xmm0
   3895     movq       qword ptr [edx + edi], xmm1
   3896     lea        edx, [edx + 8]
   3897     sub        ecx, 16
   3898     jg         convertloop
   3899 
   3900     pop        edi
   3901     ret
   3902   }
   3903 }
   3904 
   3905 __declspec(naked)
   3906 void UYVYToYRow_SSE2(const uint8* src_uyvy,
   3907                      uint8* dst_y, int pix) {
   3908   __asm {
   3909     mov        eax, [esp + 4]    // src_uyvy
   3910     mov        edx, [esp + 8]    // dst_y
   3911     mov        ecx, [esp + 12]   // pix
   3912 
   3913   convertloop:
   3914     movdqu     xmm0, [eax]
   3915     movdqu     xmm1, [eax + 16]
   3916     lea        eax,  [eax + 32]
   3917     psrlw      xmm0, 8    // odd bytes are Y
   3918     psrlw      xmm1, 8
   3919     packuswb   xmm0, xmm1
   3920     movdqu     [edx], xmm0
   3921     lea        edx, [edx + 16]
   3922     sub        ecx, 16
   3923     jg         convertloop
   3924     ret
   3925   }
   3926 }
   3927 
   3928 __declspec(naked)
   3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   3930                       uint8* dst_u, uint8* dst_v, int pix) {
   3931   __asm {
   3932     push       esi
   3933     push       edi
   3934     mov        eax, [esp + 8 + 4]    // src_uyvy
   3935     mov        esi, [esp + 8 + 8]    // stride_uyvy
   3936     mov        edx, [esp + 8 + 12]   // dst_u
   3937     mov        edi, [esp + 8 + 16]   // dst_v
   3938     mov        ecx, [esp + 8 + 20]   // pix
   3939     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3940     psrlw      xmm5, 8
   3941     sub        edi, edx
   3942 
   3943   convertloop:
   3944     movdqu     xmm0, [eax]
   3945     movdqu     xmm1, [eax + 16]
   3946     movdqu     xmm2, [eax + esi]
   3947     movdqu     xmm3, [eax + esi + 16]
   3948     lea        eax,  [eax + 32]
   3949     pavgb      xmm0, xmm2
   3950     pavgb      xmm1, xmm3
   3951     pand       xmm0, xmm5   // UYVY -> UVUV
   3952     pand       xmm1, xmm5
   3953     packuswb   xmm0, xmm1
   3954     movdqa     xmm1, xmm0
   3955     pand       xmm0, xmm5  // U
   3956     packuswb   xmm0, xmm0
   3957     psrlw      xmm1, 8     // V
   3958     packuswb   xmm1, xmm1
   3959     movq       qword ptr [edx], xmm0
   3960     movq       qword ptr [edx + edi], xmm1
   3961     lea        edx, [edx + 8]
   3962     sub        ecx, 16
   3963     jg         convertloop
   3964 
   3965     pop        edi
   3966     pop        esi
   3967     ret
   3968   }
   3969 }
   3970 
   3971 __declspec(naked)
   3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
   3973                          uint8* dst_u, uint8* dst_v, int pix) {
   3974   __asm {
   3975     push       edi
   3976     mov        eax, [esp + 4 + 4]    // src_uyvy
   3977     mov        edx, [esp + 4 + 8]    // dst_u
   3978     mov        edi, [esp + 4 + 12]   // dst_v
   3979     mov        ecx, [esp + 4 + 16]   // pix
   3980     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   3981     psrlw      xmm5, 8
   3982     sub        edi, edx
   3983 
   3984   convertloop:
   3985     movdqu     xmm0, [eax]
   3986     movdqu     xmm1, [eax + 16]
   3987     lea        eax,  [eax + 32]
   3988     pand       xmm0, xmm5   // UYVY -> UVUV
   3989     pand       xmm1, xmm5
   3990     packuswb   xmm0, xmm1
   3991     movdqa     xmm1, xmm0
   3992     pand       xmm0, xmm5  // U
   3993     packuswb   xmm0, xmm0
   3994     psrlw      xmm1, 8     // V
   3995     packuswb   xmm1, xmm1
   3996     movq       qword ptr [edx], xmm0
   3997     movq       qword ptr [edx + edi], xmm1
   3998     lea        edx, [edx + 8]
   3999     sub        ecx, 16
   4000     jg         convertloop
   4001 
   4002     pop        edi
   4003     ret
   4004   }
   4005 }
   4006 #endif  // HAS_YUY2TOYROW_SSE2
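
// The YUY2 and UYVY routines above differ only in where Y sits in the packed
// word: YUY2 is Y0 U Y1 V (Y in even bytes, kept with pand 0x00ff00ff) and
// UYVY is U Y0 V Y1 (Y in odd bytes, kept with psrlw 8). A scalar sketch of
// the YUY2 case, for illustration only (YUY2ToYSketch is hypothetical):
static void YUY2ToYSketch(const uint8* src_yuy2, uint8* dst_y, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[x * 2];  // even bytes are Y in YUY2
  }
}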
   4007 
   4008 #ifdef HAS_ARGBBLENDROW_SSE2
   4009 // Blend 4 pixels at a time.
   4010 __declspec(naked)
   4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   4012                        uint8* dst_argb, int width) {
   4013   __asm {
   4014     push       esi
   4015     mov        eax, [esp + 4 + 4]   // src_argb0
   4016     mov        esi, [esp + 4 + 8]   // src_argb1
   4017     mov        edx, [esp + 4 + 12]  // dst_argb
   4018     mov        ecx, [esp + 4 + 16]  // width
   4019     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
   4020     psrlw      xmm7, 15
   4021     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
   4022     psrlw      xmm6, 8
   4023     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   4024     psllw      xmm5, 8
   4025     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   4026     pslld      xmm4, 24
   4027     sub        ecx, 4
   4028     jl         convertloop4b    // less than 4 pixels?
   4029 
   4030     // 4 pixel loop.
   4031   convertloop4:
   4032     movdqu     xmm3, [eax]      // src argb
   4033     lea        eax, [eax + 16]
   4034     movdqa     xmm0, xmm3       // src argb
   4035     pxor       xmm3, xmm4       // ~alpha
   4036     movdqu     xmm2, [esi]      // _r_b
   4037     psrlw      xmm3, 8          // alpha
   4038     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
   4039     pshuflw    xmm3, xmm3, 0F5h
   4040     pand       xmm2, xmm6       // _r_b
   4041     paddw      xmm3, xmm7       // 256 - alpha
   4042     pmullw     xmm2, xmm3       // _r_b * alpha
   4043     movdqu     xmm1, [esi]      // _a_g
   4044     lea        esi, [esi + 16]
   4045     psrlw      xmm1, 8          // _a_g
   4046     por        xmm0, xmm4       // set alpha to 255
   4047     pmullw     xmm1, xmm3       // _a_g * alpha
   4048     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4049     paddusb    xmm0, xmm2       // + src argb
   4050     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4051     paddusb    xmm0, xmm1       // + src argb
   4052     movdqu     [edx], xmm0
   4053     lea        edx, [edx + 16]
   4054     sub        ecx, 4
   4055     jge        convertloop4
   4056 
   4057   convertloop4b:
   4058     add        ecx, 4 - 1
   4059     jl         convertloop1b
   4060 
   4061     // 1 pixel loop.
   4062   convertloop1:
   4063     movd       xmm3, [eax]      // src argb
   4064     lea        eax, [eax + 4]
   4065     movdqa     xmm0, xmm3       // src argb
   4066     pxor       xmm3, xmm4       // ~alpha
   4067     movd       xmm2, [esi]      // _r_b
   4068     psrlw      xmm3, 8          // alpha
   4069     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
   4070     pshuflw    xmm3, xmm3, 0F5h
   4071     pand       xmm2, xmm6       // _r_b
   4072     paddw      xmm3, xmm7       // 256 - alpha
   4073     pmullw     xmm2, xmm3       // _r_b * alpha
   4074     movd       xmm1, [esi]      // _a_g
   4075     lea        esi, [esi + 4]
   4076     psrlw      xmm1, 8          // _a_g
   4077     por        xmm0, xmm4       // set alpha to 255
   4078     pmullw     xmm1, xmm3       // _a_g * alpha
   4079     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4080     paddusb    xmm0, xmm2       // + src argb
   4081     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4082     paddusb    xmm0, xmm1       // + src argb
   4083     movd       [edx], xmm0
   4084     lea        edx, [edx + 4]
   4085     sub        ecx, 1
   4086     jge        convertloop1
   4087 
   4088   convertloop1b:
   4089     pop        esi
   4090     ret
   4091   }
   4092 }
   4093 #endif  // HAS_ARGBBLENDROW_SSE2
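
// Per channel the blend above computes
//   dst = min(255, src + dst * (256 - src_alpha) / 256)
// with the result alpha forced to 255 (src is premultiplied). A scalar sketch
// of one channel, for illustration only (BlendChannelSketch is hypothetical):
static uint8 BlendChannelSketch(uint8 src, uint8 dst, uint8 src_alpha) {
  uint32 b = src + ((dst * (256u - src_alpha)) >> 8);  // scaled dst + src
  return (uint8)(b > 255u ? 255u : b);                 // paddusb saturates
}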
   4094 
   4095 #ifdef HAS_ARGBBLENDROW_SSSE3
   4096 // Shuffle table for isolating alpha.
   4097 static const uvec8 kShuffleAlpha = {
   4098   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
   4099   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
   4100 };
   4101 // Same as SSE2, but replaces:
   4102 //    psrlw      xmm3, 8          // alpha
   4103 //    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
   4104 //    pshuflw    xmm3, xmm3, 0F5h
   4105 // with:
   4106 //    pshufb     xmm3, kShuffleAlpha // alpha
   4107 // Blend 4 pixels at a time.
   4108 
   4109 __declspec(naked)
   4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
   4111                         uint8* dst_argb, int width) {
   4112   __asm {
   4113     push       esi
   4114     mov        eax, [esp + 4 + 4]   // src_argb0
   4115     mov        esi, [esp + 4 + 8]   // src_argb1
   4116     mov        edx, [esp + 4 + 12]  // dst_argb
   4117     mov        ecx, [esp + 4 + 16]  // width
   4118     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
   4119     psrlw      xmm7, 15
   4120     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
   4121     psrlw      xmm6, 8
   4122     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   4123     psllw      xmm5, 8
   4124     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   4125     pslld      xmm4, 24
   4126     sub        ecx, 4
   4127     jl         convertloop4b    // less than 4 pixels?
   4128 
   4129     // 4 pixel loop.
   4130   convertloop4:
   4131     movdqu     xmm3, [eax]      // src argb
   4132     lea        eax, [eax + 16]
   4133     movdqa     xmm0, xmm3       // src argb
   4134     pxor       xmm3, xmm4       // ~alpha
   4135     movdqu     xmm2, [esi]      // _r_b
   4136     pshufb     xmm3, kShuffleAlpha // alpha
   4137     pand       xmm2, xmm6       // _r_b
   4138     paddw      xmm3, xmm7       // 256 - alpha
   4139     pmullw     xmm2, xmm3       // _r_b * alpha
   4140     movdqu     xmm1, [esi]      // _a_g
   4141     lea        esi, [esi + 16]
   4142     psrlw      xmm1, 8          // _a_g
   4143     por        xmm0, xmm4       // set alpha to 255
   4144     pmullw     xmm1, xmm3       // _a_g * alpha
   4145     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4146     paddusb    xmm0, xmm2       // + src argb
   4147     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4148     paddusb    xmm0, xmm1       // + src argb
   4149     movdqu     [edx], xmm0
   4150     lea        edx, [edx + 16]
   4151     sub        ecx, 4
   4152     jge        convertloop4
   4153 
   4154   convertloop4b:
   4155     add        ecx, 4 - 1
   4156     jl         convertloop1b
   4157 
   4158     // 1 pixel loop.
   4159   convertloop1:
   4160     movd       xmm3, [eax]      // src argb
   4161     lea        eax, [eax + 4]
   4162     movdqa     xmm0, xmm3       // src argb
   4163     pxor       xmm3, xmm4       // ~alpha
   4164     movd       xmm2, [esi]      // _r_b
   4165     pshufb     xmm3, kShuffleAlpha // alpha
   4166     pand       xmm2, xmm6       // _r_b
   4167     paddw      xmm3, xmm7       // 256 - alpha
   4168     pmullw     xmm2, xmm3       // _r_b * alpha
   4169     movd       xmm1, [esi]      // _a_g
   4170     lea        esi, [esi + 4]
   4171     psrlw      xmm1, 8          // _a_g
   4172     por        xmm0, xmm4       // set alpha to 255
   4173     pmullw     xmm1, xmm3       // _a_g * alpha
   4174     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   4175     paddusb    xmm0, xmm2       // + src argb
   4176     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   4177     paddusb    xmm0, xmm1       // + src argb
   4178     movd       [edx], xmm0
   4179     lea        edx, [edx + 4]
   4180     sub        ecx, 1
   4181     jge        convertloop1
   4182 
   4183   convertloop1b:
   4184     pop        esi
   4185     ret
   4186   }
   4187 }
   4188 #endif  // HAS_ARGBBLENDROW_SSSE3
   4189 
   4190 #ifdef HAS_ARGBATTENUATEROW_SSE2
   4191 // Attenuate 4 pixels at a time.
   4192 __declspec(naked)
   4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   4194   __asm {
   4195     mov        eax, [esp + 4]   // src_argb0
   4196     mov        edx, [esp + 8]   // dst_argb
   4197     mov        ecx, [esp + 12]  // width
   4198     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   4199     pslld      xmm4, 24
   4200     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
   4201     psrld      xmm5, 8
   4202 
   4203  convertloop:
   4204     movdqu     xmm0, [eax]      // read 4 pixels
   4205     punpcklbw  xmm0, xmm0       // first 2 pixels
   4206     pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
   4207     pshuflw    xmm2, xmm2, 0FFh
   4208     pmulhuw    xmm0, xmm2       // rgb * a
   4209     movdqu     xmm1, [eax]      // read 4 pixels
   4210     punpckhbw  xmm1, xmm1       // next 2 pixels
   4211     pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
   4212     pshuflw    xmm2, xmm2, 0FFh
   4213     pmulhuw    xmm1, xmm2       // rgb * a
   4214     movdqu     xmm2, [eax]      // alphas
   4215     lea        eax, [eax + 16]
   4216     psrlw      xmm0, 8
   4217     pand       xmm2, xmm4
   4218     psrlw      xmm1, 8
   4219     packuswb   xmm0, xmm1
   4220     pand       xmm0, xmm5       // keep original alphas
   4221     por        xmm0, xmm2
   4222     movdqu     [edx], xmm0
   4223     lea        edx, [edx + 16]
   4224     sub        ecx, 4
   4225     jg         convertloop
   4226 
   4227     ret
   4228   }
   4229 }
   4230 #endif  // HAS_ARGBATTENUATEROW_SSE2
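
// Attenuation premultiplies each color channel by alpha. With the byte
// duplication from punpcklbw (c * 0x0101) and the pmulhuw/psrlw pair above,
// the net effect is approximately c * a / 255. A scalar sketch of the exact
// bit pattern, for illustration only (AttenuateChannelSketch is hypothetical):
static uint8 AttenuateChannelSketch(uint8 c, uint8 a) {
  uint32 cc = c * 0x0101u;                 // channel word from punpcklbw
  uint32 aa = a * 0x0101u;                 // alpha word from pshufhw/pshuflw
  return (uint8)(((cc * aa) >> 16) >> 8);  // pmulhuw, then psrlw 8
}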

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
__declspec(naked)
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
    pslld      xmm3, 24
    movdqa     xmm4, kShuffleAlpha0
    movdqa     xmm5, kShuffleAlpha1

 convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    pshufb     xmm0, xmm4       // isolate first 2 alphas
    movdqu     xmm1, [eax]      // read 4 pixels
    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
    pmulhuw    xmm0, xmm1       // rgb * a
    movdqu     xmm1, [eax]      // read 4 pixels
    pshufb     xmm1, xmm5       // isolate next 2 alphas
    movdqu     xmm2, [eax]      // read 4 pixels
    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
    pmulhuw    xmm1, xmm2       // rgb * a
    movdqu     xmm2, [eax]      // mask original alpha
    lea        eax, [eax + 16]
    pand       xmm2, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    por        xmm0, xmm2       // copy original alpha
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
__declspec(naked)
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    vbroadcastf128 ymm4, kShuffleAlpha_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld     ymm5, ymm5, 24

 convertloop:
    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
    vpand      ymm6, ymm6, ymm5  // isolate alpha
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vpor       ymm0, ymm0, ymm6  // copy original alpha
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
__declspec(naked)
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        ecx, [esp + 8 + 12]  // width

 convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    movzx      esi, byte ptr [eax + 3]  // first alpha
    movzx      edi, byte ptr [eax + 7]  // second alpha
    punpcklbw  xmm0, xmm0       // first 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm0, xmm2       // rgb * ia

    movdqu     xmm1, [eax]      // read 4 pixels
    movzx      esi, byte ptr [eax + 11]  // third alpha
    movzx      edi, byte ptr [eax + 15]  // fourth alpha
    punpckhbw  xmm1, xmm1       // next 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2       // rgb * ia
    lea        eax, [eax + 16]

    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
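
// Scalar sketch of unattenuation (illustrative only; kept out of the build
// with #if 0). fixed_invtbl8 stores fixed-point reciprocals so the row code
// can multiply by 1/a instead of dividing per pixel. Mathematically the
// operation is the inverse of attenuation, clamped to 255:
#if 0
static uint8 UnattenuateSketch(uint8 v, uint8 a) {
  unsigned r;
  if (a == 0) return 0;               // nothing to recover from alpha 0
  r = (v * 255u) / a;                 // undo v = v0 * a / 255
  return (uint8)(r > 255 ? 255 : r);  // clamp, as packuswb does
}
#endif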

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2

 convertloop:
    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
#else  // USE_GATHER
__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {

    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2

    push       esi
    push       edi

 convertloop:
    // replace VPGATHER
    movzx      esi, byte ptr [eax + 3]                 // alpha0
    movzx      edi, byte ptr [eax + 7]                 // alpha1
    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
    movzx      esi, byte ptr [eax + 11]                // alpha2
    movzx      edi, byte ptr [eax + 15]                // alpha3
    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
    movzx      esi, byte ptr [eax + 19]                // alpha4
    movzx      edi, byte ptr [eax + 23]                // alpha5
    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
    movzx      esi, byte ptr [eax + 27]                // alpha6
    movzx      edi, byte ptr [eax + 31]                // alpha7
    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // USE_GATHER
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked)
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // Add .5 for rounding.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0   // 8 G bytes
    movdqu     xmm2, [eax]  // A
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm2, 24
    psrld      xmm3, 24
    packuswb   xmm2, xmm3
    packuswb   xmm2, xmm2   // 8 A bytes
    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
    punpcklbw  xmm0, xmm0   // 8 GG words
    punpcklbw  xmm3, xmm2   // 8 GA words
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm3   // GGGA first 4
    punpckhwd  xmm1, xmm3   // GGGA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBGRAYROW_SSSE3
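
// Scalar sketch of the gray conversion (illustrative only; kept out of the
// build with #if 0). kARGBToYJ holds full-range JPEG luma weights scaled by
// 128 and kAddYJ64 adds 0.5 for rounding before the shift by 7. Assuming
// the usual 0.114 / 0.587 / 0.299 weights:
#if 0
static uint8 GraySketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)((b * 15 + g * 75 + r * 38 + 64) >> 7);
}
#endif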

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked)
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* dst_argb */
    mov        ecx, [esp + 8]   /* width */
    movdqa     xmm2, kARGBToSepiaB
    movdqa     xmm3, kARGBToSepiaG
    movdqa     xmm4, kARGBToSepiaR

 convertloop:
    movdqu     xmm0, [eax]  // B
    movdqu     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0   // 8 B values
    movdqu     xmm5, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5   // 8 G values
    punpcklbw  xmm0, xmm5   // 8 BG values
    movdqu     xmm5, [eax]  // R
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5   // 8 R values
    movdqu     xmm6, [eax]  // A
    movdqu     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6   // 8 A values
    punpcklbw  xmm5, xmm6   // 8 RA values
    movdqa     xmm1, xmm0   // Weave BG, RA together
    punpcklwd  xmm0, xmm5   // BGRA first 4
    punpckhwd  xmm1, xmm5   // BGRA next 4
    movdqu     [eax], xmm0
    movdqu     [eax + 16], xmm1
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
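
// Scalar sketch of one sepia pixel using the matrix commented above
// (illustrative only; kept out of the build with #if 0). pmaddubsw and
// phaddw compute the dot products; packuswb supplies the clamp to 255.
#if 0
static void SepiaPixelSketch(uint8* p) {  // p points at one BGRA pixel
  int b = (p[2] * 35 + p[1] * 68 + p[0] * 17) >> 7;
  int g = (p[2] * 45 + p[1] * 88 + p[0] * 22) >> 7;
  int r = (p[2] * 50 + p[1] * 98 + p[0] * 24) >> 7;
  p[0] = (uint8)(b > 255 ? 255 : b);
  p[1] = (uint8)(g > 255 ? 255 : g);
  p[2] = (uint8)(r > 255 ? 255 : r);  // alpha in p[3] is left unchanged
}
#endif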

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked)
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* matrix_argb */
    movdqu     xmm5, [ecx]
    pshufd     xmm2, xmm5, 0x00
    pshufd     xmm3, xmm5, 0x55
    pshufd     xmm4, xmm5, 0xaa
    pshufd     xmm5, xmm5, 0xff
    mov        ecx, [esp + 16]  /* width */

 convertloop:
    movdqu     xmm0, [eax]  // B
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm7, xmm2
    movdqu     xmm6, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm6, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm7   // B
    phaddsw    xmm6, xmm1   // G
    psraw      xmm0, 6      // B
    psraw      xmm6, 6      // G
    packuswb   xmm0, xmm0   // 8 B values
    packuswb   xmm6, xmm6   // 8 G values
    punpcklbw  xmm0, xmm6   // 8 BG values
    movdqu     xmm1, [eax]  // R
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm7, xmm4
    phaddsw    xmm1, xmm7   // R
    movdqu     xmm6, [eax]  // A
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm6, xmm5
    pmaddubsw  xmm7, xmm5
    phaddsw    xmm6, xmm7   // A
    psraw      xmm1, 6      // R
    psraw      xmm6, 6      // A
    packuswb   xmm1, xmm1   // 8 R values
    packuswb   xmm6, xmm6   // 8 A values
    punpcklbw  xmm1, xmm6   // 8 RA values
    movdqa     xmm6, xmm0   // Weave BG, RA together
    punpcklwd  xmm0, xmm1   // BGRA first 4
    punpckhwd  xmm6, xmm1   // BGRA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm6
    lea        eax, [eax + 32]
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
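
// Scalar sketch of the color matrix transform (illustrative only; kept out
// of the build with #if 0). matrix_argb is 16 signed bytes, one row of four
// coefficients per output channel, applied to (B, G, R, A) and scaled down
// by 6 bits before clamping:
#if 0
static void ColorMatrixPixelSketch(const uint8* src, uint8* dst,
                                   const int8* m) {  // one BGRA pixel
  int i;
  for (i = 0; i < 4; ++i) {
    int v = (src[0] * m[i * 4 + 0] + src[1] * m[i * 4 + 1] +
             src[2] * m[i * 4 + 2] + src[3] * m[i * 4 + 3]) >> 6;
    dst[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));  // packuswb clamp
  }
}
#endif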

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
__declspec(naked)
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov        eax, [esp + 4]    /* dst_argb */
    movd       xmm2, [esp + 8]   /* scale */
    movd       xmm3, [esp + 12]  /* interval_size */
    movd       xmm4, [esp + 16]  /* interval_offset */
    mov        ecx, [esp + 20]   /* width */
    pshuflw    xmm2, xmm2, 040h
    pshufd     xmm2, xmm2, 044h
    pshuflw    xmm3, xmm3, 040h
    pshufd     xmm3, xmm3, 044h
    pshuflw    xmm4, xmm4, 040h
    pshufd     xmm4, xmm4, 044h
    pxor       xmm5, xmm5  // constant 0
    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
    pslld      xmm6, 24

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    punpcklbw  xmm0, xmm5   // first 2 pixels
    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
    movdqu     xmm1, [eax]  // read 4 pixels
    punpckhbw  xmm1, xmm5   // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3   // * interval_size
    movdqu     xmm7, [eax]  // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6   // mask alpha
    paddw      xmm0, xmm4   // + interval_offset
    paddw      xmm1, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm7
    movdqu     [eax], xmm0
    lea        eax, [eax + 16]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
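
// Scalar sketch of the quantization (illustrative only; kept out of the
// build with #if 0). Each color channel is snapped to a bucket; alpha is
// preserved by the 0xff000000 mask in the loop above:
#if 0
static uint8 QuantizeSketch(uint8 v, int scale, int interval_size,
                            int interval_offset) {
  int q = ((v * scale) >> 16) * interval_size + interval_offset;
  return (uint8)(q > 255 ? 255 : q);  // packuswb clamps the final word
}
#endif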

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
__declspec(naked)
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    movd       xmm2, [esp + 16]  // value
    punpcklbw  xmm2, xmm2
    punpcklqdq xmm2, xmm2

 convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0       // first 2
    punpckhbw  xmm1, xmm1       // next 2
    pmulhuw    xmm0, xmm2       // argb * value
    pmulhuw    xmm1, xmm2       // argb * value
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked)
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm0, xmm0         // first 2
    punpckhbw  xmm1, xmm1         // next 2
    punpcklbw  xmm2, xmm5         // first 2
    punpckhbw  xmm3, xmm5         // next 2
    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
    lea        eax, [eax + 16]
    lea        esi, [esi + 16]
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
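
// Scalar sketch of the multiply (illustrative only; kept out of the build
// with #if 0). One operand is expanded to v * 257 (punpcklbw with itself),
// the other is zero-extended, so pmulhuw yields (v0 * 257 * v1) >> 16, a
// close approximation (within one) of the exact per-channel product:
#if 0
static uint8 MultiplySketch(uint8 v0, uint8 v1) {
  return (uint8)((v0 * v1) / 255);
}
#endif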

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked)
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    sub        ecx, 4
    jl         convertloop49

 convertloop4:
    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        convertloop4

 convertloop49:
    add        ecx, 4 - 1
    jl         convertloop19

 convertloop1:
    movd       xmm0, [eax]        // read 1 pixel from src_argb0
    lea        eax, [eax + 4]
    movd       xmm1, [esi]        // read 1 pixel from src_argb1
    lea        esi, [esi + 4]
    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        convertloop1

 convertloop19:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract one row of ARGB pixels from another, 4 pixels at a time.
__declspec(naked)
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
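
// Scalar sketch of the saturating add and subtract (illustrative only; kept
// out of the build with #if 0). paddusb and psubusb clamp each byte at 255
// and 0 respectively:
#if 0
static uint8 AddSketch(uint8 a, uint8 b) {
  int s = a + b;
  return (uint8)(s > 255 ? 255 : s);
}
static uint8 SubtractSketch(uint8 a, uint8 b) {
  return (uint8)(a > b ? a - b : 0);
}
#endif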

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked)
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    vpxor      ymm5, ymm5, ymm5     // constant 0

 convertloop:
    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1   // low 4
    vpunpckhbw ymm1, ymm1, ymm1   // high 4
    vpunpcklbw ymm2, ymm3, ymm5   // low 4
    vpunpckhbw ymm3, ymm3, ymm5   // high 4
    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
    vpackuswb  ymm0, ymm0, ymm1
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked)
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract one row of ARGB pixels from another, 8 pixels at a time.
__declspec(naked)
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
__declspec(naked)
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y0
    mov        esi, [esp + 8 + 8]   // src_y1
    mov        edi, [esp + 8 + 12]  // src_y2
    mov        edx, [esp + 8 + 16]  // dst_sobelx
    mov        ecx, [esp + 8 + 20]  // width
    sub        esi, eax
    sub        edi, eax
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXROW_SSE2

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
__declspec(naked)
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_y0
    mov        esi, [esp + 4 + 8]   // src_y1
    mov        edx, [esp + 4 + 12]  // dst_sobely
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELYROW_SSE2
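
// Scalar sketch of one SobelX output byte (illustrative only; kept out of
// the build with #if 0), matching the 3x3 kernel commented above; SobelY is
// the transposed kernel applied the same way. The row code builds the same
// sums with psubw/paddw and takes the absolute value with psubw + pmaxsw.
#if 0
static uint8 SobelXSketch(const uint8* y0, const uint8* y1, const uint8* y2) {
  int s = (y0[0] - y0[2]) + 2 * (y1[0] - y1[2]) + (y2[0] - y2[2]);
  if (s < 0) s = -s;                  // absolute value
  return (uint8)(s > 255 ? 255 : s);  // packuswb clamp
}
#endif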

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked)
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    pcmpeqb    xmm5, xmm5           // alpha 255
    pslld      xmm5, 24             // 0xff000000

 convertloop:
    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
    movdqa     xmm2, xmm0             // GG
    punpcklbw  xmm2, xmm0             // First 8
    punpckhbw  xmm0, xmm0             // Next 8
    movdqa     xmm1, xmm2             // GGGG
    punpcklwd  xmm1, xmm2             // First 4
    punpckhwd  xmm2, xmm2             // Next 4
    por        xmm1, xmm5             // GGGA
    por        xmm2, xmm5
    movdqa     xmm3, xmm0             // GGGG
    punpcklwd  xmm3, xmm0             // Next 4
    punpckhwd  xmm0, xmm0             // Last 4
    por        xmm3, xmm5             // GGGA
    por        xmm0, xmm5
    movdqu     [edx], xmm1
    movdqu     [edx + 16], xmm2
    movdqu     [edx + 32], xmm3
    movdqu     [edx + 48], xmm0
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELROW_SSE2

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked)
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_y
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax

 convertloop:
    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked)
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    pcmpeqb    xmm5, xmm5           // alpha 255

 convertloop:
    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
    movdqa     xmm3, xmm0             // XA
    punpcklbw  xmm3, xmm5
    punpckhbw  xmm0, xmm5
    movdqa     xmm4, xmm1             // YS
    punpcklbw  xmm4, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     xmm6, xmm4             // YSXA
    punpcklwd  xmm6, xmm3             // First 4
    punpckhwd  xmm4, xmm3             // Next 4
    movdqa     xmm7, xmm1             // YSXA
    punpcklwd  xmm7, xmm0             // Next 4
    punpckhwd  xmm1, xmm0             // Last 4
    movdqu     [edx], xmm6
    movdqu     [edx + 16], xmm4
    movdqu     [edx + 32], xmm7
    movdqu     [edx + 48], xmm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXYROW_SSE2
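
// Scalar sketch of how the three Sobel flavors pack their output
// (illustrative only; kept out of the build with #if 0). With
// s = min(255, sx + sy): SobelRow stores {s, s, s, 255}, SobelToPlaneRow
// stores the single byte s, and SobelXYRow packs the channels per the
// legend above:
#if 0
static void SobelXYPixelSketch(uint8 sx, uint8 sy, uint8* dst_argb) {
  int s = sx + sy;
  dst_argb[0] = sy;                          // B = Sobel Y
  dst_argb[1] = (uint8)(s > 255 ? 255 : s);  // G = Sobel
  dst_argb[2] = sx;                          // R = Sobel X
  dst_argb[3] = 255;                         // A = 255
}
#endif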

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at a time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
//   in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  __asm {
    mov        eax, topleft  // eax topleft
    mov        esi, botleft  // esi botleft
    mov        edx, width
    movd       xmm5, area
    mov        edi, dst
    mov        ecx, count
    cvtdq2ps   xmm5, xmm5
    rcpss      xmm4, xmm5  // 1.0f / area
    pshufd     xmm4, xmm4, 0
    sub        ecx, 4
    jl         l4b

    cmp        area, 128  // 128 pixels will not overflow 15 bits.
    ja         l4

    pshufd     xmm5, xmm5, 0        // area
    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
    psrld      xmm6, 16
    cvtdq2ps   xmm6, xmm6
    addps      xmm5, xmm6           // (65536.0 + area - 1)
    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
    packssdw   xmm5, xmm5           // 16 bit shorts

    // 4 pixel loop small blocks.
  s4:
    // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
    packssdw   xmm2, xmm3

    pmulhuw    xmm0, xmm5
    pmulhuw    xmm2, xmm5

    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        s4

    jmp        l4b

    // 4 pixel loop
  l4:
    // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
    cvtdq2ps   xmm1, xmm1
    mulps      xmm0, xmm4
    mulps      xmm1, xmm4
    cvtdq2ps   xmm2, xmm2
    cvtdq2ps   xmm3, xmm3
    mulps      xmm2, xmm4
    mulps      xmm3, xmm4
    cvtps2dq   xmm0, xmm0
    cvtps2dq   xmm1, xmm1
    cvtps2dq   xmm2, xmm2
    cvtps2dq   xmm3, xmm3
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
  l1:
    movdqu     xmm0, [eax]
    psubd      xmm0, [eax + edx * 4]
    lea        eax, [eax + 16]
    psubd      xmm0, [esi]
    paddd      xmm0, [esi + edx * 4]
    lea        esi, [esi + 16]
    cvtdq2ps   xmm0, xmm0
    mulps      xmm0, xmm4
    cvtps2dq   xmm0, xmm0
    packssdw   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 1
    jge        l1
  l1b:
  }
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
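
// Scalar sketch of the averaging (illustrative only; kept out of the build
// with #if 0). Each output channel is a summed-area-table rectangle sum
// scaled by 1 / area, with the same sign convention as the loop above:
#if 0
static void AveragePixelSketch(const int32* topleft, const int32* botleft,
                               int width, int area, uint8* dst) {
  int i;
  for (i = 0; i < 4; ++i) {  // 4 ints = 1 ARGB pixel
    int32 sum = topleft[i] - topleft[i + width] -
                botleft[i] + botleft[i + width];
    dst[i] = (uint8)(sum / area);
  }
}
#endif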

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov        eax, row
    mov        edx, cumsum
    mov        esi, previous_cumsum
    mov        ecx, width
    pxor       xmm0, xmm0
    pxor       xmm1, xmm1

    sub        ecx, 4
    jl         l4b
    test       edx, 15
    jne        l4b

    // 4 pixel loop
  l4:
    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea        eax, [eax + 16]
    movdqa     xmm4, xmm2

    punpcklbw  xmm2, xmm1
    movdqa     xmm3, xmm2
    punpcklwd  xmm2, xmm1
    punpckhwd  xmm3, xmm1

    punpckhbw  xmm4, xmm1
    movdqa     xmm5, xmm4
    punpcklwd  xmm4, xmm1
    punpckhwd  xmm5, xmm1

    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
    movdqu     xmm3, [esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
    movdqu     xmm4, [esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
    movdqu     xmm5, [esi + 48]
    lea        esi, [esi + 64]
    paddd      xmm5, xmm0

    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    movdqu     [edx + 32], xmm4
    movdqu     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
  l1:
    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea        eax, [eax + 4]
    punpcklbw  xmm2, xmm1
    punpcklwd  xmm2, xmm1
    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]
    lea        esi, [esi + 16]
    paddd      xmm2, xmm0
    movdqu     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 1
    jge        l1

 l1b:
  }
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
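
// Scalar sketch of one cumulative sum row (illustrative only; kept out of
// the build with #if 0). A running per-channel sum across the row is added
// to the cumulative sums of the row above:
#if 0
static void CumulativeSumRowSketch(const uint8* row, int32* cumsum,
                                   const int32* previous_cumsum, int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, i;
  for (x = 0; x < width; ++x) {
    for (i = 0; i < 4; ++i) {
      sum[i] += row[x * 4 + i];
      cumsum[x * 4 + i] = previous_cumsum[x * 4 + i] + sum[i];
    }
  }
}
#endif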

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked)
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 12]  // src_argb
    mov        esi, [esp + 16]  // stride
    mov        edx, [esp + 20]  // dst_argb
    mov        ecx, [esp + 24]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]  // uv
    movq       xmm7, qword ptr [ecx + 8]  // dudv
    mov        ecx, [esp + 28]  // width
    shl        esi, 16          // 4, stride
    add        esi, 4
    movd       xmm5, esi
    sub        ecx, 4
    jl         l4b

    // setup for 4 pixel loop
    pshufd     xmm7, xmm7, 0x44  // dup dudv
    pshufd     xmm5, xmm5, 0  // dup 4, stride
    movdqa     xmm0, xmm2    // x0, y0, x1, y1
    addps      xmm0, xmm7
    movlhps    xmm2, xmm0
    movdqa     xmm4, xmm7
    addps      xmm4, xmm4    // dudv *= 2
    movdqa     xmm3, xmm2    // x2, y2, x3, y3
    addps      xmm3, xmm4
    addps      xmm4, xmm4    // dudv *= 4

    // 4 pixel loop
  l4:
    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
    packssdw   xmm0, xmm1    // x, y as 8 shorts
    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       xmm1, [eax + esi]  // read pixel 0
    movd       xmm6, [eax + edi]  // read pixel 1
    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
    addps      xmm2, xmm4    // x, y += dx, dy first 2
    movq       qword ptr [edx], xmm1
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    movd       xmm6, [eax + esi]  // read pixel 2
    movd       xmm0, [eax + edi]  // read pixel 3
    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
    addps      xmm3, xmm4    // x, y += dx, dy next 2
    movq       qword ptr [edx + 8], xmm6
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
  l1:
    cvttps2dq  xmm0, xmm2    // x, y float to int
    packssdw   xmm0, xmm0    // x, y as shorts
    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
    addps      xmm2, xmm7    // x, y += dx, dy
    movd       esi, xmm0
    movd       xmm0, [eax + esi]  // copy a pixel
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        l1
  l1b:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBAFFINEROW_SSE2
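
// Scalar sketch of the affine row copy (illustrative only; kept out of the
// build with #if 0). uv_dudv holds the starting (u, v) source position and
// the per-pixel step (du, dv); each destination pixel is fetched from the
// source at the truncated coordinates, as cvttps2dq does above:
#if 0
static void AffineRowSketch(const uint8* src_argb, int src_argb_stride,
                            uint8* dst_argb, const float* uv_dudv,
                            int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int offset = (int)u * 4 + (int)v * src_argb_stride;
    memcpy(dst_argb + i * 4, src_argb + offset, 4);  // copy one ARGB pixel
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
#endif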

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
__declspec(naked)
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    shr        eax, 1
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 128.  Blend 100 / 0.
    sub        edi, esi
    cmp        eax, 32
    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
    cmp        eax, 64
    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
    cmp        eax, 96
    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.

    vmovd      xmm0, eax  // high fraction 0..127
    neg        eax
    add        eax, 128
    vmovd      xmm5, eax  // low fraction 128..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vpxor      ymm0, ymm0, ymm0
    vpermd     ymm5, ymm0, ymm5

  xloop:
    vmovdqu    ymm0, [esi]
    vmovdqu    ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2  // mutates
    vpunpcklbw ymm0, ymm0, ymm2  // mutates
    vpmaddubsw ymm0, ymm0, ymm5
    vpmaddubsw ymm1, ymm1, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm1, ymm1, 7
    vpackuswb  ymm0, ymm0, ymm1  // unmutates
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         xloop
    jmp        xloop99

   // Blend 25 / 75.
 xloop25:
   vmovdqu    ymm0, [esi]
   vmovdqu    ymm1, [esi + edx]
   vpavgb     ymm0, ymm0, ymm1
   vpavgb     ymm0, ymm0, ymm1
   vmovdqu    [esi + edi], ymm0
   lea        esi, [esi + 32]
   sub        ecx, 32
   jg         xloop25
   jmp        xloop99

   // Blend 50 / 50.
 xloop50:
   vmovdqu    ymm0, [esi]
   vpavgb     ymm0, ymm0, [esi + edx]
   vmovdqu    [esi + edi], ymm0
   lea        esi, [esi + 32]
   sub        ecx, 32
   jg         xloop50
   jmp        xloop99

   // Blend 75 / 25.
 xloop75:
   vmovdqu    ymm1, [esi]
   vmovdqu    ymm0, [esi + edx]
   vpavgb     ymm0, ymm0, ymm1
   vpavgb     ymm0, ymm0, ymm1
   vmovdqu    [esi + edi], ymm0
   lea        esi, [esi + 32]
   sub        ecx, 32
   jg         xloop75
   jmp        xloop99

   // Blend 100 / 0 - Copy row unchanged.
 xloop100:
   rep movsb

  xloop99:
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_AVX2
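
// Scalar sketch of the bilinear row blend (illustrative only; kept out of
// the build with #if 0). source_y_fraction selects how much of the second
// row to use; the SSSE3/AVX2 paths shift it to 7 bits for pmaddubsw, and
// fractions of 0, 1/4, 1/2 and 3/4 are special-cased with copies and pavgb
// averages:
#if 0
static void InterpolateRowSketch(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride, int dst_width,
                                 int source_y_fraction) {  // 0..255
  int f = source_y_fraction;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * (256 - f) +
                          src_ptr[x + src_stride] * f) >> 8);
  }
}
#endif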
   5603 
   5604 // Bilinear filter 16x2 -> 16x1
   5605 __declspec(naked)
   5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   5607                           ptrdiff_t src_stride, int dst_width,
   5608                           int source_y_fraction) {
   5609   __asm {
   5610     push       esi
   5611     push       edi
   5612     mov        edi, [esp + 8 + 4]   // dst_ptr
   5613     mov        esi, [esp + 8 + 8]   // src_ptr
   5614     mov        edx, [esp + 8 + 12]  // src_stride
   5615     mov        ecx, [esp + 8 + 16]  // dst_width
   5616     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   5617     sub        edi, esi
   5618     shr        eax, 1
   5619     // Dispatch to specialized filters if applicable.
   5620     cmp        eax, 0
   5621     je         xloop100  // 0 / 128.  Blend 100 / 0.
   5622     cmp        eax, 32
   5623     je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
   5624     cmp        eax, 64
   5625     je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
   5626     cmp        eax, 96
   5627     je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
   5628 
   5629     movd       xmm0, eax  // high fraction 0..127
   5630     neg        eax
   5631     add        eax, 128
   5632     movd       xmm5, eax  // low fraction 128..1
   5633     punpcklbw  xmm5, xmm0
   5634     punpcklwd  xmm5, xmm5
   5635     pshufd     xmm5, xmm5, 0
   5636 
   5637   xloop:
   5638     movdqu     xmm0, [esi]
   5639     movdqu     xmm2, [esi + edx]
   5640     movdqu     xmm1, xmm0
   5641     punpcklbw  xmm0, xmm2
   5642     punpckhbw  xmm1, xmm2
   5643     pmaddubsw  xmm0, xmm5
   5644     pmaddubsw  xmm1, xmm5
   5645     psrlw      xmm0, 7
   5646     psrlw      xmm1, 7
   5647     packuswb   xmm0, xmm1
   5648     movdqu     [esi + edi], xmm0
   5649     lea        esi, [esi + 16]
   5650     sub        ecx, 16
   5651     jg         xloop
   5652     jmp        xloop99
   5653 
   5654     // Blend 25 / 75.
   5655   xloop25:
   5656     movdqu     xmm0, [esi]
   5657     movdqu     xmm1, [esi + edx]
   5658     pavgb      xmm0, xmm1
   5659     pavgb      xmm0, xmm1
   5660     movdqu     [esi + edi], xmm0
   5661     lea        esi, [esi + 16]
   5662     sub        ecx, 16
   5663     jg         xloop25
   5664     jmp        xloop99
   5665 
   5666     // Blend 50 / 50.
   5667   xloop50:
   5668     movdqu     xmm0, [esi]
   5669     movdqu     xmm1, [esi + edx]
   5670     pavgb      xmm0, xmm1
   5671     movdqu     [esi + edi], xmm0
   5672     lea        esi, [esi + 16]
   5673     sub        ecx, 16
   5674     jg         xloop50
   5675     jmp        xloop99
   5676 
   5677     // Blend 75 / 25.
   5678   xloop75:
   5679     movdqu     xmm1, [esi]
   5680     movdqu     xmm0, [esi + edx]
   5681     pavgb      xmm0, xmm1
   5682     pavgb      xmm0, xmm1
   5683     movdqu     [esi + edi], xmm0
   5684     lea        esi, [esi + 16]
   5685     sub        ecx, 16
   5686     jg         xloop75
   5687     jmp        xloop99
   5688 
   5689     // Blend 100 / 0 - Copy row unchanged.
   5690   xloop100:
   5691     movdqu     xmm0, [esi]
   5692     movdqu     [esi + edi], xmm0
   5693     lea        esi, [esi + 16]
   5694     sub        ecx, 16
   5695     jg         xloop100
   5696 
   5697   xloop99:
   5698     pop        edi
   5699     pop        esi
   5700     ret
   5701   }
   5702 }
   5703 
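// Illustrative scalar sketch (not library code, kept out of the build with
// #if 0) of the blend the SSSE3 kernel above computes per byte: pmaddubsw
// pairs the interleaved row0/row1 bytes with the (128 - f, f) weights and
// psrlw divides by 128. The function name is hypothetical.
#if 0
static void ScalarInterpolateRow(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride, int dst_width,
                                 int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..127, matching the shr eax, 1 above.
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * (128 - f) +
                          src_ptr[x + src_stride] * f) >> 7);
  }
}
#endif
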
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked)
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 64
    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
    cmp        eax, 192
    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.

    movd       xmm5, eax            // xmm5 = y fraction
    punpcklbw  xmm5, xmm5
    psrlw      xmm5, 1
    punpcklwd  xmm5, xmm5
    punpckldq  xmm5, xmm5
    punpcklqdq xmm5, xmm5
    pxor       xmm4, xmm4

  xloop:
    movdqu     xmm0, [esi]  // row0
    movdqu     xmm2, [esi + edx]  // row1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0  // row1 - row0
    psubw      xmm3, xmm1
    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5  // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2  // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu     xmm0, [esi]
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_SSE2

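// Worked example (illustrative, not library code) of the SSE2 fixed-point
// path above: the 15-bit fraction is f15 = (source_y_fraction * 257) >> 1,
// and each byte becomes row0 + (((row1 - row0) * 2 * f15) >> 16).
// E.g. fraction 192, row0 = 100, row1 = 200:
//   f15    = (192 * 257) >> 1    = 24672
//   diff   = (200 - 100) * 2     = 200
//   pmulhw = (200 * 24672) >> 16 = 75
//   result = 100 + 75 = 175, i.e. row0 + (row1 - row0) * 192 / 256.
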
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // shuffler
    movdqu     xmm5, [ecx]
    mov        ecx, [esp + 16]   // pix

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop
    ret
  }
}

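// For illustration (not library code): the shuffler argument is a 16-byte
// pshufb control mask giving, for each destination byte, the source byte
// index within each 16-byte group. A BGRA-to-ARGB shuffle, for example,
// reverses every 4-byte pixel; a mask along these lines (the name here is
// illustrative):
#if 0
static const uvec8 kShuffleBGRAToARGB =
    { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u };
// Usage sketch:
//   ARGBShuffleRow_SSSE3(src, dst, (const uint8*)&kShuffleBGRAToARGB, pix);
#endif
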
#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]     // src_argb
    mov        edx, [esp + 8]     // dst_argb
    mov        ecx, [esp + 12]    // shuffler
    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
    mov        ecx, [esp + 16]    // pix

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

__declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    push       ebx
    push       esi
    mov        eax, [esp + 8 + 4]    // src_argb
    mov        edx, [esp + 8 + 8]    // dst_argb
    mov        esi, [esp + 8 + 12]   // shuffler
    mov        ecx, [esp + 8 + 16]   // pix
    pxor       xmm5, xmm5

    mov        ebx, [esi]   // shuffler
    cmp        ebx, 0x03000102
    je         shuf_3012
    cmp        ebx, 0x00010203
    je         shuf_0123
    cmp        ebx, 0x00030201
    je         shuf_0321
    cmp        ebx, 0x02010003
    je         shuf_2103

  // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx      ebx, byte ptr [esi]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx], bl
    movzx      ebx, byte ptr [esi + 1]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 1], bl
    movzx      ebx, byte ptr [esi + 2]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 2], bl
    movzx      ebx, byte ptr [esi + 3]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 3], bl
    lea        eax, [eax + 4]
    lea        edx, [edx + 4]
    sub        ecx, 1
    jg         shuf_any1
    jmp        shuf99

  shuf_0123:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw    xmm0, xmm0, 01Bh
    pshufhw    xmm1, xmm1, 01Bh
    pshuflw    xmm1, xmm1, 01Bh
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0123
    jmp        shuf99

  shuf_0321:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw    xmm0, xmm0, 039h
    pshufhw    xmm1, xmm1, 039h
    pshuflw    xmm1, xmm1, 039h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0321
    jmp        shuf99

  shuf_2103:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw    xmm0, xmm0, 093h
    pshufhw    xmm1, xmm1, 093h
    pshuflw    xmm1, xmm1, 093h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_2103
    jmp        shuf99

  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}

// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1

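// Illustrative scalar sketch (not library code) of the YUY2 packing done by
// the kernel below, assuming an even width: each pair of Y samples shares
// one U and one V sample.
#if 0
static void ScalarI422ToYUY2Row(const uint8* src_y, const uint8* src_u,
                                const uint8* src_v, uint8* dst_frame,
                                int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = src_u[0];  // U0
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = src_v[0];  // V0
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_frame += 4;
  }
}
#endif
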
__declspec(naked)
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi] // U
    movq       xmm3, qword ptr [esi + edx] // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3 // UV
    movdqu     xmm0, [eax] // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2 // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked)
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi] // U
    movq       xmm3, qword ptr [esi + edx] // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3 // UV
    movdqu     xmm0, [eax] // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0 // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked)
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  convertloop:
//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 2
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked)
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0        // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b1g1r1a1_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b1g1r1a1_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    sub         ecx, 2
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

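// Illustrative scalar sketch (not library code) of what both polynomial
// kernels above compute: each channel value X is remapped to
//   C0 + C1 * X + C2 * X^2 + C3 * X^3,
// where poly holds four floats per coefficient in B, G, R, A order. The
// scalar clamp shown approximates the vector truncate/pack saturation.
#if 0
static void ScalarARGBPolynomialRow(const uint8* src_argb, uint8* dst_argb,
                                    const float* poly, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    float x = (float)src_argb[i];
    int c = i & 3;  // channel: 0 = B, 1 = G, 2 = R, 3 = A.
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    dst_argb[i] = (uint8)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
  }
}
#endif
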
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked)
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

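// Illustrative scalar sketch (not library code) of the lookup above: the
// table interleaves four 256-entry channels, so each channel indexes its own
// stripe. The RGB variant below is identical except alpha is left untouched.
#if 0
static void ScalarARGBColorTableRow(uint8* dst_argb, const uint8* table_argb,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
#endif
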
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked)
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked)
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   /* src_argb */
    mov        edi, [esp + 8 + 8]   /* dst_argb */
    mov        ecx, [esp + 8 + 12]  /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
  convertloop:
    movdqu     xmm0, xmmword ptr [eax]    // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

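// Illustrative scalar sketch (not library code) of the luma lookup above,
// assuming lumacoeff packs the B, G, R weights in its low three bytes
// (little-endian, as the pmaddubsw lanes consume them) with weights that
// typically sum to at most 128, keeping the index below 0x8000.
#if 0
static void ScalarARGBLumaColorTableRow(const uint8* src_argb, uint8* dst_argb,
                                        int width, const uint8* luma,
                                        uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  int i;
  for (i = 0; i < width; ++i) {
    // Weighted luma, truncated to a multiple of 256 (the pand 0xff00 above),
    // selects one 256-byte sub-table of the luma table.
    const uint8* tab = luma + ((src_argb[0] * bc + src_argb[1] * gc +
                                src_argb[2] * rc) & 0xff00);
    dst_argb[0] = tab[src_argb[0]];
    dst_argb[1] = tab[src_argb[1]];
    dst_argb[2] = tab[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha is copied unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
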
#endif  // defined(_M_IX86)
#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif