/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

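// How the tables above are consumed (a reference sketch, not used by the
// build; PMaddUbswPair_C is a hypothetical helper): pmaddubsw multiplies
// 16 unsigned source bytes by 16 signed weights and adds adjacent products
// into eight 16-bit sums, so one ARGB pixel yields B*wb + G*wg in one lane
// and R*wr + A*0 in the next; phaddw then folds those two lanes into a
// single weighted sum per pixel. pmaddubsw's int16 saturation is omitted
// here, since these weights cannot reach it.
static void PMaddUbswPair_C(const uint8* src, const int8* weights,
                            int16* dst) {
  for (int i = 0; i < 8; ++i) {
    dst[i] = static_cast<int16>(src[2 * i + 0] * weights[2 * i + 0] +
                                src[2 * i + 1] * weights[2 * i + 1]);
  }
}
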
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting RGBA to ARGB.
static const uvec8 kShuffleMaskRGBAToARGB = {
  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
};

// Shuffle table for converting ARGB to RGBA.
static const uvec8 kShuffleMaskARGBToRGBA = {
  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

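// The tables above feed pshufb, which rearranges one 16-byte register per
// instruction. A scalar sketch of its semantics (reference only; Shuffle16_C
// is a hypothetical helper): an index with the high bit set (the 128u
// entries) writes zero, which ARGBToRGB24/ARGBToRAW rely on to blank the
// four tail bytes before the lanes are OR-ed together.
static void Shuffle16_C(const uint8* src, const uint8* shuffler, uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (shuffler[i] & 0x80) ? 0 : src[shuffler[i] & 15];
  }
}
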
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]        // src_y
    mov        edx, [esp + 8]        // dst_argb
    mov        ecx, [esp + 12]       // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
    pslld      xmm5, 24

    align      16
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_bgra
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskBGRAToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_abgr
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskABGRToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgba
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskRGBAToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgba
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskARGBToRGBA
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgb24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRGB24ToARGB

    align      16
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRAWToARGB

    align      16
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G is 6 bits, so it uses (256 + 4); its field shift of 5 is folded into
// the multiplier, making the shifts 5 + 8 and 5 + 2.
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3    // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    pand      xmm0, xmm4    // G in middle 6 bits
    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
    por       xmm0, xmm7    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

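// Scalar sketch of the multiply trick above (reference only;
// RGB565ToARGBPixel_C is a hypothetical helper). With a 5-bit field shifted
// to the top of a 16-bit lane, pmulhuw by 0x0108 (256 + 8) computes
// (v << 3) | (v >> 2); for green, 0x2080 = (256 + 4) << 5 folds the field's
// in-place position at bit 5 into the same multiply, giving
// (v << 2) | (v >> 4).
static void RGB565ToARGBPixel_C(uint16 pix, uint8* dst_argb) {
  uint8 b = static_cast<uint8>(pix & 0x1f);
  uint8 g = static_cast<uint8>((pix >> 5) & 0x3f);
  uint8 r = static_cast<uint8>(pix >> 11);
  dst_argb[0] = static_cast<uint8>((b << 3) | (b >> 2));
  dst_argb[1] = static_cast<uint8>((g << 2) | (g >> 4));
  dst_argb[2] = static_cast<uint8>((r << 3) | (r >> 2));
  dst_argb[3] = 255u;
}
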
// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_argb1555
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1       // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // G in middle 5 bits
    psraw     xmm2, 8       // A
    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]   // src_argb4444
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // mask low nibbles
    pand      xmm2, xmm5    // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

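// Scalar sketch of the nibble widening above (reference only;
// Replicate4To8_C is a hypothetical helper): copying a nibble into both
// halves of a byte, (n << 4) | n, is simply n * 17.
static uint8 Replicate4To8_C(uint8 nibble) {
  return static_cast<uint8>(nibble * 17);  // == (nibble << 4) | nibble
}
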
__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRGB24

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
    movdqa    xmm1, [eax + 16]
    movdqa    xmm2, [eax + 32]
    movdqa    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqa    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqa    [edx + 16], xmm1   // store 1
    movdqa    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

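// The shifting and OR-ing above stitches four shuffled 12-byte results into
// 48 contiguous bytes. Per pixel it amounts to this scalar sketch (reference
// only; ARGBToRGB24Pixel_C is a hypothetical helper):
static void ARGBToRGB24Pixel_C(const uint8* src_argb, uint8* dst_rgb) {
  dst_rgb[0] = src_argb[0];  // B
  dst_rgb[1] = src_argb[1];  // G
  dst_rgb[2] = src_argb[2];  // R; alpha (src_argb[3]) is dropped.
}
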
__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRAW

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
    movdqa    xmm1, [eax + 16]
    movdqa    xmm2, [eax + 32]
    movdqa    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqa    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqa    [edx + 16], xmm1   // store 1
    movdqa    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4       // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4       // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    pslld     xmm7, 15

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    movdqa    xmm3, xmm0    // R
    psrad     xmm0, 16      // A
    psrld     xmm1, 3       // B
    psrld     xmm2, 6       // G
    psrld     xmm3, 9       // R
    pand      xmm0, xmm7    // A
    pand      xmm1, xmm4    // B
    pand      xmm2, xmm5    // G
    pand      xmm3, xmm6    // R
    por       xmm0, xmm1    // BA
    por       xmm2, xmm3    // GR
    por       xmm0, xmm2    // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    psrlw     xmm3, 8

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3    // low nibble
    pand      xmm1, xmm4    // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

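// Scalar sketch of the luma math above (reference only; ARGBToYPixel_C is a
// hypothetical helper). kARGBToY holds the BT.601 coefficients at half scale
// so the pmaddubsw pair sums stay within 16 bits; kAddY16 applies the +16
// offset after the >> 7.
static uint8 ARGBToYPixel_C(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
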
__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

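// Scalar sketch of the chroma math above (reference only; ARGBToUVPixel_C is
// a hypothetical helper). b, g and r are the 2x2 subsampled averages from
// step 1; the psraw 8 plus the kAddUV128 bias map the signed result into the
// usual unsigned U/V range.
static void ARGBToUVPixel_C(uint8 b, uint8 g, uint8 r, uint8* u, uint8* v) {
  *u = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}
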
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I422TOARGBROW_SSSE3

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#define UB 127 /* min(127, static_cast<int8>(2.018 * 64)) */
   1479 #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
   1480 #define UR 0
   1481 
   1482 #define VB 0
   1483 #define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
   1484 #define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
   1485 
   1486 // Bias
   1487 #define BB UB * 128 + VB * 128
   1488 #define BG UG * 128 + VG * 128
   1489 #define BR UR * 128 + VR * 128
   1490 
   1491 static const vec8 kUVToB = {
   1492   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
   1493 };
   1494 
   1495 static const vec8 kUVToR = {
   1496   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
   1497 };
   1498 
   1499 static const vec8 kUVToG = {
   1500   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
   1501 };
   1502 
   1503 static const vec8 kVUToB = {
   1504   VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
   1505 };
   1506 
   1507 static const vec8 kVUToR = {
   1508   VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
   1509 };
   1510 
   1511 static const vec8 kVUToG = {
   1512   VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
   1513 };
   1514 
   1515 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
   1516 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
   1517 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
   1518 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
   1519 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
   1520 
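// A minimal scalar sketch of the fixed-point math the YUVTORGB macros below
// implement (illustrative only; these helper names are not part of the
// library). The pmaddubsw/psubw pair in the assembly computes
// (u - 128) * UB + (v - 128) * VB by multiplying the unsigned u, v bytes
// first and then subtracting the kUVBias* constants (UB * 128 + VB * 128,
// etc.). The assembly saturates intermediate sums to 16 bits with paddsw;
// this 32 bit scalar version sidesteps that and clamps only at the end.
static __inline uint8 ClampToByte_Sketch(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void YuvToRgbPixel_Sketch(uint8 y, uint8 u, uint8 v,
                                          uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;  // luma scaled by 1.164 in 6 bit fixed point.
  *b = ClampToByte_Sketch((y1 + (u - 128) * UB + (v - 128) * VB) >> 6);
  *g = ClampToByte_Sketch((y1 + (u - 128) * UG + (v - 128) * VG) >> 6);
  *r = ClampToByte_Sketch((y1 + (u - 128) * UR + (v - 128) * VR) >> 6);
}
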
   1521 // TODO(fbarchard): NV12/NV21 fetch UV and use directly.
   1522 // TODO(fbarchard): Add a read that does half size on Y and treats 420 as 444.
   1523 
   1524 // Read 8 UV from 444.
   1525 #define READYUV444 __asm {                                                     \
   1526     __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
   1527     __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
   1528     __asm lea        esi,  [esi + 8]                                           \
   1529     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   1530   }
   1531 
   1532 // Read 4 UV from 422, upsample to 8 UV.
   1533 #define READYUV422 __asm {                                                     \
   1534     __asm movd       xmm0, [esi]          /* U */                              \
   1535     __asm movd       xmm1, [esi + edi]    /* V */                              \
   1536     __asm lea        esi,  [esi + 4]                                           \
   1537     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   1538     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   1539   }
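
// Scalar picture of the punpcklwd upsample above (hypothetical helper, for
// illustration only): each 422 chroma sample is shared by two pixels, so the
// 4 U and 4 V bytes become 8 interleaved UV pairs.
static __inline void UpsampleUV422_Sketch(const uint8* u, const uint8* v,
                                          uint8* uv /* 16 bytes out */) {
  int i;
  for (i = 0; i < 4; ++i) {
    uv[i * 4 + 0] = u[i];  // pixel 2 * i
    uv[i * 4 + 1] = v[i];
    uv[i * 4 + 2] = u[i];  // pixel 2 * i + 1
    uv[i * 4 + 3] = v[i];
  }
}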
   1540 
   1541 // Read 2 UV from 411, upsample to 8 UV.
   1542 #define READYUV411 __asm {                                                     \
   1543     __asm movd       xmm0, [esi]          /* U */                              \
   1544     __asm movd       xmm1, [esi + edi]    /* V */                              \
   1545     __asm lea        esi,  [esi + 2]                                           \
   1546     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
   1547     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   1548     __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
   1549   }
   1550 
   1551 // Read 4 UV from NV12, upsample to 8 UV.
   1552 #define READNV12 __asm {                                                       \
   1553     __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
   1554     __asm lea        esi,  [esi + 8]                                           \
   1555     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
   1556   }
   1557 
   1558 // Convert 8 pixels: 8 UV and 8 Y.
   1559 #define YUVTORGB __asm {                                                       \
   1560     /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
   1561     __asm movdqa     xmm1, xmm0                                                \
   1562     __asm movdqa     xmm2, xmm0                                                \
   1563     __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
   1564     __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
   1565     __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
   1566     __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
   1567     __asm psubw      xmm1, kUVBiasG                                            \
   1568     __asm psubw      xmm2, kUVBiasR                                            \
   1569     /* Step 2: Find Y contribution to 8 R,G,B values */                        \
   1570     __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
   1571     __asm lea        eax, [eax + 8]                                            \
   1572     __asm punpcklbw  xmm3, xmm4                                                \
   1573     __asm psubsw     xmm3, kYSub16                                             \
   1574     __asm pmullw     xmm3, kYToRgb                                             \
   1575     __asm paddsw     xmm0, xmm3           /* B += Y */                         \
   1576     __asm paddsw     xmm1, xmm3           /* G += Y */                         \
   1577     __asm paddsw     xmm2, xmm3           /* R += Y */                         \
   1578     __asm psraw      xmm0, 6                                                   \
   1579     __asm psraw      xmm1, 6                                                   \
   1580     __asm psraw      xmm2, 6                                                   \
   1581     __asm packuswb   xmm0, xmm0           /* B */                              \
   1582     __asm packuswb   xmm1, xmm1           /* G */                              \
   1583     __asm packuswb   xmm2, xmm2           /* R */                              \
   1584   }
   1585 
   1586 // Convert 8 pixels: 8 VU and 8 Y.
   1587 #define YVUTORGB __asm {                                                       \
   1588     /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
   1589     __asm movdqa     xmm1, xmm0                                                \
   1590     __asm movdqa     xmm2, xmm0                                                \
   1591     __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
   1592     __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
   1593     __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
   1594     __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
   1595     __asm psubw      xmm1, kUVBiasG                                            \
   1596     __asm psubw      xmm2, kUVBiasR                                            \
   1597     /* Step 2: Find Y contribution to 8 R,G,B values */                        \
   1598     __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
   1599     __asm lea        eax, [eax + 8]                                            \
   1600     __asm punpcklbw  xmm3, xmm4                                                \
   1601     __asm psubsw     xmm3, kYSub16                                             \
   1602     __asm pmullw     xmm3, kYToRgb                                             \
   1603     __asm paddsw     xmm0, xmm3           /* B += Y */                         \
   1604     __asm paddsw     xmm1, xmm3           /* G += Y */                         \
   1605     __asm paddsw     xmm2, xmm3           /* R += Y */                         \
   1606     __asm psraw      xmm0, 6                                                   \
   1607     __asm psraw      xmm1, 6                                                   \
   1608     __asm psraw      xmm2, 6                                                   \
   1609     __asm packuswb   xmm0, xmm0           /* B */                              \
   1610     __asm packuswb   xmm1, xmm1           /* G */                              \
   1611     __asm packuswb   xmm2, xmm2           /* R */                              \
   1612   }
   1613 
   1614 // 8 pixels, dest aligned 16.
   1615 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
   1616 __declspec(naked) __declspec(align(16))
   1617 void I444ToARGBRow_SSSE3(const uint8* y_buf,
   1618                          const uint8* u_buf,
   1619                          const uint8* v_buf,
   1620                          uint8* argb_buf,
   1621                          int width) {
   1622   __asm {
   1623     push       esi
   1624     push       edi
   1625     mov        eax, [esp + 8 + 4]   // Y
   1626     mov        esi, [esp + 8 + 8]   // U
   1627     mov        edi, [esp + 8 + 12]  // V
   1628     mov        edx, [esp + 8 + 16]  // argb
   1629     mov        ecx, [esp + 8 + 20]  // width
   1630     sub        edi, esi
   1631     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1632     pxor       xmm4, xmm4
   1633 
   1634     align      16
   1635  convertloop:
   1636     READYUV444
   1637     YUVTORGB
   1638 
   1639     // Step 3: Weave into ARGB
   1640     punpcklbw  xmm0, xmm1           // BG
   1641     punpcklbw  xmm2, xmm5           // RA
   1642     movdqa     xmm1, xmm0
   1643     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1644     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1645     movdqa     [edx], xmm0
   1646     movdqa     [edx + 16], xmm1
   1647     lea        edx,  [edx + 32]
   1648     sub        ecx, 8
   1649     jg         convertloop
   1650 
   1651     pop        edi
   1652     pop        esi
   1653     ret
   1654   }
   1655 }
   1656 
   1657 // 8 pixels, dest aligned 16.
   1658 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1659 __declspec(naked) __declspec(align(16))
   1660 void I422ToARGBRow_SSSE3(const uint8* y_buf,
   1661                          const uint8* u_buf,
   1662                          const uint8* v_buf,
   1663                          uint8* argb_buf,
   1664                          int width) {
   1665   __asm {
   1666     push       esi
   1667     push       edi
   1668     mov        eax, [esp + 8 + 4]   // Y
   1669     mov        esi, [esp + 8 + 8]   // U
   1670     mov        edi, [esp + 8 + 12]  // V
   1671     mov        edx, [esp + 8 + 16]  // argb
   1672     mov        ecx, [esp + 8 + 20]  // width
   1673     sub        edi, esi
   1674     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1675     pxor       xmm4, xmm4
   1676 
   1677     align      16
   1678  convertloop:
   1679     READYUV422
   1680     YUVTORGB
   1681 
   1682     // Step 3: Weave into ARGB
   1683     punpcklbw  xmm0, xmm1           // BG
   1684     punpcklbw  xmm2, xmm5           // RA
   1685     movdqa     xmm1, xmm0
   1686     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1687     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1688     movdqa     [edx], xmm0
   1689     movdqa     [edx + 16], xmm1
   1690     lea        edx,  [edx + 32]
   1691     sub        ecx, 8
   1692     jg         convertloop
   1693 
   1694     pop        edi
   1695     pop        esi
   1696     ret
   1697   }
   1698 }
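
// Hedged usage sketch (not part of the library API): driving the row function
// above over a whole I422 image. The strides, the width % 8 == 0 requirement
// and the 16 byte aligned destination are assumptions of this illustration;
// production callers go through the planar convert functions instead.
static void I422ToARGBFrame_Sketch(const uint8* src_y, int src_stride_y,
                                   const uint8* src_u, int src_stride_u,
                                   const uint8* src_v, int src_stride_v,
                                   uint8* dst_argb, int dst_stride_argb,
                                   int width, int height) {
  int y;
  for (y = 0; y < height; ++y) {
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, width);
    src_y += src_stride_y;
    src_u += src_stride_u;  // 422 chroma is half width but full height,
    src_v += src_stride_v;  // so the chroma rows advance every line.
    dst_argb += dst_stride_argb;
  }
}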
   1699 
   1700 // 8 pixels, dest aligned 16.
   1701 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1702 // Similar to I420 but duplicate UV once more.
   1703 __declspec(naked) __declspec(align(16))
   1704 void I411ToARGBRow_SSSE3(const uint8* y_buf,
   1705                          const uint8* u_buf,
   1706                          const uint8* v_buf,
   1707                          uint8* argb_buf,
   1708                          int width) {
   1709   __asm {
   1710     push       esi
   1711     push       edi
   1712     mov        eax, [esp + 8 + 4]   // Y
   1713     mov        esi, [esp + 8 + 8]   // U
   1714     mov        edi, [esp + 8 + 12]  // V
   1715     mov        edx, [esp + 8 + 16]  // argb
   1716     mov        ecx, [esp + 8 + 20]  // width
   1717     sub        edi, esi
   1718     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1719     pxor       xmm4, xmm4
   1720 
   1721     align      16
   1722  convertloop:
   1723     READYUV411
   1724     YUVTORGB
   1725 
   1726     // Step 3: Weave into ARGB
   1727     punpcklbw  xmm0, xmm1           // BG
   1728     punpcklbw  xmm2, xmm5           // RA
   1729     movdqa     xmm1, xmm0
   1730     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1731     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1732     movdqa     [edx], xmm0
   1733     movdqa     [edx + 16], xmm1
   1734     lea        edx,  [edx + 32]
   1735     sub        ecx, 8
   1736     jg         convertloop
   1737 
   1738     pop        edi
   1739     pop        esi
   1740     ret
   1741   }
   1742 }
   1743 
   1744 // 8 pixels, dest aligned 16.
   1745 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1746 __declspec(naked) __declspec(align(16))
   1747 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
   1748                          const uint8* uv_buf,
   1749                          uint8* argb_buf,
   1750                          int width) {
   1751   __asm {
   1752     push       esi
   1753     mov        eax, [esp + 4 + 4]   // Y
   1754     mov        esi, [esp + 4 + 8]   // UV
   1755     mov        edx, [esp + 4 + 12]  // argb
   1756     mov        ecx, [esp + 4 + 16]  // width
   1757     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1758     pxor       xmm4, xmm4
   1759 
   1760     align      16
   1761  convertloop:
   1762     READNV12
   1763     YUVTORGB
   1764 
   1765     // Step 3: Weave into ARGB
   1766     punpcklbw  xmm0, xmm1           // BG
   1767     punpcklbw  xmm2, xmm5           // RA
   1768     movdqa     xmm1, xmm0
   1769     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1770     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1771     movdqa     [edx], xmm0
   1772     movdqa     [edx + 16], xmm1
   1773     lea        edx,  [edx + 32]
   1774     sub        ecx, 8
   1775     jg         convertloop
   1776 
   1777     pop        esi
   1778     ret
   1779   }
   1780 }
   1781 
   1782 // 8 pixels, dest aligned 16.
   1783 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1784 __declspec(naked) __declspec(align(16))
   1785 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
   1786                          const uint8* uv_buf,
   1787                          uint8* argb_buf,
   1788                          int width) {
   1789   __asm {
   1790     push       esi
   1791     mov        eax, [esp + 4 + 4]   // Y
   1792     mov        esi, [esp + 4 + 8]   // VU
   1793     mov        edx, [esp + 4 + 12]  // argb
   1794     mov        ecx, [esp + 4 + 16]  // width
   1795     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1796     pxor       xmm4, xmm4
   1797 
   1798     align      16
   1799  convertloop:
   1800     READNV12
   1801     YVUTORGB
   1802 
   1803     // Step 3: Weave into ARGB
   1804     punpcklbw  xmm0, xmm1           // BG
   1805     punpcklbw  xmm2, xmm5           // RA
   1806     movdqa     xmm1, xmm0
   1807     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1808     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1809     movdqa     [edx], xmm0
   1810     movdqa     [edx + 16], xmm1
   1811     lea        edx,  [edx + 32]
   1812     sub        ecx, 8
   1813     jg         convertloop
   1814 
   1815     pop        esi
   1816     ret
   1817   }
   1818 }
   1819 
   1820 // 8 pixels, unaligned.
   1821 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
   1822 __declspec(naked) __declspec(align(16))
   1823 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1824                                    const uint8* u_buf,
   1825                                    const uint8* v_buf,
   1826                                    uint8* argb_buf,
   1827                                    int width) {
   1828   __asm {
   1829     push       esi
   1830     push       edi
   1831     mov        eax, [esp + 8 + 4]   // Y
   1832     mov        esi, [esp + 8 + 8]   // U
   1833     mov        edi, [esp + 8 + 12]  // V
   1834     mov        edx, [esp + 8 + 16]  // argb
   1835     mov        ecx, [esp + 8 + 20]  // width
   1836     sub        edi, esi
   1837     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1838     pxor       xmm4, xmm4
   1839 
   1840     align      16
   1841  convertloop:
   1842     READYUV444
   1843     YUVTORGB
   1844 
   1845     // Step 3: Weave into ARGB
   1846     punpcklbw  xmm0, xmm1           // BG
   1847     punpcklbw  xmm2, xmm5           // RA
   1848     movdqa     xmm1, xmm0
   1849     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1850     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1851     movdqu     [edx], xmm0
   1852     movdqu     [edx + 16], xmm1
   1853     lea        edx,  [edx + 32]
   1854     sub        ecx, 8
   1855     jg         convertloop
   1856 
   1857     pop        edi
   1858     pop        esi
   1859     ret
   1860   }
   1861 }
   1862 
   1863 // 8 pixels, unaligned.
   1864 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1865 __declspec(naked) __declspec(align(16))
   1866 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1867                                    const uint8* u_buf,
   1868                                    const uint8* v_buf,
   1869                                    uint8* argb_buf,
   1870                                    int width) {
   1871   __asm {
   1872     push       esi
   1873     push       edi
   1874     mov        eax, [esp + 8 + 4]   // Y
   1875     mov        esi, [esp + 8 + 8]   // U
   1876     mov        edi, [esp + 8 + 12]  // V
   1877     mov        edx, [esp + 8 + 16]  // argb
   1878     mov        ecx, [esp + 8 + 20]  // width
   1879     sub        edi, esi
   1880     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1881     pxor       xmm4, xmm4
   1882 
   1883     align      16
   1884  convertloop:
   1885     READYUV422
   1886     YUVTORGB
   1887 
   1888     // Step 3: Weave into ARGB
   1889     punpcklbw  xmm0, xmm1           // BG
   1890     punpcklbw  xmm2, xmm5           // RA
   1891     movdqa     xmm1, xmm0
   1892     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1893     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1894     movdqu     [edx], xmm0
   1895     movdqu     [edx + 16], xmm1
   1896     lea        edx,  [edx + 32]
   1897     sub        ecx, 8
   1898     jg         convertloop
   1899 
   1900     pop        edi
   1901     pop        esi
   1902     ret
   1903   }
   1904 }
   1905 
   1906 // 8 pixels, unaligned.
   1907 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1908 // Similar to I420 but duplicate UV once more.
   1909 __declspec(naked) __declspec(align(16))
   1910 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1911                                    const uint8* u_buf,
   1912                                    const uint8* v_buf,
   1913                                    uint8* argb_buf,
   1914                                    int width) {
   1915   __asm {
   1916     push       esi
   1917     push       edi
   1918     mov        eax, [esp + 8 + 4]   // Y
   1919     mov        esi, [esp + 8 + 8]   // U
   1920     mov        edi, [esp + 8 + 12]  // V
   1921     mov        edx, [esp + 8 + 16]  // argb
   1922     mov        ecx, [esp + 8 + 20]  // width
   1923     sub        edi, esi
   1924     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1925     pxor       xmm4, xmm4
   1926 
   1927     align      16
   1928  convertloop:
   1929     READYUV411
   1930     YUVTORGB
   1931 
   1932     // Step 3: Weave into ARGB
   1933     punpcklbw  xmm0, xmm1           // BG
   1934     punpcklbw  xmm2, xmm5           // RA
   1935     movdqa     xmm1, xmm0
   1936     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1937     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1938     movdqu     [edx], xmm0
   1939     movdqu     [edx + 16], xmm1
   1940     lea        edx,  [edx + 32]
   1941     sub        ecx, 8
   1942     jg         convertloop
   1943 
   1944     pop        edi
   1945     pop        esi
   1946     ret
   1947   }
   1948 }
   1949 
   1950 
   1951 // 8 pixels, unaligned.
   1952 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1953 __declspec(naked) __declspec(align(16))
   1954 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1955                                    const uint8* uv_buf,
   1956                                    uint8* argb_buf,
   1957                                    int width) {
   1958   __asm {
   1959     push       esi
   1960     mov        eax, [esp + 4 + 4]   // Y
   1961     mov        esi, [esp + 4 + 8]   // UV
   1962     mov        edx, [esp + 4 + 12]  // argb
   1963     mov        ecx, [esp + 4 + 16]  // width
   1964     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   1965     pxor       xmm4, xmm4
   1966 
   1967     align      16
   1968  convertloop:
   1969     READNV12
   1970     YUVTORGB
   1971 
   1972     // Step 3: Weave into ARGB
   1973     punpcklbw  xmm0, xmm1           // BG
   1974     punpcklbw  xmm2, xmm5           // RA
   1975     movdqa     xmm1, xmm0
   1976     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   1977     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   1978     movdqu     [edx], xmm0
   1979     movdqu     [edx + 16], xmm1
   1980     lea        edx,  [edx + 32]
   1981     sub        ecx, 8
   1982     jg         convertloop
   1983 
   1984     pop        esi
   1985     ret
   1986   }
   1987 }
   1988 
   1989 // 8 pixels, unaligned.
   1990 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1991 __declspec(naked) __declspec(align(16))
   1992 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1993                                    const uint8* uv_buf,
   1994                                    uint8* argb_buf,
   1995                                    int width) {
   1996   __asm {
   1997     push       esi
   1998     mov        eax, [esp + 4 + 4]   // Y
   1999     mov        esi, [esp + 4 + 8]   // VU
   2000     mov        edx, [esp + 4 + 12]  // argb
   2001     mov        ecx, [esp + 4 + 16]  // width
   2002     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2003     pxor       xmm4, xmm4
   2004 
   2005     align      16
   2006  convertloop:
   2007     READNV12
   2008     YVUTORGB
   2009 
   2010     // Step 3: Weave into ARGB
   2011     punpcklbw  xmm0, xmm1           // BG
   2012     punpcklbw  xmm2, xmm5           // RA
   2013     movdqa     xmm1, xmm0
   2014     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
   2015     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
   2016     movdqu     [edx], xmm0
   2017     movdqu     [edx + 16], xmm1
   2018     lea        edx,  [edx + 32]
   2019     sub        ecx, 8
   2020     jg         convertloop
   2021 
   2022     pop        esi
   2023     ret
   2024   }
   2025 }
   2026 
   2027 __declspec(naked) __declspec(align(16))
   2028 void I422ToBGRARow_SSSE3(const uint8* y_buf,
   2029                          const uint8* u_buf,
   2030                          const uint8* v_buf,
   2031                          uint8* bgra_buf,
   2032                          int width) {
   2033   __asm {
   2034     push       esi
   2035     push       edi
   2036     mov        eax, [esp + 8 + 4]   // Y
   2037     mov        esi, [esp + 8 + 8]   // U
   2038     mov        edi, [esp + 8 + 12]  // V
   2039     mov        edx, [esp + 8 + 16]  // bgra
   2040     mov        ecx, [esp + 8 + 20]  // width
   2041     sub        edi, esi
   2042     pxor       xmm4, xmm4
   2043 
   2044     align      16
   2045  convertloop:
   2046     READYUV422
   2047     YUVTORGB
   2048 
   2049     // Step 3: Weave into BGRA
   2050     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2051     punpcklbw  xmm1, xmm0           // GB
   2052     punpcklbw  xmm5, xmm2           // AR
   2053     movdqa     xmm0, xmm5
   2054     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
   2055     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
   2056     movdqa     [edx], xmm5
   2057     movdqa     [edx + 16], xmm0
   2058     lea        edx,  [edx + 32]
   2059     sub        ecx, 8
   2060     jg         convertloop
   2061 
   2062     pop        edi
   2063     pop        esi
   2064     ret
   2065   }
   2066 }
   2067 
   2068 __declspec(naked) __declspec(align(16))
   2069 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
   2070                                    const uint8* u_buf,
   2071                                    const uint8* v_buf,
   2072                                    uint8* bgra_buf,
   2073                                    int width) {
   2074   __asm {
   2075     push       esi
   2076     push       edi
   2077     mov        eax, [esp + 8 + 4]   // Y
   2078     mov        esi, [esp + 8 + 8]   // U
   2079     mov        edi, [esp + 8 + 12]  // V
   2080     mov        edx, [esp + 8 + 16]  // bgra
   2081     mov        ecx, [esp + 8 + 20]  // width
   2082     sub        edi, esi
   2083     pxor       xmm4, xmm4
   2084 
   2085     align      16
   2086  convertloop:
   2087     READYUV422
   2088     YUVTORGB
   2089 
   2090     // Step 3: Weave into BGRA
   2091     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2092     punpcklbw  xmm1, xmm0           // GB
   2093     punpcklbw  xmm5, xmm2           // AR
   2094     movdqa     xmm0, xmm5
   2095     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
   2096     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
   2097     movdqu     [edx], xmm5
   2098     movdqu     [edx + 16], xmm0
   2099     lea        edx,  [edx + 32]
   2100     sub        ecx, 8
   2101     jg         convertloop
   2102 
   2103     pop        edi
   2104     pop        esi
   2105     ret
   2106   }
   2107 }
   2108 
   2109 __declspec(naked) __declspec(align(16))
   2110 void I422ToABGRRow_SSSE3(const uint8* y_buf,
   2111                          const uint8* u_buf,
   2112                          const uint8* v_buf,
   2113                          uint8* abgr_buf,
   2114                          int width) {
   2115   __asm {
   2116     push       esi
   2117     push       edi
   2118     mov        eax, [esp + 8 + 4]   // Y
   2119     mov        esi, [esp + 8 + 8]   // U
   2120     mov        edi, [esp + 8 + 12]  // V
   2121     mov        edx, [esp + 8 + 16]  // abgr
   2122     mov        ecx, [esp + 8 + 20]  // width
   2123     sub        edi, esi
   2124     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2125     pxor       xmm4, xmm4
   2126 
   2127     align      16
   2128  convertloop:
   2129     READYUV422
   2130     YUVTORGB
   2131 
   2132     // Step 3: Weave into ARGB
   2133     punpcklbw  xmm2, xmm1           // RG
   2134     punpcklbw  xmm0, xmm5           // BA
   2135     movdqa     xmm1, xmm2
   2136     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
   2137     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
   2138     movdqa     [edx], xmm2
   2139     movdqa     [edx + 16], xmm1
   2140     lea        edx,  [edx + 32]
   2141     sub        ecx, 8
   2142     jg         convertloop
   2143 
   2144     pop        edi
   2145     pop        esi
   2146     ret
   2147   }
   2148 }
   2149 
   2150 __declspec(naked) __declspec(align(16))
   2151 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
   2152                                    const uint8* u_buf,
   2153                                    const uint8* v_buf,
   2154                                    uint8* abgr_buf,
   2155                                    int width) {
   2156   __asm {
   2157     push       esi
   2158     push       edi
   2159     mov        eax, [esp + 8 + 4]   // Y
   2160     mov        esi, [esp + 8 + 8]   // U
   2161     mov        edi, [esp + 8 + 12]  // V
   2162     mov        edx, [esp + 8 + 16]  // abgr
   2163     mov        ecx, [esp + 8 + 20]  // width
   2164     sub        edi, esi
   2165     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2166     pxor       xmm4, xmm4
   2167 
   2168     align      16
   2169  convertloop:
   2170     READYUV422
   2171     YUVTORGB
   2172 
   2173     // Step 3: Weave into ARGB
   2174     punpcklbw  xmm2, xmm1           // RG
   2175     punpcklbw  xmm0, xmm5           // BA
   2176     movdqa     xmm1, xmm2
   2177     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
   2178     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
   2179     movdqu     [edx], xmm2
   2180     movdqu     [edx + 16], xmm1
   2181     lea        edx,  [edx + 32]
   2182     sub        ecx, 8
   2183     jg         convertloop
   2184 
   2185     pop        edi
   2186     pop        esi
   2187     ret
   2188   }
   2189 }
   2190 
   2191 __declspec(naked) __declspec(align(16))
   2192 void I422ToRGBARow_SSSE3(const uint8* y_buf,
   2193                          const uint8* u_buf,
   2194                          const uint8* v_buf,
   2195                          uint8* rgba_buf,
   2196                          int width) {
   2197   __asm {
   2198     push       esi
   2199     push       edi
   2200     mov        eax, [esp + 8 + 4]   // Y
   2201     mov        esi, [esp + 8 + 8]   // U
   2202     mov        edi, [esp + 8 + 12]  // V
   2203     mov        edx, [esp + 8 + 16]  // rgba
   2204     mov        ecx, [esp + 8 + 20]  // width
   2205     sub        edi, esi
   2206     pxor       xmm4, xmm4
   2207 
   2208     align      16
   2209  convertloop:
   2210     READYUV422
   2211     YUVTORGB
   2212 
   2213     // Step 3: Weave into RGBA
   2214     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2215     punpcklbw  xmm1, xmm2           // GR
   2216     punpcklbw  xmm5, xmm0           // AB
   2217     movdqa     xmm0, xmm5
   2218     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
   2219     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
   2220     movdqa     [edx], xmm5
   2221     movdqa     [edx + 16], xmm0
   2222     lea        edx,  [edx + 32]
   2223     sub        ecx, 8
   2224     jg         convertloop
   2225 
   2226     pop        edi
   2227     pop        esi
   2228     ret
   2229   }
   2230 }
   2231 
   2232 __declspec(naked) __declspec(align(16))
   2233 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
   2234                                    const uint8* u_buf,
   2235                                    const uint8* v_buf,
   2236                                    uint8* rgba_buf,
   2237                                    int width) {
   2238   __asm {
   2239     push       esi
   2240     push       edi
   2241     mov        eax, [esp + 8 + 4]   // Y
   2242     mov        esi, [esp + 8 + 8]   // U
   2243     mov        edi, [esp + 8 + 12]  // V
   2244     mov        edx, [esp + 8 + 16]  // rgba
   2245     mov        ecx, [esp + 8 + 20]  // width
   2246     sub        edi, esi
   2247     pxor       xmm4, xmm4
   2248 
   2249     align      16
   2250  convertloop:
   2251     READYUV422
   2252     YUVTORGB
   2253 
   2254     // Step 3: Weave into RGBA
   2255     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
   2256     punpcklbw  xmm1, xmm2           // GR
   2257     punpcklbw  xmm5, xmm0           // AB
   2258     movdqa     xmm0, xmm5
   2259     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
   2260     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
   2261     movdqu     [edx], xmm5
   2262     movdqu     [edx + 16], xmm0
   2263     lea        edx,  [edx + 32]
   2264     sub        ecx, 8
   2265     jg         convertloop
   2266 
   2267     pop        edi
   2268     pop        esi
   2269     ret
   2270   }
   2271 }
   2272 
   2273 #endif  // HAS_I422TOARGBROW_SSSE3
   2274 
   2275 #ifdef HAS_YTOARGBROW_SSE2
   2276 __declspec(naked) __declspec(align(16))
   2277 void YToARGBRow_SSE2(const uint8* y_buf,
   2278                      uint8* rgb_buf,
   2279                      int width) {
   2280   __asm {
   2281     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
   2282     pslld      xmm4, 24
   2283     mov        eax, 0x10001000
   2284     movd       xmm3, eax
   2285     pshufd     xmm3, xmm3, 0
   2286     mov        eax, 0x012a012a
   2287     movd       xmm2, eax
   2288     pshufd     xmm2, xmm2, 0
   2289     mov        eax, [esp + 4]       // Y
   2290     mov        edx, [esp + 8]       // rgb
   2291     mov        ecx, [esp + 12]      // width
   2292 
   2293     align      16
   2294  convertloop:
   2295     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   2296     movq       xmm0, qword ptr [eax]
   2297     lea        eax, [eax + 8]
   2298     punpcklbw  xmm0, xmm0           // Y.Y
   2299     psubusw    xmm0, xmm3
   2300     pmulhuw    xmm0, xmm2
   2301     packuswb   xmm0, xmm0           // G
   2302 
   2303     // Step 2: Weave into ARGB
   2304     punpcklbw  xmm0, xmm0           // GG
   2305     movdqa     xmm1, xmm0
   2306     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
   2307     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
   2308     por        xmm0, xmm4
   2309     por        xmm1, xmm4
   2310     movdqa     [edx], xmm0
   2311     movdqa     [edx + 16], xmm1
   2312     lea        edx,  [edx + 32]
   2313     sub        ecx, 8
   2314     jg         convertloop
   2315 
   2316     ret
   2317   }
   2318 }
   2319 #endif  // HAS_YTOARGBROW_SSE2
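
// Scalar sketch of the Y expansion above (illustrative helper only, not
// library code): punpcklbw xmm0, xmm0 turns each byte y into the 16 bit
// value y * 0x101, psubusw subtracts 0x1000 with unsigned saturation, and
// pmulhuw keeps the high 16 bits of the multiply by 0x012a (298), giving
// approximately (y - 16) * 1.164.
static __inline uint8 YToGray_Sketch(uint8 y) {
  unsigned int v = y * 0x101u;
  v = v < 0x1000u ? 0u : v - 0x1000u;   // psubusw
  v = (v * 0x012au) >> 16;              // pmulhuw
  return (uint8)(v > 255u ? 255u : v);  // packuswb
}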
   2320 
   2321 #ifdef HAS_MIRRORROW_SSSE3
   2322 
   2323 // Shuffle table for reversing the bytes.
   2324 static const uvec8 kShuffleMirror = {
   2325   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   2326 };
   2327 
   2328 __declspec(naked) __declspec(align(16))
   2329 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   2330 __asm {
   2331     mov       eax, [esp + 4]   // src
   2332     mov       edx, [esp + 8]   // dst
   2333     mov       ecx, [esp + 12]  // width
   2334     movdqa    xmm5, kShuffleMirror
   2335     lea       eax, [eax - 16]
   2336 
   2337     align      16
   2338  convertloop:
   2339     movdqa    xmm0, [eax + ecx]
   2340     pshufb    xmm0, xmm5
   2341     sub       ecx, 16
   2342     movdqa    [edx], xmm0
   2343     lea       edx, [edx + 16]
   2344     jg        convertloop
   2345     ret
   2346   }
   2347 }
   2348 #endif  // HAS_MIRRORROW_SSSE3
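
// Scalar reference for the mirror rows (illustration only): the SIMD
// versions read 16 bytes from the tail of the row and reverse them with
// pshufb or shift/shuffle sequences.
static __inline void MirrorRow_Sketch(const uint8* src, uint8* dst,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}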
   2349 
   2350 #ifdef HAS_MIRRORROW_SSE2
   2351 // The SSE2 version uses movdqu, so it can be used on unaligned buffers when
   2352 // the SSSE3 version cannot.
   2353 __declspec(naked) __declspec(align(16))
   2354 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   2355 __asm {
   2356     mov       eax, [esp + 4]   // src
   2357     mov       edx, [esp + 8]   // dst
   2358     mov       ecx, [esp + 12]  // width
   2359     lea       eax, [eax - 16]
   2360 
   2361     align      16
   2362  convertloop:
   2363     movdqu    xmm0, [eax + ecx]
   2364     movdqa    xmm1, xmm0        // swap bytes
   2365     psllw     xmm0, 8
   2366     psrlw     xmm1, 8
   2367     por       xmm0, xmm1
   2368     pshuflw   xmm0, xmm0, 0x1b  // swap words
   2369     pshufhw   xmm0, xmm0, 0x1b
   2370     pshufd    xmm0, xmm0, 0x4e  // swap qwords
   2371     sub       ecx, 16
   2372     movdqu    [edx], xmm0
   2373     lea       edx, [edx + 16]
   2374     jg        convertloop
   2375     ret
   2376   }
   2377 }
   2378 #endif  // HAS_MIRRORROW_SSE2
   2379 
   2380 #ifdef HAS_MIRRORROW_UV_SSSE3
   2381 // Shuffle table for reversing the bytes of UV channels.
   2382 static const uvec8 kShuffleMirrorUV = {
   2383   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
   2384 };
   2385 
   2386 __declspec(naked) __declspec(align(16))
   2387 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
   2388                        int width) {
   2389   __asm {
   2390     push      edi
   2391     mov       eax, [esp + 4 + 4]   // src
   2392     mov       edx, [esp + 4 + 8]   // dst_u
   2393     mov       edi, [esp + 4 + 12]  // dst_v
   2394     mov       ecx, [esp + 4 + 16]  // width
   2395     movdqa    xmm1, kShuffleMirrorUV
   2396     lea       eax, [eax + ecx * 2 - 16]
   2397     sub       edi, edx
   2398 
   2399     align      16
   2400  convertloop:
   2401     movdqa    xmm0, [eax]
   2402     lea       eax, [eax - 16]
   2403     pshufb    xmm0, xmm1
   2404     sub       ecx, 8
   2405     movlpd    qword ptr [edx], xmm0
   2406     movhpd    qword ptr [edx + edi], xmm0
   2407     lea       edx, [edx + 8]
   2408     jg        convertloop
   2409 
   2410     pop       edi
   2411     ret
   2412   }
   2413 }
   2414 #endif  // HAS_MIRRORROW_UV_SSSE3
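
// Scalar picture of MirrorRowUV (illustration only): kShuffleMirrorUV both
// reverses the 8 UV pairs and splits them into separate U and V halves.
static __inline void MirrorRowUV_Sketch(const uint8* src_uv, uint8* dst_u,
                                        uint8* dst_v, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_u[i] = src_uv[(width - 1 - i) * 2 + 0];
    dst_v[i] = src_uv[(width - 1 - i) * 2 + 1];
  }
}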
   2415 
   2416 #ifdef HAS_ARGBMIRRORROW_SSSE3
   2417 
   2418 // Shuffle table for reversing the bytes.
   2419 static const uvec8 kARGBShuffleMirror = {
   2420   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
   2421 };
   2422 
   2423 __declspec(naked) __declspec(align(16))
   2424 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   2425 __asm {
   2426     mov       eax, [esp + 4]   // src
   2427     mov       edx, [esp + 8]   // dst
   2428     mov       ecx, [esp + 12]  // width
   2429     movdqa    xmm5, kARGBShuffleMirror
   2430     lea       eax, [eax - 16]
   2431 
   2432     align      16
   2433  convertloop:
   2434     movdqa    xmm0, [eax + ecx * 4]
   2435     pshufb    xmm0, xmm5
   2436     sub       ecx, 4
   2437     movdqa    [edx], xmm0
   2438     lea       edx, [edx + 16]
   2439     jg        convertloop
   2440     ret
   2441   }
   2442 }
   2443 #endif  // HAS_ARGBMIRRORROW_SSSE3
   2444 
   2445 #ifdef HAS_SPLITUV_SSE2
   2446 __declspec(naked) __declspec(align(16))
   2447 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   2448   __asm {
   2449     push       edi
   2450     mov        eax, [esp + 4 + 4]    // src_uv
   2451     mov        edx, [esp + 4 + 8]    // dst_u
   2452     mov        edi, [esp + 4 + 12]   // dst_v
   2453     mov        ecx, [esp + 4 + 16]   // pix
   2454     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2455     psrlw      xmm5, 8
   2456     sub        edi, edx
   2457 
   2458     align      16
   2459   convertloop:
   2460     movdqa     xmm0, [eax]
   2461     movdqa     xmm1, [eax + 16]
   2462     lea        eax,  [eax + 32]
   2463     movdqa     xmm2, xmm0
   2464     movdqa     xmm3, xmm1
   2465     pand       xmm0, xmm5   // even bytes
   2466     pand       xmm1, xmm5
   2467     packuswb   xmm0, xmm1
   2468     psrlw      xmm2, 8      // odd bytes
   2469     psrlw      xmm3, 8
   2470     packuswb   xmm2, xmm3
   2471     movdqa     [edx], xmm0
   2472     movdqa     [edx + edi], xmm2
   2473     lea        edx, [edx + 16]
   2474     sub        ecx, 16
   2475     jg         convertloop
   2476 
   2477     pop        edi
   2478     ret
   2479   }
   2480 }
   2481 #endif  // HAS_SPLITUV_SSE2
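
// Scalar equivalent of SplitUV (illustration only): deinterleave even bytes
// to U and odd bytes to V, which the pand and psrlw + packuswb pairs above
// do 16 pixels at a time.
static __inline void SplitUV_Sketch(const uint8* src_uv, uint8* dst_u,
                                    uint8* dst_v, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[i * 2 + 0];  // even bytes
    dst_v[i] = src_uv[i * 2 + 1];  // odd bytes
  }
}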
   2482 
   2483 #ifdef HAS_COPYROW_SSE2
   2484 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
   2485 __declspec(naked) __declspec(align(16))
   2486 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   2487   __asm {
   2488     mov        eax, [esp + 4]   // src
   2489     mov        edx, [esp + 8]   // dst
   2490     mov        ecx, [esp + 12]  // count
   2491     sub        edx, eax
   2492 
   2493     align      16
   2494   convertloop:
   2495     movdqa     xmm0, [eax]
   2496     movdqa     xmm1, [eax + 16]
   2497     movdqa     [eax + edx], xmm0
   2498     movdqa     [eax + edx + 16], xmm1
   2499     lea        eax, [eax + 32]
   2500     sub        ecx, 32
   2501     jg         convertloop
   2502     ret
   2503   }
   2504 }
   2505 #endif  // HAS_COPYROW_SSE2
   2506 
   2507 #ifdef HAS_COPYROW_X86
   2508 __declspec(naked) __declspec(align(16))
   2509 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
   2510   __asm {
   2511     mov        eax, esi
   2512     mov        edx, edi
   2513     mov        esi, [esp + 4]   // src
   2514     mov        edi, [esp + 8]   // dst
   2515     mov        ecx, [esp + 12]  // count
   2516     shr        ecx, 2
   2517     rep movsd
   2518     mov        edi, edx
   2519     mov        esi, eax
   2520     ret
   2521   }
   2522 }
   2523 #endif  // HAS_COPYROW_X86
   2524 
   2525 #ifdef HAS_SETROW_X86
   2526 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
   2527 __declspec(naked) __declspec(align(16))
   2528 void SetRow8_X86(uint8* dst, uint32 v32, int count) {
   2529   __asm {
   2530     mov        edx, edi
   2531     mov        edi, [esp + 4]   // dst
   2532     mov        eax, [esp + 8]   // v32
   2533     mov        ecx, [esp + 12]  // count
   2534     shr        ecx, 2
   2535     rep stosd
   2536     mov        edi, edx
   2537     ret
   2538   }
   2539 }
   2540 
   2541 // SetRows32 writes a 32 bit value 'width' times per row, for 'height' rows
   2542 // spaced 'dst_stride' bytes apart.
   2542 __declspec(naked) __declspec(align(16))
   2543 void SetRows32_X86(uint8* dst, uint32 v32, int width,
   2544                    int dst_stride, int height) {
   2545   __asm {
   2546     push       esi
   2547     push       edi
   2548     push       ebp
   2549     mov        edi, [esp + 12 + 4]   // dst
   2550     mov        eax, [esp + 12 + 8]   // v32
   2551     mov        ebp, [esp + 12 + 12]  // width
   2552     mov        edx, [esp + 12 + 16]  // dst_stride
   2553     mov        esi, [esp + 12 + 20]  // height
   2554     lea        ecx, [ebp * 4]
   2555     sub        edx, ecx             // stride - width * 4
   2556 
   2557     align      16
   2558   convertloop:
   2559     mov        ecx, ebp
   2560     rep stosd
   2561     add        edi, edx
   2562     sub        esi, 1
   2563     jg         convertloop
   2564 
   2565     pop        ebp
   2566     pop        edi
   2567     pop        esi
   2568     ret
   2569   }
   2570 }
   2571 #endif  // HAS_SETROW_X86
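
// Scalar sketch of SetRows32_X86 (illustration only; assumes a 4 byte
// aligned destination): 'rep stosd' fills one row of 'width' 32 bit values,
// then edi is advanced by the leftover stride.
static void SetRows32_Sketch(uint8* dst, uint32 v32, int width,
                             int dst_stride, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    uint32* d = (uint32*)dst;
    for (x = 0; x < width; ++x) {
      d[x] = v32;
    }
    dst += dst_stride;  // advance to the next row.
  }
}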
   2572 
   2573 #ifdef HAS_YUY2TOYROW_SSE2
   2574 __declspec(naked) __declspec(align(16))
   2575 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   2576                      uint8* dst_y, int pix) {
   2577   __asm {
   2578     mov        eax, [esp + 4]    // src_yuy2
   2579     mov        edx, [esp + 8]    // dst_y
   2580     mov        ecx, [esp + 12]   // pix
   2581     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
   2582     psrlw      xmm5, 8
   2583 
   2584     align      16
   2585   convertloop:
   2586     movdqa     xmm0, [eax]
   2587     movdqa     xmm1, [eax + 16]
   2588     lea        eax,  [eax + 32]
   2589     pand       xmm0, xmm5   // even bytes are Y
   2590     pand       xmm1, xmm5
   2591     packuswb   xmm0, xmm1
   2592     sub        ecx, 16
   2593     movdqa     [edx], xmm0
   2594     lea        edx, [edx + 16]
   2595     jg         convertloop
   2596     ret
   2597   }
   2598 }
   2599 
   2600 __declspec(naked) __declspec(align(16))
   2601 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   2602                       uint8* dst_u, uint8* dst_v, int pix) {
   2603   __asm {
   2604     push       esi
   2605     push       edi
   2606     mov        eax, [esp + 8 + 4]    // src_yuy2
   2607     mov        esi, [esp + 8 + 8]    // stride_yuy2
   2608     mov        edx, [esp + 8 + 12]   // dst_u
   2609     mov        edi, [esp + 8 + 16]   // dst_v
   2610     mov        ecx, [esp + 8 + 20]   // pix
   2611     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2612     psrlw      xmm5, 8
   2613     sub        edi, edx
   2614 
   2615     align      16
   2616   convertloop:
   2617     movdqa     xmm0, [eax]
   2618     movdqa     xmm1, [eax + 16]
   2619     movdqa     xmm2, [eax + esi]
   2620     movdqa     xmm3, [eax + esi + 16]
   2621     lea        eax,  [eax + 32]
   2622     pavgb      xmm0, xmm2
   2623     pavgb      xmm1, xmm3
   2624     psrlw      xmm0, 8      // YUYV -> UVUV
   2625     psrlw      xmm1, 8
   2626     packuswb   xmm0, xmm1
   2627     movdqa     xmm1, xmm0
   2628     pand       xmm0, xmm5  // U
   2629     packuswb   xmm0, xmm0
   2630     psrlw      xmm1, 8     // V
   2631     packuswb   xmm1, xmm1
   2632     movq       qword ptr [edx], xmm0
   2633     movq       qword ptr [edx + edi], xmm1
   2634     lea        edx, [edx + 8]
   2635     sub        ecx, 16
   2636     jg         convertloop
   2637 
   2638     pop        edi
   2639     pop        esi
   2640     ret
   2641   }
   2642 }
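
// Scalar picture of the YUY2 rows (illustration only): YUY2 packs pixels as
// Y0 U0 Y1 V0, so Y is the even bytes and chroma the odd bytes. The UVRow
// version above also averages two source rows with pavgb before splitting
// U and V; this sketch shows the single-row 422 extraction with 'pix' even.
static __inline void YUY2ToUV422Row_Sketch(const uint8* src_yuy2,
                                           uint8* dst_u, uint8* dst_v,
                                           int pix) {
  int i;
  for (i = 0; i < pix; i += 2) {
    dst_u[i / 2] = src_yuy2[i * 2 + 1];  // U0 from Y0 U0 Y1 V0
    dst_v[i / 2] = src_yuy2[i * 2 + 3];  // V0
  }
}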
   2643 
   2644 __declspec(naked) __declspec(align(16))
   2645 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   2646                          uint8* dst_u, uint8* dst_v, int pix) {
   2647   __asm {
   2648     push       edi
   2649     mov        eax, [esp + 4 + 4]    // src_yuy2
   2650     mov        edx, [esp + 4 + 8]    // dst_u
   2651     mov        edi, [esp + 4 + 12]   // dst_v
   2652     mov        ecx, [esp + 4 + 16]   // pix
   2653     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2654     psrlw      xmm5, 8
   2655     sub        edi, edx
   2656 
   2657     align      16
   2658   convertloop:
   2659     movdqa     xmm0, [eax]
   2660     movdqa     xmm1, [eax + 16]
   2661     lea        eax,  [eax + 32]
   2662     psrlw      xmm0, 8      // YUYV -> UVUV
   2663     psrlw      xmm1, 8
   2664     packuswb   xmm0, xmm1
   2665     movdqa     xmm1, xmm0
   2666     pand       xmm0, xmm5  // U
   2667     packuswb   xmm0, xmm0
   2668     psrlw      xmm1, 8     // V
   2669     packuswb   xmm1, xmm1
   2670     movq       qword ptr [edx], xmm0
   2671     movq       qword ptr [edx + edi], xmm1
   2672     lea        edx, [edx + 8]
   2673     sub        ecx, 16
   2674     jg         convertloop
   2675 
   2676     pop        edi
   2677     ret
   2678   }
   2679 }
   2680 
   2681 __declspec(naked) __declspec(align(16))
   2682 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
   2683                                uint8* dst_y, int pix) {
   2684   __asm {
   2685     mov        eax, [esp + 4]    // src_yuy2
   2686     mov        edx, [esp + 8]    // dst_y
   2687     mov        ecx, [esp + 12]   // pix
   2688     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
   2689     psrlw      xmm5, 8
   2690 
   2691     align      16
   2692   convertloop:
   2693     movdqu     xmm0, [eax]
   2694     movdqu     xmm1, [eax + 16]
   2695     lea        eax,  [eax + 32]
   2696     pand       xmm0, xmm5   // even bytes are Y
   2697     pand       xmm1, xmm5
   2698     packuswb   xmm0, xmm1
   2699     sub        ecx, 16
   2700     movdqu     [edx], xmm0
   2701     lea        edx, [edx + 16]
   2702     jg         convertloop
   2703     ret
   2704   }
   2705 }
   2706 
   2707 __declspec(naked) __declspec(align(16))
   2708 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
   2709                                 uint8* dst_u, uint8* dst_v, int pix) {
   2710   __asm {
   2711     push       esi
   2712     push       edi
   2713     mov        eax, [esp + 8 + 4]    // src_yuy2
   2714     mov        esi, [esp + 8 + 8]    // stride_yuy2
   2715     mov        edx, [esp + 8 + 12]   // dst_u
   2716     mov        edi, [esp + 8 + 16]   // dst_v
   2717     mov        ecx, [esp + 8 + 20]   // pix
   2718     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2719     psrlw      xmm5, 8
   2720     sub        edi, edx
   2721 
   2722     align      16
   2723   convertloop:
   2724     movdqu     xmm0, [eax]
   2725     movdqu     xmm1, [eax + 16]
   2726     movdqu     xmm2, [eax + esi]
   2727     movdqu     xmm3, [eax + esi + 16]
   2728     lea        eax,  [eax + 32]
   2729     pavgb      xmm0, xmm2
   2730     pavgb      xmm1, xmm3
   2731     psrlw      xmm0, 8      // YUYV -> UVUV
   2732     psrlw      xmm1, 8
   2733     packuswb   xmm0, xmm1
   2734     movdqa     xmm1, xmm0
   2735     pand       xmm0, xmm5  // U
   2736     packuswb   xmm0, xmm0
   2737     psrlw      xmm1, 8     // V
   2738     packuswb   xmm1, xmm1
   2739     movq       qword ptr [edx], xmm0
   2740     movq       qword ptr [edx + edi], xmm1
   2741     lea        edx, [edx + 8]
   2742     sub        ecx, 16
   2743     jg         convertloop
   2744 
   2745     pop        edi
   2746     pop        esi
   2747     ret
   2748   }
   2749 }
   2750 
   2751 __declspec(naked) __declspec(align(16))
   2752 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
   2753                                    uint8* dst_u, uint8* dst_v, int pix) {
   2754   __asm {
   2755     push       edi
   2756     mov        eax, [esp + 4 + 4]    // src_yuy2
   2757     mov        edx, [esp + 4 + 8]    // dst_u
   2758     mov        edi, [esp + 4 + 12]   // dst_v
   2759     mov        ecx, [esp + 4 + 16]   // pix
   2760     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2761     psrlw      xmm5, 8
   2762     sub        edi, edx
   2763 
   2764     align      16
   2765   convertloop:
   2766     movdqu     xmm0, [eax]
   2767     movdqu     xmm1, [eax + 16]
   2768     lea        eax,  [eax + 32]
   2769     psrlw      xmm0, 8      // YUYV -> UVUV
   2770     psrlw      xmm1, 8
   2771     packuswb   xmm0, xmm1
   2772     movdqa     xmm1, xmm0
   2773     pand       xmm0, xmm5  // U
   2774     packuswb   xmm0, xmm0
   2775     psrlw      xmm1, 8     // V
   2776     packuswb   xmm1, xmm1
   2777     movq       qword ptr [edx], xmm0
   2778     movq       qword ptr [edx + edi], xmm1
   2779     lea        edx, [edx + 8]
   2780     sub        ecx, 16
   2781     jg         convertloop
   2782 
   2783     pop        edi
   2784     ret
   2785   }
   2786 }
   2787 
   2788 __declspec(naked) __declspec(align(16))
   2789 void UYVYToYRow_SSE2(const uint8* src_uyvy,
   2790                      uint8* dst_y, int pix) {
   2791   __asm {
   2792     mov        eax, [esp + 4]    // src_uyvy
   2793     mov        edx, [esp + 8]    // dst_y
   2794     mov        ecx, [esp + 12]   // pix
   2795 
   2796     align      16
   2797   convertloop:
   2798     movdqa     xmm0, [eax]
   2799     movdqa     xmm1, [eax + 16]
   2800     lea        eax,  [eax + 32]
   2801     psrlw      xmm0, 8    // odd bytes are Y
   2802     psrlw      xmm1, 8
   2803     packuswb   xmm0, xmm1
   2804     sub        ecx, 16
   2805     movdqa     [edx], xmm0
   2806     lea        edx, [edx + 16]
   2807     jg         convertloop
   2808     ret
   2809   }
   2810 }
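
// UYVY swaps the packing to U0 Y0 V0 Y1 (illustrative scalar sketch, not
// library code): Y is the odd bytes, which is why these rows use psrlw
// where the YUY2 rows use pand.
static __inline void UYVYToYRow_Sketch(const uint8* src_uyvy, uint8* dst_y,
                                       int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_y[i] = src_uyvy[i * 2 + 1];  // odd bytes are Y
  }
}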
   2811 
   2812 __declspec(naked) __declspec(align(16))
   2813 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   2814                       uint8* dst_u, uint8* dst_v, int pix) {
   2815   __asm {
   2816     push       esi
   2817     push       edi
   2818     mov        eax, [esp + 8 + 4]    // src_uyvy
   2819     mov        esi, [esp + 8 + 8]    // stride_uyvy
   2820     mov        edx, [esp + 8 + 12]   // dst_u
   2821     mov        edi, [esp + 8 + 16]   // dst_v
   2822     mov        ecx, [esp + 8 + 20]   // pix
   2823     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2824     psrlw      xmm5, 8
   2825     sub        edi, edx
   2826 
   2827     align      16
   2828   convertloop:
   2829     movdqa     xmm0, [eax]
   2830     movdqa     xmm1, [eax + 16]
   2831     movdqa     xmm2, [eax + esi]
   2832     movdqa     xmm3, [eax + esi + 16]
   2833     lea        eax,  [eax + 32]
   2834     pavgb      xmm0, xmm2
   2835     pavgb      xmm1, xmm3
   2836     pand       xmm0, xmm5   // UYVY -> UVUV
   2837     pand       xmm1, xmm5
   2838     packuswb   xmm0, xmm1
   2839     movdqa     xmm1, xmm0
   2840     pand       xmm0, xmm5  // U
   2841     packuswb   xmm0, xmm0
   2842     psrlw      xmm1, 8     // V
   2843     packuswb   xmm1, xmm1
   2844     movq       qword ptr [edx], xmm0
   2845     movq       qword ptr [edx + edi], xmm1
   2846     lea        edx, [edx + 8]
   2847     sub        ecx, 16
   2848     jg         convertloop
   2849 
   2850     pop        edi
   2851     pop        esi
   2852     ret
   2853   }
   2854 }
   2855 
   2856 __declspec(naked) __declspec(align(16))
   2857 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
   2858                          uint8* dst_u, uint8* dst_v, int pix) {
   2859   __asm {
   2860     push       edi
   2861     mov        eax, [esp + 4 + 4]    // src_uyvy
   2862     mov        edx, [esp + 4 + 8]    // dst_u
   2863     mov        edi, [esp + 4 + 12]   // dst_v
   2864     mov        ecx, [esp + 4 + 16]   // pix
   2865     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2866     psrlw      xmm5, 8
   2867     sub        edi, edx
   2868 
   2869     align      16
   2870   convertloop:
   2871     movdqa     xmm0, [eax]
   2872     movdqa     xmm1, [eax + 16]
   2873     lea        eax,  [eax + 32]
   2874     pand       xmm0, xmm5   // UYVY -> UVUV
   2875     pand       xmm1, xmm5
   2876     packuswb   xmm0, xmm1
   2877     movdqa     xmm1, xmm0
   2878     pand       xmm0, xmm5  // U
   2879     packuswb   xmm0, xmm0
   2880     psrlw      xmm1, 8     // V
   2881     packuswb   xmm1, xmm1
   2882     movq       qword ptr [edx], xmm0
   2883     movq       qword ptr [edx + edi], xmm1
   2884     lea        edx, [edx + 8]
   2885     sub        ecx, 16
   2886     jg         convertloop
   2887 
   2888     pop        edi
   2889     ret
   2890   }
   2891 }
   2892 
   2893 __declspec(naked) __declspec(align(16))
   2894 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
   2895                                uint8* dst_y, int pix) {
   2896   __asm {
   2897     mov        eax, [esp + 4]    // src_uyvy
   2898     mov        edx, [esp + 8]    // dst_y
   2899     mov        ecx, [esp + 12]   // pix
   2900 
   2901     align      16
   2902   convertloop:
   2903     movdqu     xmm0, [eax]
   2904     movdqu     xmm1, [eax + 16]
   2905     lea        eax,  [eax + 32]
   2906     psrlw      xmm0, 8    // odd bytes are Y
   2907     psrlw      xmm1, 8
   2908     packuswb   xmm0, xmm1
   2909     sub        ecx, 16
   2910     movdqu     [edx], xmm0
   2911     lea        edx, [edx + 16]
   2912     jg         convertloop
   2913     ret
   2914   }
   2915 }
   2916 
   2917 __declspec(naked) __declspec(align(16))
   2918 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
   2919                                 uint8* dst_u, uint8* dst_v, int pix) {
   2920   __asm {
   2921     push       esi
   2922     push       edi
   2923     mov        eax, [esp + 8 + 4]    // src_uyvy
   2924     mov        esi, [esp + 8 + 8]    // stride_uyvy
   2925     mov        edx, [esp + 8 + 12]   // dst_u
   2926     mov        edi, [esp + 8 + 16]   // dst_v
   2927     mov        ecx, [esp + 8 + 20]   // pix
   2928     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2929     psrlw      xmm5, 8
   2930     sub        edi, edx
   2931 
   2932     align      16
   2933   convertloop:
   2934     movdqu     xmm0, [eax]
   2935     movdqu     xmm1, [eax + 16]
   2936     movdqu     xmm2, [eax + esi]
   2937     movdqu     xmm3, [eax + esi + 16]
   2938     lea        eax,  [eax + 32]
   2939     pavgb      xmm0, xmm2
   2940     pavgb      xmm1, xmm3
   2941     pand       xmm0, xmm5   // UYVY -> UVUV
   2942     pand       xmm1, xmm5
   2943     packuswb   xmm0, xmm1
   2944     movdqa     xmm1, xmm0
   2945     pand       xmm0, xmm5  // U
   2946     packuswb   xmm0, xmm0
   2947     psrlw      xmm1, 8     // V
   2948     packuswb   xmm1, xmm1
   2949     movq       qword ptr [edx], xmm0
   2950     movq       qword ptr [edx + edi], xmm1
   2951     lea        edx, [edx + 8]
   2952     sub        ecx, 16
   2953     jg         convertloop
   2954 
   2955     pop        edi
   2956     pop        esi
   2957     ret
   2958   }
   2959 }
   2960 
   2961 __declspec(naked) __declspec(align(16))
   2962 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
   2963                                    uint8* dst_u, uint8* dst_v, int pix) {
   2964   __asm {
   2965     push       edi
   2966     mov        eax, [esp + 4 + 4]    // src_uyvy
   2967     mov        edx, [esp + 4 + 8]    // dst_u
   2968     mov        edi, [esp + 4 + 12]   // dst_v
   2969     mov        ecx, [esp + 4 + 16]   // pix
   2970     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   2971     psrlw      xmm5, 8
   2972     sub        edi, edx
   2973 
   2974     align      16
   2975   convertloop:
   2976     movdqu     xmm0, [eax]
   2977     movdqu     xmm1, [eax + 16]
   2978     lea        eax,  [eax + 32]
   2979     pand       xmm0, xmm5   // UYVY -> UVUV
   2980     pand       xmm1, xmm5
   2981     packuswb   xmm0, xmm1
   2982     movdqa     xmm1, xmm0
   2983     pand       xmm0, xmm5  // U
   2984     packuswb   xmm0, xmm0
   2985     psrlw      xmm1, 8     // V
   2986     packuswb   xmm1, xmm1
   2987     movq       qword ptr [edx], xmm0
   2988     movq       qword ptr [edx + edi], xmm1
   2989     lea        edx, [edx + 8]
   2990     sub        ecx, 16
   2991     jg         convertloop
   2992 
   2993     pop        edi
   2994     ret
   2995   }
   2996 }
   2997 #endif  // HAS_YUY2TOYROW_SSE2
   2998 
   2999 #ifdef HAS_ARGBBLENDROW_SSE2
   3000 // Blend 4 pixels at a time.
   3001 __declspec(naked) __declspec(align(16))
   3002 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   3003                        uint8* dst_argb, int width) {
   3004   __asm {
   3005     push       esi
   3006     mov        eax, [esp + 4 + 4]   // src_argb0
   3007     mov        esi, [esp + 4 + 8]   // src_argb1
   3008     mov        edx, [esp + 4 + 12]  // dst_argb
   3009     mov        ecx, [esp + 4 + 16]  // width
   3010     pcmpeqb    xmm7, xmm7       // generate constant 1
   3011     psrlw      xmm7, 15
   3012     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
   3013     psrlw      xmm6, 8
   3014     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   3015     psllw      xmm5, 8
   3016     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   3017     pslld      xmm4, 24
   3018 
   3019     sub        ecx, 1
   3020     je         convertloop1     // only 1 pixel?
   3021     jl         convertloop1b
   3022 
   3023     // 1 pixel loop until destination pointer is aligned.
   3024   alignloop1:
   3025     test       edx, 15          // aligned?
   3026     je         alignloop1b
   3027     movd       xmm3, [eax]
   3028     lea        eax, [eax + 4]
   3029     movdqa     xmm0, xmm3       // src argb
   3030     pxor       xmm3, xmm4       // ~alpha
   3031     movd       xmm2, [esi]      // _r_b
   3032     psrlw      xmm3, 8          // alpha
   3033     pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
   3034     pshuflw    xmm3, xmm3,0F5h
   3035     pand       xmm2, xmm6       // _r_b
   3036     paddw      xmm3, xmm7       // 256 - alpha
   3037     pmullw     xmm2, xmm3       // _r_b * alpha
   3038     movd       xmm1, [esi]      // _a_g
   3039     lea        esi, [esi + 4]
   3040     psrlw      xmm1, 8          // _a_g
   3041     por        xmm0, xmm4       // set alpha to 255
   3042     pmullw     xmm1, xmm3       // _a_g * alpha
   3043     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   3044     paddusb    xmm0, xmm2       // + src argb
   3045     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   3046     paddusb    xmm0, xmm1       // + src argb
   3047     sub        ecx, 1
   3048     movd       [edx], xmm0
   3049     lea        edx, [edx + 4]
   3050     jge        alignloop1
   3051 
   3052   alignloop1b:
   3053     add        ecx, 1 - 4
   3054     jl         convertloop4b
   3055 
   3056     // 4 pixel loop.
   3057   convertloop4:
   3058     movdqu     xmm3, [eax]      // src argb
   3059     lea        eax, [eax + 16]
   3060     movdqa     xmm0, xmm3       // src argb
   3061     pxor       xmm3, xmm4       // ~alpha
   3062     movdqu     xmm2, [esi]      // _r_b
   3063     psrlw      xmm3, 8          // alpha
   3064     pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
   3065     pshuflw    xmm3, xmm3,0F5h
   3066     pand       xmm2, xmm6       // _r_b
   3067     paddw      xmm3, xmm7       // 256 - alpha
   3068     pmullw     xmm2, xmm3       // _r_b * alpha
   3069     movdqu     xmm1, [esi]      // _a_g
   3070     lea        esi, [esi + 16]
   3071     psrlw      xmm1, 8          // _a_g
   3072     por        xmm0, xmm4       // set alpha to 255
   3073     pmullw     xmm1, xmm3       // _a_g * alpha
   3074     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   3075     paddusb    xmm0, xmm2       // + src argb
   3076     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   3077     paddusb    xmm0, xmm1       // + src argb
   3078     sub        ecx, 4
   3079     movdqa     [edx], xmm0
   3080     lea        edx, [edx + 16]
   3081     jge        convertloop4
   3082 
   3083   convertloop4b:
   3084     add        ecx, 4 - 1
   3085     jl         convertloop1b
   3086 
   3087     // 1 pixel loop.
   3088   convertloop1:
   3089     movd       xmm3, [eax]      // src argb
   3090     lea        eax, [eax + 4]
   3091     movdqa     xmm0, xmm3       // src argb
   3092     pxor       xmm3, xmm4       // ~alpha
   3093     movd       xmm2, [esi]      // _r_b
   3094     psrlw      xmm3, 8          // alpha
   3095     pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
   3096     pshuflw    xmm3, xmm3,0F5h
   3097     pand       xmm2, xmm6       // _r_b
   3098     paddw      xmm3, xmm7       // 256 - alpha
   3099     pmullw     xmm2, xmm3       // _r_b * alpha
   3100     movd       xmm1, [esi]      // _a_g
   3101     lea        esi, [esi + 4]
   3102     psrlw      xmm1, 8          // _a_g
   3103     por        xmm0, xmm4       // set alpha to 255
   3104     pmullw     xmm1, xmm3       // _a_g * alpha
   3105     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   3106     paddusb    xmm0, xmm2       // + src argb
   3107     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   3108     paddusb    xmm0, xmm1       // + src argb
   3109     sub        ecx, 1
   3110     movd       [edx], xmm0
   3111     lea        edx, [edx + 4]
   3112     jge        convertloop1
   3113 
   3114   convertloop1b:
   3115     pop        esi
   3116     ret
   3117   }
   3118 }
   3119 #endif  // HAS_ARGBBLENDROW_SSE2
   3120 
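// For reference, a scalar C sketch of the blend (illustrative only; the
// helper is hypothetical). Both the SSE2 version above and the SSSE3 version
// below compute dst = src0 + src1 * (256 - src0_alpha) / 256 per channel,
// saturated, with the destination alpha forced to 255.
static void ARGBBlendRow_ReferenceC(const uint8* src_argb0,
                                    const uint8* src_argb1,
                                    uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb0[x * 4 + 3];
    int i;
    for (i = 0; i < 3; ++i) {  // B, G, R
      uint32 v = src_argb0[x * 4 + i] +
                 ((src_argb1[x * 4 + i] * (256 - a)) >> 8);
      dst_argb[x * 4 + i] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = 255;  // por xmm0, xmm4 sets alpha to 255.
  }
}
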
   3121 #ifdef HAS_ARGBBLENDROW_SSSE3
   3122 // Shuffle table for isolating alpha.
   3123 static const uvec8 kShuffleAlpha = {
   3124   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
   3125   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
   3126 };
   3127 // Same as SSE2, but replaces:
   3128 //    psrlw      xmm3, 8          // alpha
   3129 //    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
   3130 //    pshuflw    xmm3, xmm3,0F5h
   3131 // with:
   3132 //    pshufb     xmm3, kShuffleAlpha // alpha
   3133 // Blend 4 pixels at a time.
   3134 
   3135 __declspec(naked) __declspec(align(16))
   3136 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
   3137                         uint8* dst_argb, int width) {
   3138   __asm {
   3139     push       esi
   3140     mov        eax, [esp + 4 + 4]   // src_argb0
   3141     mov        esi, [esp + 4 + 8]   // src_argb1
   3142     mov        edx, [esp + 4 + 12]  // dst_argb
   3143     mov        ecx, [esp + 4 + 16]  // width
   3144     pcmpeqb    xmm7, xmm7       // generate constant 1
   3145     psrlw      xmm7, 15
   3146     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
   3147     psrlw      xmm6, 8
   3148     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
   3149     psllw      xmm5, 8
   3150     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   3151     pslld      xmm4, 24
   3152 
   3153     sub        ecx, 1
   3154     je         convertloop1     // only 1 pixel?
   3155     jl         convertloop1b
   3156 
   3157     // 1 pixel loop until destination pointer is aligned.
   3158   alignloop1:
   3159     test       edx, 15          // aligned?
   3160     je         alignloop1b
   3161     movd       xmm3, [eax]
   3162     lea        eax, [eax + 4]
   3163     movdqa     xmm0, xmm3       // src argb
   3164     pxor       xmm3, xmm4       // ~alpha
   3165     movd       xmm2, [esi]      // _r_b
   3166     pshufb     xmm3, kShuffleAlpha // alpha
   3167     pand       xmm2, xmm6       // _r_b
   3168     paddw      xmm3, xmm7       // 256 - alpha
   3169     pmullw     xmm2, xmm3       // _r_b * alpha
   3170     movd       xmm1, [esi]      // _a_g
   3171     lea        esi, [esi + 4]
   3172     psrlw      xmm1, 8          // _a_g
   3173     por        xmm0, xmm4       // set alpha to 255
   3174     pmullw     xmm1, xmm3       // _a_g * alpha
   3175     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   3176     paddusb    xmm0, xmm2       // + src argb
   3177     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   3178     paddusb    xmm0, xmm1       // + src argb
   3179     sub        ecx, 1
   3180     movd       [edx], xmm0
   3181     lea        edx, [edx + 4]
   3182     jge        alignloop1
   3183 
   3184   alignloop1b:
   3185     add        ecx, 1 - 4
   3186     jl         convertloop4b
   3187 
   3188     test       eax, 15          // unaligned?
   3189     jne        convertuloop4
   3190     test       esi, 15          // unaligned?
   3191     jne        convertuloop4
   3192 
   3193     // 4 pixel loop.
   3194   convertloop4:
   3195     movdqa     xmm3, [eax]      // src argb
   3196     lea        eax, [eax + 16]
   3197     movdqa     xmm0, xmm3       // src argb
   3198     pxor       xmm3, xmm4       // ~alpha
   3199     movdqa     xmm2, [esi]      // _r_b
   3200     pshufb     xmm3, kShuffleAlpha // alpha
   3201     pand       xmm2, xmm6       // _r_b
   3202     paddw      xmm3, xmm7       // 256 - alpha
   3203     pmullw     xmm2, xmm3       // _r_b * alpha
   3204     movdqa     xmm1, [esi]      // _a_g
   3205     lea        esi, [esi + 16]
   3206     psrlw      xmm1, 8          // _a_g
   3207     por        xmm0, xmm4       // set alpha to 255
   3208     pmullw     xmm1, xmm3       // _a_g * alpha
   3209     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   3210     paddusb    xmm0, xmm2       // + src argb
   3211     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   3212     paddusb    xmm0, xmm1       // + src argb
   3213     sub        ecx, 4
   3214     movdqa     [edx], xmm0
   3215     lea        edx, [edx + 16]
   3216     jge        convertloop4
   3217     jmp        convertloop4b
   3218 
   3219     // 4 pixel unaligned loop.
   3220   convertuloop4:
   3221     movdqu     xmm3, [eax]      // src argb
   3222     lea        eax, [eax + 16]
   3223     movdqa     xmm0, xmm3       // src argb
   3224     pxor       xmm3, xmm4       // ~alpha
   3225     movdqu     xmm2, [esi]      // _r_b
   3226     pshufb     xmm3, kShuffleAlpha // alpha
   3227     pand       xmm2, xmm6       // _r_b
   3228     paddw      xmm3, xmm7       // 256 - alpha
   3229     pmullw     xmm2, xmm3       // _r_b * alpha
   3230     movdqu     xmm1, [esi]      // _a_g
   3231     lea        esi, [esi + 16]
   3232     psrlw      xmm1, 8          // _a_g
   3233     por        xmm0, xmm4       // set alpha to 255
   3234     pmullw     xmm1, xmm3       // _a_g * alpha
   3235     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   3236     paddusb    xmm0, xmm2       // + src argb
   3237     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   3238     paddusb    xmm0, xmm1       // + src argb
   3239     sub        ecx, 4
   3240     movdqa     [edx], xmm0
   3241     lea        edx, [edx + 16]
   3242     jge        convertuloop4
   3243 
   3244   convertloop4b:
   3245     add        ecx, 4 - 1
   3246     jl         convertloop1b
   3247 
   3248     // 1 pixel loop.
   3249   convertloop1:
   3250     movd       xmm3, [eax]      // src argb
   3251     lea        eax, [eax + 4]
   3252     movdqa     xmm0, xmm3       // src argb
   3253     pxor       xmm3, xmm4       // ~alpha
   3254     movd       xmm2, [esi]      // _r_b
   3255     pshufb     xmm3, kShuffleAlpha // alpha
   3256     pand       xmm2, xmm6       // _r_b
   3257     paddw      xmm3, xmm7       // 256 - alpha
   3258     pmullw     xmm2, xmm3       // _r_b * alpha
   3259     movd       xmm1, [esi]      // _a_g
   3260     lea        esi, [esi + 4]
   3261     psrlw      xmm1, 8          // _a_g
   3262     por        xmm0, xmm4       // set alpha to 255
   3263     pmullw     xmm1, xmm3       // _a_g * alpha
   3264     psrlw      xmm2, 8          // _r_b convert to 8 bits again
   3265     paddusb    xmm0, xmm2       // + src argb
   3266     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
   3267     paddusb    xmm0, xmm1       // + src argb
   3268     sub        ecx, 1
   3269     movd       [edx], xmm0
   3270     lea        edx, [edx + 4]
   3271     jge        convertloop1
   3272 
   3273   convertloop1b:
   3274     pop        esi
   3275     ret
   3276   }
   3277 }
   3278 #endif  // HAS_ARGBBLENDROW_SSSE3
   3279 
   3280 #ifdef HAS_ARGBATTENUATE_SSE2
   3281 // Attenuate 4 pixels at a time.
   3282 // Aligned to 16 bytes.
   3283 __declspec(naked) __declspec(align(16))
   3284 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   3285   __asm {
   3286     mov        eax, [esp + 4]   // src_argb0
   3287     mov        edx, [esp + 8]   // dst_argb
   3288     mov        ecx, [esp + 12]  // width
   3289     sub        edx, eax
   3290     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   3291     pslld      xmm4, 24
   3292     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
   3293     psrld      xmm5, 8
   3294 
   3295     align      16
   3296  convertloop:
   3297     movdqa     xmm0, [eax]      // read 4 pixels
   3298     punpcklbw  xmm0, xmm0       // first 2
   3299     pshufhw    xmm2, xmm0,0FFh  // 8 alpha words
   3300     pshuflw    xmm2, xmm2,0FFh
   3301     pmulhuw    xmm0, xmm2       // rgb * a
   3302     movdqa     xmm1, [eax]      // read 4 pixels
   3303     punpckhbw  xmm1, xmm1       // next 2 pixels
   3304     pshufhw    xmm2, xmm1,0FFh  // 8 alpha words
   3305     pshuflw    xmm2, xmm2,0FFh
   3306     pmulhuw    xmm1, xmm2       // rgb * a
   3307     movdqa     xmm2, [eax]      // alphas
   3308     psrlw      xmm0, 8
   3309     pand       xmm2, xmm4
   3310     psrlw      xmm1, 8
   3311     packuswb   xmm0, xmm1
   3312     pand       xmm0, xmm5       // mask out computed alphas
   3313     por        xmm0, xmm2       // restore original alphas
   3314     sub        ecx, 4
   3315     movdqa     [eax + edx], xmm0
   3316     lea        eax, [eax + 16]
   3317     jg         convertloop
   3318 
   3319     ret
   3320   }
   3321 }
   3322 #endif  // HAS_ARGBATTENUATE_SSE2
   3323 
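// Scalar sketch of attenuation (illustrative only; hypothetical helper).
// The SSE2 version above and the SSSE3 version below both compute
// ((c * 257) * (a * 257)) >> 24, which is very close to c * a / 255.
static void ARGBAttenuateRow_ReferenceC(const uint8* src_argb,
                                        uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[x * 4 + 3];
    int i;
    for (i = 0; i < 3; ++i) {
      uint32 c = src_argb[x * 4 + i];
      dst_argb[x * 4 + i] = (uint8)(((c * 257) * (a * 257)) >> 24);
    }
    dst_argb[x * 4 + 3] = (uint8)a;  // original alpha is preserved.
  }
}
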
   3324 #ifdef HAS_ARGBATTENUATEROW_SSSE3
   3325 // Shuffle table duplicating alpha.
   3326 static const uvec8 kShuffleAlpha0 = {
   3327   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
   3328 };
   3329 static const uvec8 kShuffleAlpha1 = {
   3330   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   3331   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
   3332 };
   3333 __declspec(naked) __declspec(align(16))
   3334 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   3335   __asm {
   3336     mov        eax, [esp + 4]   // src_argb0
   3337     mov        edx, [esp + 8]   // dst_argb
   3338     mov        ecx, [esp + 12]  // width
   3339     sub        edx, eax
   3340     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
   3341     pslld      xmm3, 24
   3342     movdqa     xmm4, kShuffleAlpha0
   3343     movdqa     xmm5, kShuffleAlpha1
   3344 
   3345     align      16
   3346  convertloop:
   3347     movdqa     xmm0, [eax]      // read 4 pixels
   3348     pshufb     xmm0, xmm4       // isolate first 2 alphas
   3349     movdqa     xmm1, [eax]      // read 4 pixels
   3350     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
   3351     pmulhuw    xmm0, xmm1       // rgb * a
   3352     movdqa     xmm1, [eax]      // read 4 pixels
   3353     pshufb     xmm1, xmm5       // isolate next 2 alphas
   3354     movdqa     xmm2, [eax]      // read 4 pixels
   3355     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
   3356     pmulhuw    xmm1, xmm2       // rgb * a
   3357     movdqa     xmm2, [eax]      // mask original alpha
   3358     pand       xmm2, xmm3
   3359     psrlw      xmm0, 8
   3360     psrlw      xmm1, 8
   3361     packuswb   xmm0, xmm1
   3362     por        xmm0, xmm2       // copy original alpha
   3363     sub        ecx, 4
   3364     movdqa     [eax + edx], xmm0
   3365     lea        eax, [eax + 16]
   3366     jg         convertloop
   3367 
   3368     ret
   3369   }
   3370 }
   3371 #endif  // HAS_ARGBATTENUATEROW_SSSE3
   3372 
   3373 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
   3374 // Unattenuate 4 pixels at a time.
   3375 // Aligned to 16 bytes.
   3376 __declspec(naked) __declspec(align(16))
   3377 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   3378                              int width) {
   3379   __asm {
   3380     push       esi
   3381     push       edi
   3382     mov        eax, [esp + 8 + 4]   // src_argb0
   3383     mov        edx, [esp + 8 + 8]   // dst_argb
   3384     mov        ecx, [esp + 8 + 12]  // width
   3385     sub        edx, eax
   3386     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
   3387     pslld      xmm4, 24
   3388 
   3389     align      16
   3390  convertloop:
   3391     movdqa     xmm0, [eax]      // read 4 pixels
   3392     movzx      esi, byte ptr [eax + 3]  // first alpha
   3393     movzx      edi, byte ptr [eax + 7]  // second alpha
   3394     punpcklbw  xmm0, xmm0       // first 2
   3395     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
   3396     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
   3397     pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
   3398     pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
   3399     movlhps    xmm2, xmm3
   3400     pmulhuw    xmm0, xmm2       // rgb * a
   3401 
   3402     movdqa     xmm1, [eax]      // read 4 pixels
   3403     movzx      esi, byte ptr [eax + 11]  // third alpha
   3404     movzx      edi, byte ptr [eax + 15]  // fourth alpha
   3405     punpckhbw  xmm1, xmm1       // next 2
   3406     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
   3407     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
   3408     pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
   3409     pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
   3410     movlhps    xmm2, xmm3
   3411     pmulhuw    xmm1, xmm2       // rgb * a
   3412 
   3413     movdqa     xmm2, [eax]      // alphas
   3414     pand       xmm2, xmm4
   3415     packuswb   xmm0, xmm1
   3416     por        xmm0, xmm2
   3417     sub        ecx, 4
   3418     movdqa     [eax + edx], xmm0
   3419     lea        eax, [eax + 16]
   3420     jg         convertloop
   3421     pop        edi
   3422     pop        esi
   3423     ret
   3424   }
   3425 }
   3426 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
   3427 
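// Conceptual scalar version of unattenuation (illustrative only; this sketch
// uses an exact divide where the code above uses the fixed_invtbl8 reciprocal
// table, so results may differ in the low bits).
static void ARGBUnattenuateRow_ReferenceC(const uint8* src_argb,
                                          uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[x * 4 + 3];
    int i;
    for (i = 0; i < 3; ++i) {
      uint32 v = src_argb[x * 4 + i];
      if (a) {
        v = v * 255 / a;
      }
      dst_argb[x * 4 + i] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = (uint8)a;  // alpha unchanged.
  }
}
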
   3428 #ifdef HAS_ARGBGRAYROW_SSSE3
   3429 // Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
   3430 static const vec8 kARGBToGray = {
   3431   14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
   3432 };
   3433 
   3434 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
   3435 __declspec(naked) __declspec(align(16))
   3436 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   3437   __asm {
   3438     mov        eax, [esp + 4]   /* src_argb */
   3439     mov        edx, [esp + 8]   /* dst_argb */
   3440     mov        ecx, [esp + 12]  /* width */
   3441     movdqa     xmm4, kARGBToGray
   3442     sub        edx, eax
   3443 
   3444     align      16
   3445  convertloop:
   3446     movdqa     xmm0, [eax]  // G
   3447     movdqa     xmm1, [eax + 16]
   3448     pmaddubsw  xmm0, xmm4
   3449     pmaddubsw  xmm1, xmm4
   3450     phaddw     xmm0, xmm1
   3451     psrlw      xmm0, 7
   3452     packuswb   xmm0, xmm0   // 8 G bytes
   3453     movdqa     xmm2, [eax]  // A
   3454     movdqa     xmm3, [eax + 16]
   3455     psrld      xmm2, 24
   3456     psrld      xmm3, 24
   3457     packuswb   xmm2, xmm3
   3458     packuswb   xmm2, xmm2   // 8 A bytes
   3459     movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
   3460     punpcklbw  xmm0, xmm0   // 8 GG words
   3461     punpcklbw  xmm3, xmm2   // 8 GA words
   3462     movdqa     xmm1, xmm0
   3463     punpcklwd  xmm0, xmm3   // GGGA first 4
   3464     punpckhwd  xmm1, xmm3   // GGGA next 4
   3465     sub        ecx, 8
   3466     movdqa     [eax + edx], xmm0
   3467     movdqa     [eax + edx + 16], xmm1
   3468     lea        eax, [eax + 32]
   3469     jg         convertloop
   3470     ret
   3471   }
   3472 }
   3473 #endif  // HAS_ARGBGRAYROW_SSSE3
   3474 
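// Scalar sketch of the gray conversion above (illustrative only;
// hypothetical helper): y = (b * 14 + g * 76 + r * 38) >> 7, replicated to
// B, G and R; the maximum sum is 255 * 128, so no clamp is needed.
static void ARGBGrayRow_ReferenceC(const uint8* src_argb, uint8* dst_argb,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8 y = (uint8)((src_argb[x * 4 + 0] * 14 + src_argb[x * 4 + 1] * 76 +
                       src_argb[x * 4 + 2] * 38) >> 7);
    dst_argb[x * 4 + 0] = y;
    dst_argb[x * 4 + 1] = y;
    dst_argb[x * 4 + 2] = y;
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // alpha unchanged.
  }
}
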
   3475 #ifdef HAS_ARGBSEPIAROW_SSSE3
   3476 //    b = (r * 35 + g * 68 + b * 17) >> 7
   3477 //    g = (r * 45 + g * 88 + b * 22) >> 7
   3478 //    r = (r * 50 + g * 98 + b * 24) >> 7
   3479 // Constant for ARGB color to sepia tone.
   3480 static const vec8 kARGBToSepiaB = {
   3481   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
   3482 };
   3483 
   3484 static const vec8 kARGBToSepiaG = {
   3485   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
   3486 };
   3487 
   3488 static const vec8 kARGBToSepiaR = {
   3489   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
   3490 };
   3491 
   3492 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
   3493 __declspec(naked) __declspec(align(16))
   3494 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   3495   __asm {
   3496     mov        eax, [esp + 4]   /* dst_argb */
   3497     mov        ecx, [esp + 8]   /* width */
   3498     movdqa     xmm2, kARGBToSepiaB
   3499     movdqa     xmm3, kARGBToSepiaG
   3500     movdqa     xmm4, kARGBToSepiaR
   3501 
   3502     align      16
   3503  convertloop:
   3504     movdqa     xmm0, [eax]  // B
   3505     movdqa     xmm6, [eax + 16]
   3506     pmaddubsw  xmm0, xmm2
   3507     pmaddubsw  xmm6, xmm2
   3508     phaddw     xmm0, xmm6
   3509     psrlw      xmm0, 7
   3510     packuswb   xmm0, xmm0   // 8 B values
   3511     movdqa     xmm5, [eax]  // G
   3512     movdqa     xmm1, [eax + 16]
   3513     pmaddubsw  xmm5, xmm3
   3514     pmaddubsw  xmm1, xmm3
   3515     phaddw     xmm5, xmm1
   3516     psrlw      xmm5, 7
   3517     packuswb   xmm5, xmm5   // 8 G values
   3518     punpcklbw  xmm0, xmm5   // 8 BG values
   3519     movdqa     xmm5, [eax]  // R
   3520     movdqa     xmm1, [eax + 16]
   3521     pmaddubsw  xmm5, xmm4
   3522     pmaddubsw  xmm1, xmm4
   3523     phaddw     xmm5, xmm1
   3524     psrlw      xmm5, 7
   3525     packuswb   xmm5, xmm5   // 8 R values
   3526     movdqa     xmm6, [eax]  // A
   3527     movdqa     xmm1, [eax + 16]
   3528     psrld      xmm6, 24
   3529     psrld      xmm1, 24
   3530     packuswb   xmm6, xmm1
   3531     packuswb   xmm6, xmm6   // 8 A values
   3532     punpcklbw  xmm5, xmm6   // 8 RA values
   3533     movdqa     xmm1, xmm0   // Weave BG, RA together
   3534     punpcklwd  xmm0, xmm5   // BGRA first 4
   3535     punpckhwd  xmm1, xmm5   // BGRA next 4
   3536     sub        ecx, 8
   3537     movdqa     [eax], xmm0
   3538     movdqa     [eax + 16], xmm1
   3539     lea        eax, [eax + 32]
   3540     jg         convertloop
   3541     ret
   3542   }
   3543 }
   3544 #endif  // HAS_ARGBSEPIAROW_SSSE3
   3545 
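// Scalar sketch of the sepia transform above (illustrative only;
// hypothetical helper), using the same coefficients. The R sum can exceed
// 255 (up to 255 * 172 >> 7), so each channel is clamped as packuswb does.
static void ARGBSepiaRow_ReferenceC(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[x * 4 + 0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[x * 4 + 1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[x * 4 + 2] = (uint8)(sr > 255 ? 255 : sr);  // alpha untouched.
  }
}
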
   3546 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
   3547 // Transform 8 ARGB pixels (32 bytes) with a color matrix.
   3548 // Same as Sepia except the matrix is provided.
   3549 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
   3550 // and B into high and low halves, then G/A, punpckl/hbw and then punpckl/hwd.
   3551 __declspec(naked) __declspec(align(16))
   3552 void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
   3553                               int width) {
   3554   __asm {
   3555     mov        eax, [esp + 4]   /* dst_argb */
   3556     mov        edx, [esp + 8]   /* matrix_argb */
   3557     mov        ecx, [esp + 12]  /* width */
   3558     movd       xmm2, [edx]
   3559     movd       xmm3, [edx + 4]
   3560     movd       xmm4, [edx + 8]
   3561     pshufd     xmm2, xmm2, 0
   3562     pshufd     xmm3, xmm3, 0
   3563     pshufd     xmm4, xmm4, 0
   3564 
   3565     align      16
   3566  convertloop:
   3567     movdqa     xmm0, [eax]  // B
   3568     movdqa     xmm6, [eax + 16]
   3569     pmaddubsw  xmm0, xmm2
   3570     pmaddubsw  xmm6, xmm2
   3571     movdqa     xmm5, [eax]  // G
   3572     movdqa     xmm1, [eax + 16]
   3573     pmaddubsw  xmm5, xmm3
   3574     pmaddubsw  xmm1, xmm3
   3575     phaddsw    xmm0, xmm6   // B
   3576     phaddsw    xmm5, xmm1   // G
   3577     psraw      xmm0, 7      // B
   3578     psraw      xmm5, 7      // G
   3579     packuswb   xmm0, xmm0   // 8 B values
   3580     packuswb   xmm5, xmm5   // 8 G values
   3581     punpcklbw  xmm0, xmm5   // 8 BG values
   3582     movdqa     xmm5, [eax]  // R
   3583     movdqa     xmm1, [eax + 16]
   3584     pmaddubsw  xmm5, xmm4
   3585     pmaddubsw  xmm1, xmm4
   3586     phaddsw    xmm5, xmm1
   3587     psraw      xmm5, 7
   3588     packuswb   xmm5, xmm5   // 8 R values
   3589     movdqa     xmm6, [eax]  // A
   3590     movdqa     xmm1, [eax + 16]
   3591     psrld      xmm6, 24
   3592     psrld      xmm1, 24
   3593     packuswb   xmm6, xmm1
   3594     packuswb   xmm6, xmm6   // 8 A values
   3595     movdqa     xmm1, xmm0   // Weave BG, RA together
   3596     punpcklbw  xmm5, xmm6   // 8 RA values
   3597     punpcklwd  xmm0, xmm5   // BGRA first 4
   3598     punpckhwd  xmm1, xmm5   // BGRA next 4
   3599     sub        ecx, 8
   3600     movdqa     [eax], xmm0
   3601     movdqa     [eax + 16], xmm1
   3602     lea        eax, [eax + 32]
   3603     jg         convertloop
   3604     ret
   3605   }
   3606 }
   3607 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
   3608 
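// Scalar sketch of the color matrix transform above (illustrative only;
// hypothetical helper, ignoring the intermediate 16 bit saturation of
// pmaddubsw/phaddsw). Per the three movd loads above, matrix_argb holds
// three 4-byte signed rows, one each for the B, G and R outputs; alpha is
// copied through.
static void ARGBColorMatrixRow_ReferenceC(uint8* dst_argb,
                                          const int8* matrix_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    uint8 p[4];
    for (i = 0; i < 4; ++i) p[i] = dst_argb[x * 4 + i];
    for (i = 0; i < 3; ++i) {
      const int8* m = matrix_argb + i * 4;
      int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 7;
      dst_argb[x * 4 + i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    // dst_argb[x * 4 + 3] keeps the original alpha.
  }
}
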
   3609 #ifdef HAS_ARGBCOLORTABLEROW_X86
   3610 // Transform ARGB pixels with a color table.
   3611 __declspec(naked) __declspec(align(16))
   3612 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
   3613                            int width) {
   3614   __asm {
   3615     push       ebx
   3616     push       esi
   3617     push       edi
   3618     push       ebp
   3619     mov        eax, [esp + 16 + 4]   /* dst_argb */
   3620     mov        edi, [esp + 16 + 8]   /* table_argb */
   3621     mov        ecx, [esp + 16 + 12]  /* width */
   3622     xor        ebx, ebx
   3623     xor        edx, edx
   3624 
   3625     align      16
   3626  convertloop:
   3627     mov        ebp, dword ptr [eax]  // BGRA
   3628     mov        esi, ebp
   3629     and        ebp, 255
   3630     shr        esi, 8
   3631     and        esi, 255
   3632     mov        bl, [edi + ebp * 4 + 0]  // B
   3633     mov        dl, [edi + esi * 4 + 1]  // G
   3634     mov        ebp, dword ptr [eax]  // BGRA
   3635     mov        esi, ebp
   3636     shr        ebp, 16
   3637     shr        esi, 24
   3638     and        ebp, 255
   3639     mov        [eax], bl
   3640     mov        [eax + 1], dl
   3641     mov        bl, [edi + ebp * 4 + 2]  // R
   3642     mov        dl, [edi + esi * 4 + 3]  // A
   3643     mov        [eax + 2], bl
   3644     mov        [eax + 3], dl
   3645     lea        eax, [eax + 4]
   3646     sub        ecx, 1
   3647     jg         convertloop
   3648     pop        ebp
   3649     pop        edi
   3650     pop        esi
   3651     pop        ebx
   3652     ret
   3653   }
   3654 }
   3655 #endif  // HAS_ARGBCOLORTABLEROW_X86
   3656 
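// Scalar sketch of the table lookup above (illustrative only; hypothetical
// helper): each channel indexes its own column of a 256-entry BGRA table.
static void ARGBColorTableRow_ReferenceC(uint8* dst_argb,
                                         const uint8* table_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[x * 4 + 0] = table_argb[dst_argb[x * 4 + 0] * 4 + 0];  // B
    dst_argb[x * 4 + 1] = table_argb[dst_argb[x * 4 + 1] * 4 + 1];  // G
    dst_argb[x * 4 + 2] = table_argb[dst_argb[x * 4 + 2] * 4 + 2];  // R
    dst_argb[x * 4 + 3] = table_argb[dst_argb[x * 4 + 3] * 4 + 3];  // A
  }
}
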
   3657 #ifdef HAS_ARGBQUANTIZEROW_SSE2
   3658 // Quantize 4 ARGB pixels (16 bytes).
   3659 // Aligned to 16 bytes.
   3660 __declspec(naked) __declspec(align(16))
   3661 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
   3662                           int interval_offset, int width) {
   3663   __asm {
   3664     mov        eax, [esp + 4]    /* dst_argb */
   3665     movd       xmm2, [esp + 8]   /* scale */
   3666     movd       xmm3, [esp + 12]  /* interval_size */
   3667     movd       xmm4, [esp + 16]  /* interval_offset */
   3668     mov        ecx, [esp + 20]   /* width */
   3669     pshuflw    xmm2, xmm2, 040h
   3670     pshufd     xmm2, xmm2, 044h
   3671     pshuflw    xmm3, xmm3, 040h
   3672     pshufd     xmm3, xmm3, 044h
   3673     pshuflw    xmm4, xmm4, 040h
   3674     pshufd     xmm4, xmm4, 044h
   3675     pxor       xmm5, xmm5  // constant 0
   3676     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
   3677     pslld      xmm6, 24
   3678 
   3679     align      16
   3680  convertloop:
   3681     movdqa     xmm0, [eax]  // read 4 pixels
   3682     punpcklbw  xmm0, xmm5   // first 2 pixels
   3683     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
   3684     movdqa     xmm1, [eax]  // read 4 pixels
   3685     punpckhbw  xmm1, xmm5   // next 2 pixels
   3686     pmulhuw    xmm1, xmm2
   3687     pmullw     xmm0, xmm3   // * interval_size
   3688     movdqa     xmm7, [eax]  // read 4 pixels
   3689     pmullw     xmm1, xmm3
   3690     pand       xmm7, xmm6   // mask alpha
   3691     paddw      xmm0, xmm4   // + interval_offset
   3692     paddw      xmm1, xmm4
   3693     packuswb   xmm0, xmm1
   3694     por        xmm0, xmm7
   3695     sub        ecx, 4
   3696     movdqa     [eax], xmm0
   3697     lea        eax, [eax + 16]
   3698     jg         convertloop
   3699     ret
   3700   }
   3701 }
   3702 #endif  // HAS_ARGBQUANTIZEROW_SSE2
   3703 
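// Scalar sketch of quantization (illustrative only; hypothetical helper):
// v = (c * scale >> 16) * interval_size + interval_offset for B, G and R,
// clamped as packuswb does; alpha is preserved by the mask above.
static void ARGBQuantizeRow_ReferenceC(uint8* dst_argb, int scale,
                                       int interval_size, int interval_offset,
                                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 3; ++i) {
      int v = ((dst_argb[x * 4 + i] * scale) >> 16) * interval_size +
              interval_offset;
      dst_argb[x * 4 + i] = (uint8)(v > 255 ? 255 : v);
    }
  }
}
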
   3704 #ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
   3705 // Consider float CumulativeSum.
   3706 // Consider calling CumulativeSum one row at time as needed.
   3707 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
   3708 // Convert cumulative sum for an area to an average for 1 pixel.
   3709 // topleft is pointer to top left of CumulativeSum buffer for area.
   3710 // botleft is pointer to bottom left of CumulativeSum buffer.
   3711 // width is offset from left to right of area in CumulativeSum buffer measured
   3712 //   in number of ints.
   3713 // area is the number of pixels in the area being averaged.
   3714 // dst points to pixel to store result to.
   3715 // count is number of averaged pixels to produce.
   3716 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
   3717 // aligned.
   3718 void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
   3719                                  int width, int area, uint8* dst, int count) {
   3720   __asm {
   3721     mov        eax, topleft  // eax topleft
   3722     mov        esi, botleft  // esi botleft
   3723     mov        edx, width
   3724     movd       xmm4, area
   3725     mov        edi, dst
   3726     mov        ecx, count
   3727     cvtdq2ps   xmm4, xmm4
   3728     rcpss      xmm4, xmm4  // 1.0f / area
   3729     pshufd     xmm4, xmm4, 0
   3730     sub        ecx, 4
   3731     jl         l4b
   3732 
   3733     // 4 pixel loop
   3734     align      4
   3735   l4:
   3736     // top left
   3737     movdqa     xmm0, [eax]
   3738     movdqa     xmm1, [eax + 16]
   3739     movdqa     xmm2, [eax + 32]
   3740     movdqa     xmm3, [eax + 48]
   3741 
   3742     // - top right
   3743     psubd      xmm0, [eax + edx * 4]
   3744     psubd      xmm1, [eax + edx * 4 + 16]
   3745     psubd      xmm2, [eax + edx * 4 + 32]
   3746     psubd      xmm3, [eax + edx * 4 + 48]
   3747     lea        eax, [eax + 64]
   3748 
   3749     // - bottom left
   3750     psubd      xmm0, [esi]
   3751     psubd      xmm1, [esi + 16]
   3752     psubd      xmm2, [esi + 32]
   3753     psubd      xmm3, [esi + 48]
   3754 
   3755     // + bottom right
   3756     paddd      xmm0, [esi + edx * 4]
   3757     paddd      xmm1, [esi + edx * 4 + 16]
   3758     paddd      xmm2, [esi + edx * 4 + 32]
   3759     paddd      xmm3, [esi + edx * 4 + 48]
   3760     lea        esi, [esi + 64]
   3761 
   3762     cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
   3763     cvtdq2ps   xmm1, xmm1
   3764     mulps      xmm0, xmm4
   3765     mulps      xmm1, xmm4
   3766     cvtdq2ps   xmm2, xmm2
   3767     cvtdq2ps   xmm3, xmm3
   3768     mulps      xmm2, xmm4
   3769     mulps      xmm3, xmm4
   3770     cvtps2dq   xmm0, xmm0
   3771     cvtps2dq   xmm1, xmm1
   3772     cvtps2dq   xmm2, xmm2
   3773     cvtps2dq   xmm3, xmm3
   3774     packssdw   xmm0, xmm1
   3775     packssdw   xmm2, xmm3
   3776     packuswb   xmm0, xmm2
   3777     movdqu     [edi], xmm0
   3778     lea        edi, [edi + 16]
   3779     sub        ecx, 4
   3780     jge        l4
   3781 
   3782   l4b:
   3783     add        ecx, 4 - 1
   3784     jl         l1b
   3785 
   3786     // 1 pixel loop
   3787     align      4
   3788   l1:
   3789     movdqa     xmm0, [eax]
   3790     psubd      xmm0, [eax + edx * 4]
   3791     lea        eax, [eax + 16]
   3792     psubd      xmm0, [esi]
   3793     paddd      xmm0, [esi + edx * 4]
   3794     lea        esi, [esi + 16]
   3795     cvtdq2ps   xmm0, xmm0
   3796     mulps      xmm0, xmm4
   3797     cvtps2dq   xmm0, xmm0
   3798     packssdw   xmm0, xmm0
   3799     packuswb   xmm0, xmm0
   3800     movd       dword ptr [edi], xmm0
   3801     lea        edi, [edi + 4]
   3802     sub        ecx, 1
   3803     jge        l1
   3804   l1b:
   3805   }
   3806 }
   3807 #endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
   3808 
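// Scalar sketch of the box-average above (illustrative only; hypothetical
// helper, and rcpss/cvtps2dq round slightly differently than this float
// divide). Each channel is (topleft - topright - botleft + botright) / area,
// where the right-hand taps sit 'width' ints to the right in the
// cumulative-sum buffer.
static void CumulativeSumToAverage_ReferenceC(const int32* topleft,
                                              const int32* botleft, int width,
                                              int area, uint8* dst, int count) {
  float ooa = 1.0f / area;
  int x;
  for (x = 0; x < count; ++x) {
    int i;
    for (i = 0; i < 4; ++i) {
      int32 sum = topleft[x * 4 + i] - topleft[x * 4 + width + i] -
                  botleft[x * 4 + i] + botleft[x * 4 + width + i];
      dst[x * 4 + i] = (uint8)(sum * ooa);
    }
  }
}
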
   3809 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
   3810 // Creates a table of cumulative sums where each value is a sum of all values
   3811 // above and to the left of the value.
   3812 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
   3813                                   const int32* previous_cumsum, int width) {
   3814   __asm {
   3815     mov        eax, row
   3816     mov        edx, cumsum
   3817     mov        esi, previous_cumsum
   3818     mov        ecx, width
   3819     sub        esi, edx
   3820     pxor       xmm0, xmm0
   3821     pxor       xmm1, xmm1
   3822 
   3823     sub        ecx, 4
   3824     jl         l4b
   3825     test       edx, 15
   3826     jne        l4b
   3827 
   3828     // 4 pixel loop
   3829     align      4
   3830   l4:
   3831     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
   3832     lea        eax, [eax + 16]
   3833     movdqa     xmm4, xmm2
   3834 
   3835     punpcklbw  xmm2, xmm1
   3836     movdqa     xmm3, xmm2
   3837     punpcklwd  xmm2, xmm1
   3838     punpckhwd  xmm3, xmm1
   3839 
   3840     punpckhbw  xmm4, xmm1
   3841     movdqa     xmm5, xmm4
   3842     punpcklwd  xmm4, xmm1
   3843     punpckhwd  xmm5, xmm1
   3844 
   3845     paddd      xmm0, xmm2
   3846     movdqa     xmm2, [edx + esi]  // previous row above.
   3847     paddd      xmm2, xmm0
   3848 
   3849     paddd      xmm0, xmm3
   3850     movdqa     xmm3, [edx + esi + 16]
   3851     paddd      xmm3, xmm0
   3852 
   3853     paddd      xmm0, xmm4
   3854     movdqa     xmm4, [edx + esi + 32]
   3855     paddd      xmm4, xmm0
   3856 
   3857     paddd      xmm0, xmm5
   3858     movdqa     xmm5, [edx + esi + 48]
   3859     paddd      xmm5, xmm0
   3860 
   3861     movdqa     [edx], xmm2
   3862     movdqa     [edx + 16], xmm3
   3863     movdqa     [edx + 32], xmm4
   3864     movdqa     [edx + 48], xmm5
   3865 
   3866     lea        edx, [edx + 64]
   3867     sub        ecx, 4
   3868     jge        l4
   3869 
   3870   l4b:
   3871     add        ecx, 4 - 1
   3872     jl         l1b
   3873 
   3874     // 1 pixel loop
   3875     align      4
   3876   l1:
   3877     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
   3878     lea        eax, [eax + 4]
   3879     punpcklbw  xmm2, xmm1
   3880     punpcklwd  xmm2, xmm1
   3881     paddd      xmm0, xmm2
   3882     movdqu     xmm2, [edx + esi]
   3883     paddd      xmm2, xmm0
   3884     movdqu     [edx], xmm2
   3885     lea        edx, [edx + 16]
   3886     sub        ecx, 1
   3887     jge        l1
   3888 
   3889  l1b:
   3890   }
   3891 }
   3892 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
   3893 
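// Scalar sketch of the cumulative sum row above (illustrative only;
// hypothetical helper): a running sum across the row, added to the row
// above, so each output is the sum of everything above and to the left.
static void ComputeCumulativeSumRow_ReferenceC(const uint8* row,
                                               int32* cumsum,
                                               const int32* previous_cumsum,
                                               int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 4; ++i) {
      sum[i] += row[x * 4 + i];
      cumsum[x * 4 + i] = sum[i] + previous_cumsum[x * 4 + i];
    }
  }
}
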
   3894 #ifdef HAS_ARGBSHADE_SSE2
   3895 // Shade 4 pixels at a time by specified value.
   3896 // Aligned to 16 bytes.
   3897 __declspec(naked) __declspec(align(16))
   3898 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
   3899                        uint32 value) {
   3900   __asm {
   3901     mov        eax, [esp + 4]   // src_argb
   3902     mov        edx, [esp + 8]   // dst_argb
   3903     mov        ecx, [esp + 12]  // width
   3904     movd       xmm2, [esp + 16]  // value
   3905     sub        edx, eax
   3906     punpcklbw  xmm2, xmm2
   3907     punpcklqdq xmm2, xmm2
   3908 
   3909     align      16
   3910  convertloop:
   3911     movdqa     xmm0, [eax]      // read 4 pixels
   3912     movdqa     xmm1, xmm0
   3913     punpcklbw  xmm0, xmm0       // first 2
   3914     punpckhbw  xmm1, xmm1       // next 2
   3915     pmulhuw    xmm0, xmm2       // argb * value
   3916     pmulhuw    xmm1, xmm2       // argb * value
   3917     psrlw      xmm0, 8
   3918     psrlw      xmm1, 8
   3919     packuswb   xmm0, xmm1
   3920     sub        ecx, 4
   3921     movdqa     [eax + edx], xmm0
   3922     lea        eax, [eax + 16]
   3923     jg         convertloop
   3924 
   3925     ret
   3926   }
   3927 }
   3928 #endif  // HAS_ARGBSHADE_SSE2
   3929 
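// Scalar sketch of the shade operation above (illustrative only;
// hypothetical helper): each channel is scaled by the matching channel of
// 'value' as ((c * 257) * (v * 257)) >> 24, close to c * v / 255.
static void ARGBShadeRow_ReferenceC(const uint8* src_argb, uint8* dst_argb,
                                    int width, uint32 value) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 4; ++i) {
      uint32 c = src_argb[x * 4 + i];
      uint32 v = (value >> (i * 8)) & 0xff;
      dst_argb[x * 4 + i] = (uint8)(((c * 257) * (v * 257)) >> 24);
    }
  }
}
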
   3930 #ifdef HAS_ARGBAFFINEROW_SSE2
   3931 // Copy ARGB pixels from the source image along an affine slope (du, dv) to a
   3932 // row of the destination.
   3932 __declspec(naked) __declspec(align(16))
   3933 LIBYUV_API
   3934 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
   3935                         uint8* dst_argb, const float* uv_dudv, int width) {
   3936   __asm {
   3937     push       esi
   3938     push       edi
   3939     mov        eax, [esp + 12]   // src_argb
   3940     mov        esi, [esp + 16]  // stride
   3941     mov        edx, [esp + 20]  // dst_argb
   3942     mov        ecx, [esp + 24]  // pointer to uv_dudv
   3943     movq       xmm2, qword ptr [ecx]  // uv
   3944     movq       xmm7, qword ptr [ecx + 8]  // dudv
   3945     mov        ecx, [esp + 28]  // width
   3946     shl        esi, 16          // stride in upper 16 bits
   3947     add        esi, 4           // esi = (stride << 16) | 4 for pmaddwd
   3948     movd       xmm5, esi
   3949     sub        ecx, 4
   3950     jl         l4b
   3951 
   3952     // setup for 4 pixel loop
   3953     pshufd     xmm7, xmm7, 0x44  // dup dudv
   3954     pshufd     xmm5, xmm5, 0  // dup 4, stride
   3955     movdqa     xmm0, xmm2    // x0, y0, x1, y1
   3956     addps      xmm0, xmm7
   3957     movlhps    xmm2, xmm0
   3958     movdqa     xmm4, xmm7
   3959     addps      xmm4, xmm4    // dudv *= 2
   3960     movdqa     xmm3, xmm2    // x2, y2, x3, y3
   3961     addps      xmm3, xmm4
   3962     addps      xmm4, xmm4    // dudv *= 4
   3963 
   3964     // 4 pixel loop
   3965     align      4
   3966   l4:
   3967     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
   3968     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
   3969     packssdw   xmm0, xmm1    // x, y as 8 shorts
   3970     pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
   3971     movd       esi, xmm0
   3972     pshufd     xmm0, xmm0, 0x39  // shift right
   3973     movd       edi, xmm0
   3974     pshufd     xmm0, xmm0, 0x39  // shift right
   3975     movd       xmm1, [eax + esi]  // read pixel 0
   3976     movd       xmm6, [eax + edi]  // read pixel 1
   3977     punpckldq  xmm1, xmm6     // combine pixel 0 and 1
   3978     addps      xmm2, xmm4    // x, y += dx, dy first 2
   3979     movq       qword ptr [edx], xmm1
   3980     movd       esi, xmm0
   3981     pshufd     xmm0, xmm0, 0x39  // shift right
   3982     movd       edi, xmm0
   3983     movd       xmm6, [eax + esi]  // read pixel 2
   3984     movd       xmm0, [eax + edi]  // read pixel 3
   3985     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
   3986     addps      xmm3, xmm4    // x, y += dx, dy next 2
   3987     sub        ecx, 4
   3988     movq       qword ptr [edx + 8], xmm6
   3989     lea        edx, [edx + 16]
   3990     jge        l4
   3991 
   3992   l4b:
   3993     add        ecx, 4 - 1
   3994     jl         l1b
   3995 
   3996     // 1 pixel loop
   3997     align      4
   3998   l1:
   3999     cvttps2dq  xmm0, xmm2    // x, y float to int
   4000     packssdw   xmm0, xmm0    // x, y as shorts
   4001     pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
   4002     addps      xmm2, xmm7    // x, y += dx, dy
   4003     movd       esi, xmm0
   4004     movd       xmm0, [eax + esi]  // copy a pixel
   4005     sub        ecx, 1
   4006     movd       [edx], xmm0
   4007     lea        edx, [edx + 4]
   4008     jge        l1
   4009   l1b:
   4010     pop        edi
   4011     pop        esi
   4012     ret
   4013   }
   4014 }
   4015 #endif  // HAS_ARGBAFFINEROW_SSE2
   4016 
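// Scalar sketch of the affine row copy above (illustrative only;
// hypothetical helper): walk (u, v) across the source with per-pixel steps
// (du, dv), truncating to integers as cvttps2dq does; stride is in bytes.
static void ARGBAffineRow_ReferenceC(const uint8* src_argb,
                                     int src_argb_stride, uint8* dst_argb,
                                     const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int x;
  for (x = 0; x < width; ++x) {
    int xi = (int)u;
    int yi = (int)v;
    *(uint32*)(dst_argb + x * 4) =
        *(const uint32*)(src_argb + yi * src_argb_stride + xi * 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
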
   4017 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
   4018 __declspec(naked) __declspec(align(16))
   4019 void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   4020                               ptrdiff_t src_stride, int dst_width,
   4021                               int source_y_fraction) {
   4022   __asm {
   4023     push       esi
   4024     push       edi
   4025     mov        edi, [esp + 8 + 4]   // dst_ptr
   4026     mov        esi, [esp + 8 + 8]   // src_ptr
   4027     mov        edx, [esp + 8 + 12]  // src_stride
   4028     mov        ecx, [esp + 8 + 16]  // dst_width
   4029     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   4030     sub        edi, esi
   4031     shr        eax, 1
   4032     cmp        eax, 0
   4033     je         xloop1
   4034     cmp        eax, 64
   4035     je         xloop2
   4036     movd       xmm0, eax  // high fraction 0..127
   4037     neg        eax
   4038     add        eax, 128
   4039     movd       xmm5, eax  // low fraction 128..1
   4040     punpcklbw  xmm5, xmm0
   4041     punpcklwd  xmm5, xmm5
   4042     pshufd     xmm5, xmm5, 0
   4043 
   4044     align      16
   4045   xloop:
   4046     movdqa     xmm0, [esi]
   4047     movdqa     xmm2, [esi + edx]
   4048     movdqa     xmm1, xmm0
   4049     punpcklbw  xmm0, xmm2
   4050     punpckhbw  xmm1, xmm2
   4051     pmaddubsw  xmm0, xmm5
   4052     pmaddubsw  xmm1, xmm5
   4053     psrlw      xmm0, 7
   4054     psrlw      xmm1, 7
   4055     packuswb   xmm0, xmm1
   4056     sub        ecx, 4
   4057     movdqa     [esi + edi], xmm0
   4058     lea        esi, [esi + 16]
   4059     jg         xloop
   4060 
   4061     pop        edi
   4062     pop        esi
   4063     ret
   4064 
   4065     align      16
   4066   xloop1:
   4067     movdqa     xmm0, [esi]
   4068     sub        ecx, 4
   4069     movdqa     [esi + edi], xmm0
   4070     lea        esi, [esi + 16]
   4071     jg         xloop1
   4072 
   4073     pop        edi
   4074     pop        esi
   4075     ret
   4076 
   4077     align      16
   4078   xloop2:
   4079     movdqa     xmm0, [esi]
   4080     pavgb      xmm0, [esi + edx]
   4081     sub        ecx, 4
   4082     movdqa     [esi + edi], xmm0
   4083     lea        esi, [esi + 16]
   4084     jg         xloop2
   4085 
   4086     pop        edi
   4087     pop        esi
   4088     ret
   4089   }
   4090 }
   4091 
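// Scalar sketch of the vertical interpolation above (illustrative only;
// hypothetical helper). The fraction is halved to 0..127 and applied as
// (s0 * (128 - f) + s1 * f) >> 7; the f == 0 and f == 64 special cases
// above are a plain copy and pavgb (which rounds, unlike this truncation).
static void ARGBInterpolateRow_ReferenceC(uint8* dst_ptr, const uint8* src_ptr,
                                          ptrdiff_t src_stride, int dst_width,
                                          int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..127
  const uint8* src1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width * 4; ++x) {  // 4 bytes per ARGB pixel.
    dst_ptr[x] = (uint8)((src_ptr[x] * (128 - f) + src1[x] * f) >> 7);
  }
}
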
   4092 #endif  // _M_IX86
   4093 
   4094 #ifdef __cplusplus
   4095 }  // extern "C"
   4096 }  // namespace libyuv
   4097 #endif
   4098