/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC 4.2 on OSX emits link errors when static or const variables are used
// as inline assembly operands.
// TODO(fbarchard): Use static const when GCC 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
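
// Scalar reference for the coefficient vectors above (an illustrative
// sketch added for clarity; the helper names below are not libyuv APIs).
// The coefficients are in B,G,R byte order to match ARGB memory layout and
// are scaled so the SSSE3 kernels below can pmaddubsw, shift and add the
// kAddY16/kAddUV128 bias. The SIMD path additionally saturates via
// packuswb/packsswb, which this sketch omits.
static inline uint8 RGBToYReference(int r, int g, int b) {
  return static_cast<uint8>(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static inline uint8 RGBToUReference(int r, int g, int b) {
  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8 RGBToVReference(int r, int g, int b) {
  return static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}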

// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting RGBA to ARGB.
CONST uvec8 kShuffleMaskRGBAToARGB = {
  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
};

// Shuffle table for converting ARGB to RGBA.
CONST uvec8 kShuffleMaskARGBToRGBA = {
  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
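
// All of the tables above drive pshufb, which writes, into destination byte
// lane i, source byte table[i], or zero when table[i] has its high bit set
// (hence the 128u entries in the ARGBToRGB24/RAW masks). A scalar model of
// the instruction, as an illustrative sketch:
static inline void PShufBReference(const uint8* src, const uint8* shuffler,
                                   uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (shuffler[i] & 0x80) ? 0u : src[shuffler[i] & 0x0f];
  }
}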

void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movq      (%0),%%xmm0                     \n"  // fetch 8 Y bytes
    "lea       0x8(%0),%0                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate Y to 2 bytes
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // duplicate Y to 4 bytes
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm5,%%xmm0                   \n"  // set alpha to 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "lea       0x20(%1),%1                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
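
// Scalar equivalent of I400ToARGBRow_SSE2 above (an illustrative sketch;
// the name is not a libyuv API): each luma byte is replicated into B, G
// and R, and alpha is forced to 255, which is exactly what the unpacks
// plus the 0xff000000 mask achieve 8 pixels at a time.
static inline void I400ToARGBRow_Reference(const uint8* src_y,
                                           uint8* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8 y = src_y[i];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 255u;  // A
    dst_argb += 4;
  }
}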

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
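
// ABGRToARGBRow_SSSE3 and the next three functions share one pattern: a
// single pshufb per 4 pixels reorders the channels. The "sub %0,%1" turns
// the destination into an offset from the source so the loop only advances
// one pointer. Scalar equivalent for the ABGR case (an illustrative
// sketch; the name is not a libyuv API):
static inline void ABGRToARGBRow_Reference(const uint8* src_abgr,
                                           uint8* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8 r = src_abgr[0];
    uint8 g = src_abgr[1];
    uint8 b = src_abgr[2];
    uint8 a = src_abgr[3];
    dst_argb[0] = b;  // mask entry 2u
    dst_argb[1] = g;  // mask entry 1u
    dst_argb[2] = r;  // mask entry 0u
    dst_argb[3] = a;  // mask entry 3u
    src_abgr += 4;
    dst_argb += 4;
  }
}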

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGBAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgba),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskARGBToRGBA)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"  // fetch 48 bytes: 16 pixels
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm3                 \n"
    "lea       0x30(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"  // expand 3 to 4 bytes
    "por       %%xmm5,%%xmm2                   \n"  // set alpha to 0xff
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // xmm3 = pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm3,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// Identical to RGB24ToARGBRow_SSSE3 except the shuffle mask also swaps
// R and B.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm3                 \n"
    "lea       0x30(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm3,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // 5 to 8 bit multiplier
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"  // 6 to 8 bit multiplier
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // mask 0xf800 for red
    "psllw     $0xb,%%xmm3                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // mask 0x07e0 for green
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // mask 0xff00 for alpha
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // dst -= src * 2, so the
    "sub       %0,%1                           \n"  // loop stores to dst+src*2
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "psllw     $0xb,%%xmm2                     \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "pmulhuw   %%xmm6,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpckhbw %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm1,(%1,%0,2)                \n"
    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
    "lea       0x10(%0),%0                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
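
// Scalar equivalent of the RGB565 expansion above (an illustrative sketch;
// the name is not a libyuv API). The pmulhuw constants implement classic
// bit replication: a 5-bit channel maps to (x << 3) | (x >> 2) and a 6-bit
// channel to (x << 2) | (x >> 4), which is what multiplying by 0x0108 or
// 0x2080 and keeping the high word computes.
static inline void RGB565ToARGBRow_Reference(const uint8* src, uint8* dst,
                                             int pix) {
  for (int i = 0; i < pix; ++i) {
    uint32 rgb565 = src[0] | (src[1] << 8);
    uint32 b5 = rgb565 & 0x1f;
    uint32 g6 = (rgb565 >> 5) & 0x3f;
    uint32 r5 = rgb565 >> 11;
    dst[0] = static_cast<uint8>((b5 << 3) | (b5 >> 2));
    dst[1] = static_cast<uint8>((g6 << 2) | (g6 >> 4));
    dst[2] = static_cast<uint8>((r5 << 3) | (r5 >> 2));
    dst[3] = 255u;
    src += 2;
    dst += 4;
  }
}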

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psllw     $0xb,%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm4                   \n"
    "psrlw     $0x6,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"
    "psllw     $0xb,%%xmm2                     \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "psraw     $0x8,%%xmm2                     \n"
    "pmulhuw   %%xmm6,%%xmm0                   \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpckhbw %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm1,(%1,%0,2)                \n"
    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
    "lea       0x10(%0),%0                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
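
// Scalar equivalent of the ARGB1555 expansion above (an illustrative
// sketch; the name is not a libyuv API): the 5-bit channels are
// bit-replicated to 8 bits, and the 1-bit alpha is spread to 0x00 or 0xff,
// which the psraw/pand pair accomplishes in SIMD.
static inline void ARGB1555ToARGBRow_Reference(const uint8* src, uint8* dst,
                                               int pix) {
  for (int i = 0; i < pix; ++i) {
    uint32 p = src[0] | (src[1] << 8);
    uint32 b5 = p & 0x1f;
    uint32 g5 = (p >> 5) & 0x1f;
    uint32 r5 = (p >> 10) & 0x1f;
    dst[0] = static_cast<uint8>((b5 << 3) | (b5 >> 2));
    dst[1] = static_cast<uint8>((g5 << 3) | (g5 >> 2));
    dst[2] = static_cast<uint8>((r5 << 3) | (r5 >> 2));
    dst[3] = (p & 0x8000) ? 255u : 0u;
    src += 2;
    dst += 4;
  }
}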

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x4,%%xmm5                     \n"
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1,%0,2)                \n"
    "movdqa    %%xmm1,0x10(%1,%0,2)            \n"
    "lea       0x10(%0),%0                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
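
// Scalar equivalent of the ARGB4444 expansion above (an illustrative
// sketch; the name is not a libyuv API): each 4-bit channel expands to
// 8 bits by nibble replication, i.e. multiplication by 0x11.
static inline void ARGB4444ToARGBRow_Reference(const uint8* src, uint8* dst,
                                               int pix) {
  for (int i = 0; i < pix; ++i) {
    uint32 b4 = src[0] & 0x0f;
    uint32 g4 = src[0] >> 4;
    uint32 r4 = src[1] & 0x0f;
    uint32 a4 = src[1] >> 4;
    dst[0] = static_cast<uint8>(b4 * 0x11);
    dst[1] = static_cast<uint8>(g4 * 0x11);
    dst[2] = static_cast<uint8>(r4 * 0x11);
    dst[3] = static_cast<uint8>(a4 * 0x11);
    src += 2;
    dst += 4;
  }
}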

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "lea       0x40(%0),%0                     \n"
    "pshufb    %%xmm6,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "lea       0x30(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "lea       0x40(%0),%0                     \n"
    "pshufb    %%xmm6,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "lea       0x30(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psrld     $0x1b,%%xmm3                    \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0xb,%%xmm5                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"
    "psrld     $0x3,%%xmm1                     \n"
    "psrld     $0x5,%%xmm2                     \n"
    "psrad     $0x10,%%xmm0                    \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "lea       0x10(%0),%0                     \n"
    "movq      %%xmm0,(%1)                     \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
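
// Scalar equivalent of the ARGB to RGB565 pack above (an illustrative
// sketch; the name is not a libyuv API): truncate each channel to its
// field width and merge, producing one 16-bit word per pixel. Like the
// SIMD path, this truncates rather than rounds.
static inline void ARGBToRGB565Row_Reference(const uint8* src, uint8* dst,
                                             int pix) {
  for (int i = 0; i < pix; ++i) {
    uint32 b = src[0] >> 3;
    uint32 g = src[1] >> 2;
    uint32 r = src[2] >> 3;
    uint32 rgb565 = (r << 11) | (g << 5) | b;
    dst[0] = static_cast<uint8>(rgb565 & 0xff);
    dst[1] = static_cast<uint8>(rgb565 >> 8);
    src += 4;
    dst += 2;
  }
}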

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1b,%%xmm4                    \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x5,%%xmm5                     \n"
    "movdqa    %%xmm4,%%xmm6                   \n"
    "pslld     $0xa,%%xmm6                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "pslld     $0xf,%%xmm7                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "psrad     $0x10,%%xmm0                    \n"
    "psrld     $0x3,%%xmm1                     \n"
    "psrld     $0x6,%%xmm2                     \n"
    "psrld     $0x9,%%xmm3                     \n"
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "lea       0x10(%0),%0                     \n"
    "movq      %%xmm0,(%1)                     \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xc,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "lea       0x10(%0),%0                     \n"
    "movq      %%xmm0,(%1)                     \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
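
// Scalar equivalent of the ARGB to ARGB4444 pack above (an illustrative
// sketch; the name is not a libyuv API): keep the high nibble of every
// channel and pair the channels two per output byte.
static inline void ARGBToARGB4444Row_Reference(const uint8* src, uint8* dst,
                                               int pix) {
  for (int i = 0; i < pix; ++i) {
    dst[0] = static_cast<uint8>((src[0] >> 4) | (src[1] & 0xf0));  // G|B
    dst[1] = static_cast<uint8>((src[2] >> 4) | (src[3] & 0xf0));  // A|R
    src += 4;
    dst += 2;
  }
}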

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): Pass the xmm constants to a single block of assembly.
// With fpic, GCC 4.2 for OSX runs out of GPR registers: the "m" operands
// effectively take three registers (ebx, ebp and eax), and only three
// normal registers are available (four if the stack frame is disabled),
// which is not enough for the constants plus the loop operands below.
// Splitting the work into two assembly blocks is a workaround, but it is
// considered unsafe: the constants travel between the blocks in
// xmm3/xmm4/xmm5, and the compiler does not track that dependency.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm6                 \n"
    "pavgb     (%0,%4,1),%%xmm0                \n"
    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
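
// Scalar model of the UV kernel above (an illustrative sketch; the name is
// not a libyuv API). Two rows are averaged with pavgb, horizontal pairs
// are averaged via the shufps/pavgb sequence, and the chroma weights are
// then applied per 2x2 block. Note pavgb rounds up, so this sketch and the
// SIMD path can differ by one LSB.
static inline void ARGBToUVRow_Reference(const uint8* src_argb0,
                                         int src_stride_argb,
                                         uint8* dst_u, uint8* dst_v,
                                         int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;
  for (int i = 0; i < width; i += 2) {
    // Average the four pixels of the 2x2 block, channel by channel.
    int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
    int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
    int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
    *dst_u++ = RGBToUReference(r, g, b);
    *dst_v++ = RGBToVReference(r, g, b);
    src_argb0 += 8;
    src_argb1 += 8;
  }
}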

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),         // %0
    "m"(kARGBToV),         // %1
    "m"(kAddUV128)         // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm6                 \n"
    "movdqu    (%0,%4,1),%%xmm7                \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm6                 \n"
    "pavgb     (%0,%4,1),%%xmm0                \n"
    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm6                 \n"
    "movdqu    (%0,%4,1),%%xmm7                \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1147 
   1148 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
   1149                        uint8* dst_u, uint8* dst_v, int width) {
   1150   asm volatile (
   1151     "movdqa    %0,%%xmm4                       \n"
   1152     "movdqa    %1,%%xmm3                       \n"
   1153     "movdqa    %2,%%xmm5                       \n"
   1154   :
   1155   : "m"(kABGRToU),         // %0
   1156     "m"(kABGRToV),         // %1
   1157     "m"(kAddUV128)         // %2
   1158   );
   1159   asm volatile (
   1160     "sub       %1,%2                           \n"
   1161     ".p2align  4                               \n"
   1162   "1:                                          \n"
   1163     "movdqa    (%0),%%xmm0                     \n"
   1164     "movdqa    0x10(%0),%%xmm1                 \n"
   1165     "movdqa    0x20(%0),%%xmm2                 \n"
   1166     "movdqa    0x30(%0),%%xmm6                 \n"
   1167     "pavgb     (%0,%4,1),%%xmm0                \n"
   1168     "pavgb     0x10(%0,%4,1),%%xmm1            \n"
   1169     "pavgb     0x20(%0,%4,1),%%xmm2            \n"
   1170     "pavgb     0x30(%0,%4,1),%%xmm6            \n"
   1171     "lea       0x40(%0),%0                     \n"
   1172     "movdqa    %%xmm0,%%xmm7                   \n"
   1173     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1174     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1175     "pavgb     %%xmm7,%%xmm0                   \n"
   1176     "movdqa    %%xmm2,%%xmm7                   \n"
   1177     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1178     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1179     "pavgb     %%xmm7,%%xmm2                   \n"
   1180     "movdqa    %%xmm0,%%xmm1                   \n"
   1181     "movdqa    %%xmm2,%%xmm6                   \n"
   1182     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1183     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1184     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1185     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1186     "phaddw    %%xmm2,%%xmm0                   \n"
   1187     "phaddw    %%xmm6,%%xmm1                   \n"
   1188     "psraw     $0x8,%%xmm0                     \n"
   1189     "psraw     $0x8,%%xmm1                     \n"
   1190     "packsswb  %%xmm1,%%xmm0                   \n"
   1191     "paddb     %%xmm5,%%xmm0                   \n"
   1192     "sub       $0x10,%3                        \n"
   1193     "movlps    %%xmm0,(%1)                     \n"
   1194     "movhps    %%xmm0,(%1,%2,1)                \n"
   1195     "lea       0x8(%1),%1                      \n"
   1196     "jg        1b                              \n"
   1197   : "+r"(src_abgr0),       // %0
   1198     "+r"(dst_u),           // %1
   1199     "+r"(dst_v),           // %2
   1200     "+rm"(width)           // %3
   1201   : "r"(static_cast<intptr_t>(src_stride_abgr))
   1202   : "memory", "cc"
   1203 #if defined(__SSE2__)
   1204     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1205 #endif
   1206   );
   1207 }
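
// A scalar sketch (not compiled; the names are hypothetical) of the 2x2
// block math above: pavgb averages the two rows, shufps + pavgb average
// adjacent columns, and pmaddubsw/phaddw apply the kABGRToU/kABGRToV
// coefficients before psraw and the +128 bias.
#if 0
static uint8 RoundAvg(uint8 a, uint8 b) {  // pavgb: rounding average
  return static_cast<uint8>((a + b + 1) >> 1);
}
static void ABGRBlockToUV(const uint8* row0, const uint8* row1,  // 2x2 ABGR
                          uint8* u, uint8* v) {
  uint8 r = RoundAvg(RoundAvg(row0[0], row1[0]), RoundAvg(row0[4], row1[4]));
  uint8 g = RoundAvg(RoundAvg(row0[1], row1[1]), RoundAvg(row0[5], row1[5]));
  uint8 b = RoundAvg(RoundAvg(row0[2], row1[2]), RoundAvg(row0[6], row1[6]));
  *u = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}
#endif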
   1208 
   1209 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
   1210                                  uint8* dst_u, uint8* dst_v, int width) {
   1211   asm volatile (
   1212     "movdqa    %0,%%xmm4                       \n"
   1213     "movdqa    %1,%%xmm3                       \n"
   1214     "movdqa    %2,%%xmm5                       \n"
   1215   :
   1216   : "m"(kABGRToU),         // %0
   1217     "m"(kABGRToV),         // %1
   1218     "m"(kAddUV128)         // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm3", "xmm4", "xmm5"
#endif
  );
   1220   asm volatile (
   1221     "sub       %1,%2                           \n"
   1222     ".p2align  4                               \n"
   1223   "1:                                          \n"
   1224     "movdqu    (%0),%%xmm0                     \n"
   1225     "movdqu    0x10(%0),%%xmm1                 \n"
   1226     "movdqu    0x20(%0),%%xmm2                 \n"
   1227     "movdqu    0x30(%0),%%xmm6                 \n"
   1228     "movdqu    (%0,%4,1),%%xmm7                \n"
   1229     "pavgb     %%xmm7,%%xmm0                   \n"
   1230     "movdqu    0x10(%0,%4,1),%%xmm7            \n"
   1231     "pavgb     %%xmm7,%%xmm1                   \n"
   1232     "movdqu    0x20(%0,%4,1),%%xmm7            \n"
   1233     "pavgb     %%xmm7,%%xmm2                   \n"
   1234     "movdqu    0x30(%0,%4,1),%%xmm7            \n"
   1235     "pavgb     %%xmm7,%%xmm6                   \n"
   1236     "lea       0x40(%0),%0                     \n"
   1237     "movdqa    %%xmm0,%%xmm7                   \n"
   1238     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1239     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1240     "pavgb     %%xmm7,%%xmm0                   \n"
   1241     "movdqa    %%xmm2,%%xmm7                   \n"
   1242     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1243     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1244     "pavgb     %%xmm7,%%xmm2                   \n"
   1245     "movdqa    %%xmm0,%%xmm1                   \n"
   1246     "movdqa    %%xmm2,%%xmm6                   \n"
   1247     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1248     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1249     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1250     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1251     "phaddw    %%xmm2,%%xmm0                   \n"
   1252     "phaddw    %%xmm6,%%xmm1                   \n"
   1253     "psraw     $0x8,%%xmm0                     \n"
   1254     "psraw     $0x8,%%xmm1                     \n"
   1255     "packsswb  %%xmm1,%%xmm0                   \n"
   1256     "paddb     %%xmm5,%%xmm0                   \n"
   1257     "sub       $0x10,%3                        \n"
   1258     "movlps    %%xmm0,(%1)                     \n"
   1259     "movhps    %%xmm0,(%1,%2,1)                \n"
   1260     "lea       0x8(%1),%1                      \n"
   1261     "jg        1b                              \n"
   1262   : "+r"(src_abgr0),       // %0
   1263     "+r"(dst_u),           // %1
   1264     "+r"(dst_v),           // %2
   1265     "+rm"(width)           // %3
   1266   : "r"(static_cast<intptr_t>(src_stride_abgr))
   1267   : "memory", "cc"
   1268 #if defined(__SSE2__)
   1269     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1270 #endif
   1271   );
   1272 }
   1273 #endif  // HAS_ARGBTOYROW_SSSE3
   1274 
   1275 #ifdef HAS_I422TOARGBROW_SSSE3
    1276 #define UB 127 /* saturated: 2.018 * 64 = 129 overflows int8 */
   1277 #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
   1278 #define UR 0
   1279 
   1280 #define VB 0
   1281 #define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
   1282 #define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
   1283 
   1284 // Bias
    1285 #define BB (UB * 128 + VB * 128)
    1286 #define BG (UG * 128 + VG * 128)
    1287 #define BR (UR * 128 + VR * 128)
   1288 
   1289 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
   1290 
   1291 struct {
   1292   vec8 kUVToB;  // 0
   1293   vec8 kUVToG;  // 16
   1294   vec8 kUVToR;  // 32
   1295   vec16 kUVBiasB;  // 48
   1296   vec16 kUVBiasG;  // 64
   1297   vec16 kUVBiasR;  // 80
   1298   vec16 kYSub16;  // 96
   1299   vec16 kYToRgb;  // 112
   1300   vec8 kVUToB;  // 128
   1301   vec8 kVUToG;  // 144
   1302   vec8 kVUToR;  // 160
   1303 } CONST SIMD_ALIGNED(kYuvConstants) = {
   1304   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   1305   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
   1306   { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
   1307   { BB, BB, BB, BB, BB, BB, BB, BB },
   1308   { BG, BG, BG, BG, BG, BG, BG, BG },
   1309   { BR, BR, BR, BR, BR, BR, BR, BR },
   1310   { 16, 16, 16, 16, 16, 16, 16, 16 },
   1311   { YG, YG, YG, YG, YG, YG, YG, YG },
   1312   { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
   1313   { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
   1314   { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
   1315 };
   1316 
    1318 // Read 8 UV from 444
   1319 #define READYUV444                                                             \
   1320     "movq       (%[u_buf]),%%xmm0              \n"                             \
   1321     "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
   1322     "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
   1323     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
   1324 
   1325 // Read 4 UV from 422, upsample to 8 UV
   1326 #define READYUV422                                                             \
   1327     "movd       (%[u_buf]),%%xmm0              \n"                             \
   1328     "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
   1329     "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
   1330     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
   1331     "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
   1332 
   1333 // Read 2 UV from 411, upsample to 8 UV
   1334 #define READYUV411                                                             \
   1335     "movd       (%[u_buf]),%%xmm0              \n"                             \
   1336     "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
   1337     "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
   1338     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
   1339     "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
   1340     "punpckldq  %%xmm0,%%xmm0                  \n"                             \
   1341 
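// In READYUV422 and READYUV411 above, punpcklbw interleaves the U and V
// samples into 16-bit UV pairs; punpcklwd (and, for 411, punpckldq) then
// duplicate each pair so that 4 (or 2) chroma samples fan out across 8
// pixels. READNV12 below does the same with pre-interleaved UV.
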
   1342 // Read 4 UV from NV12, upsample to 8 UV
   1343 #define READNV12                                                               \
   1344     "movq       (%[uv_buf]),%%xmm0             \n"                             \
   1345     "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
   1346     "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
   1347 
   1348 // Convert 8 pixels: 8 UV and 8 Y
   1349 #define YUVTORGB                                                               \
   1350     "movdqa     %%xmm0,%%xmm1                  \n"                             \
   1351     "movdqa     %%xmm0,%%xmm2                  \n"                             \
   1352     "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
   1353     "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
   1354     "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
   1355     "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
   1356     "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
   1357     "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
   1358     "movq       (%[y_buf]),%%xmm3              \n"                             \
   1359     "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
   1360     "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
   1361     "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
   1362     "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
   1363     "paddsw     %%xmm3,%%xmm0                  \n"                             \
   1364     "paddsw     %%xmm3,%%xmm1                  \n"                             \
   1365     "paddsw     %%xmm3,%%xmm2                  \n"                             \
   1366     "psraw      $0x6,%%xmm0                    \n"                             \
   1367     "psraw      $0x6,%%xmm1                    \n"                             \
   1368     "psraw      $0x6,%%xmm2                    \n"                             \
   1369     "packuswb   %%xmm0,%%xmm0                  \n"                             \
   1370     "packuswb   %%xmm1,%%xmm1                  \n"                             \
   1371     "packuswb   %%xmm2,%%xmm2                  \n"                             \
   1372 
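// A scalar sketch (not compiled; the names are hypothetical) of the
// per-pixel math in YUVTORGB, using the UB..YG constants defined above;
// Clamp255 stands in for packuswb saturation.
#if 0
static uint8 Clamp255(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixelToRgb(uint8 y, uint8 u, uint8 v,
                          uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;  // luma in 6-bit fixed point
  *b = Clamp255((y1 + UB * (u - 128)) >> 6);                   // VB is 0
  *g = Clamp255((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
  *r = Clamp255((y1 + VR * (v - 128)) >> 6);                   // UR is 0
}
#endif
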
   1373 // Convert 8 pixels: 8 VU and 8 Y
   1374 #define YVUTORGB                                                               \
   1375     "movdqa     %%xmm0,%%xmm1                  \n"                             \
   1376     "movdqa     %%xmm0,%%xmm2                  \n"                             \
   1377     "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
   1378     "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
   1379     "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
   1380     "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
   1381     "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
   1382     "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
   1383     "movq       (%[y_buf]),%%xmm3              \n"                             \
   1384     "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
   1385     "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
   1386     "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
   1387     "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
   1388     "paddsw     %%xmm3,%%xmm0                  \n"                             \
   1389     "paddsw     %%xmm3,%%xmm1                  \n"                             \
   1390     "paddsw     %%xmm3,%%xmm2                  \n"                             \
   1391     "psraw      $0x6,%%xmm0                    \n"                             \
   1392     "psraw      $0x6,%%xmm1                    \n"                             \
   1393     "psraw      $0x6,%%xmm2                    \n"                             \
   1394     "packuswb   %%xmm0,%%xmm0                  \n"                             \
   1395     "packuswb   %%xmm1,%%xmm1                  \n"                             \
   1396     "packuswb   %%xmm2,%%xmm2                  \n"                             \
   1397 
   1398 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
   1399                                 const uint8* u_buf,
   1400                                 const uint8* v_buf,
   1401                                 uint8* argb_buf,
   1402                                 int width) {
   1403   asm volatile (
   1404     "sub       %[u_buf],%[v_buf]               \n"
   1405     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1406     "pxor      %%xmm4,%%xmm4                   \n"
   1407     ".p2align  4                               \n"
   1408   "1:                                          \n"
   1409     READYUV444
   1410     YUVTORGB
   1411     "punpcklbw %%xmm1,%%xmm0                   \n"
   1412     "punpcklbw %%xmm5,%%xmm2                   \n"
   1413     "movdqa    %%xmm0,%%xmm1                   \n"
   1414     "punpcklwd %%xmm2,%%xmm0                   \n"
   1415     "punpckhwd %%xmm2,%%xmm1                   \n"
   1416     "movdqa    %%xmm0,(%[argb_buf])            \n"
   1417     "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
   1418     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1419     "sub       $0x8,%[width]                   \n"
   1420     "jg        1b                              \n"
   1421   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1422     [u_buf]"+r"(u_buf),    // %[u_buf]
   1423     [v_buf]"+r"(v_buf),    // %[v_buf]
   1424     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1425     [width]"+rm"(width)    // %[width]
   1426   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1427   : "memory", "cc"
   1428 #if defined(__SSE2__)
   1429     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1430 #endif
   1431   );
   1432 }
   1433 
   1434 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
   1435                                 const uint8* u_buf,
   1436                                 const uint8* v_buf,
   1437                                 uint8* argb_buf,
   1438                                 int width) {
   1439   asm volatile (
   1440     "sub       %[u_buf],%[v_buf]               \n"
   1441     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1442     "pxor      %%xmm4,%%xmm4                   \n"
   1443     ".p2align  4                               \n"
   1444   "1:                                          \n"
   1445     READYUV422
   1446     YUVTORGB
   1447     "punpcklbw %%xmm1,%%xmm0                   \n"
   1448     "punpcklbw %%xmm5,%%xmm2                   \n"
   1449     "movdqa    %%xmm0,%%xmm1                   \n"
   1450     "punpcklwd %%xmm2,%%xmm0                   \n"
   1451     "punpckhwd %%xmm2,%%xmm1                   \n"
   1452     "movdqa    %%xmm0,(%[argb_buf])            \n"
   1453     "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
   1454     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1455     "sub       $0x8,%[width]                   \n"
   1456     "jg        1b                              \n"
   1457   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1458     [u_buf]"+r"(u_buf),    // %[u_buf]
   1459     [v_buf]"+r"(v_buf),    // %[v_buf]
   1460     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1461     [width]"+rm"(width)    // %[width]
   1462   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1463   : "memory", "cc"
   1464 #if defined(__SSE2__)
   1465     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1466 #endif
   1467   );
   1468 }
   1469 
   1470 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
   1471                                 const uint8* u_buf,
   1472                                 const uint8* v_buf,
   1473                                 uint8* argb_buf,
   1474                                 int width) {
   1475   asm volatile (
   1476     "sub       %[u_buf],%[v_buf]               \n"
   1477     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1478     "pxor      %%xmm4,%%xmm4                   \n"
   1479     ".p2align  4                               \n"
   1480   "1:                                          \n"
   1481     READYUV411
   1482     YUVTORGB
   1483     "punpcklbw %%xmm1,%%xmm0                   \n"
   1484     "punpcklbw %%xmm5,%%xmm2                   \n"
   1485     "movdqa    %%xmm0,%%xmm1                   \n"
   1486     "punpcklwd %%xmm2,%%xmm0                   \n"
   1487     "punpckhwd %%xmm2,%%xmm1                   \n"
   1488     "movdqa    %%xmm0,(%[argb_buf])            \n"
   1489     "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
   1490     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1491     "sub       $0x8,%[width]                   \n"
   1492     "jg        1b                              \n"
   1493   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1494     [u_buf]"+r"(u_buf),    // %[u_buf]
   1495     [v_buf]"+r"(v_buf),    // %[v_buf]
   1496     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1497     [width]"+rm"(width)    // %[width]
   1498   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1499   : "memory", "cc"
   1500 #if defined(__SSE2__)
   1501     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1502 #endif
   1503   );
   1504 }
   1505 
   1506 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
   1507                                 const uint8* uv_buf,
   1508                                 uint8* argb_buf,
   1509                                 int width) {
   1510   asm volatile (
   1511     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1512     "pxor      %%xmm4,%%xmm4                   \n"
   1513     ".p2align  4                               \n"
   1514   "1:                                          \n"
   1515     READNV12
   1516     YUVTORGB
   1517     "punpcklbw %%xmm1,%%xmm0                   \n"
   1518     "punpcklbw %%xmm5,%%xmm2                   \n"
   1519     "movdqa    %%xmm0,%%xmm1                   \n"
   1520     "punpcklwd %%xmm2,%%xmm0                   \n"
   1521     "punpckhwd %%xmm2,%%xmm1                   \n"
   1522     "movdqa    %%xmm0,(%[argb_buf])            \n"
   1523     "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
   1524     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1525     "sub       $0x8,%[width]                   \n"
   1526     "jg        1b                              \n"
   1527   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1528     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
   1529     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1530     [width]"+rm"(width)    // %[width]
   1531   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1532   : "memory", "cc"
   1533 #if defined(__SSE2__)
   1534     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1535 #endif
   1536   );
   1537 }
   1538 
   1539 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
   1540                                 const uint8* vu_buf,
   1541                                 uint8* argb_buf,
   1542                                 int width) {
   1543   asm volatile (
   1544     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1545     "pxor      %%xmm4,%%xmm4                   \n"
   1546     ".p2align  4                               \n"
   1547   "1:                                          \n"
   1548     READNV12
   1549     YVUTORGB
   1550     "punpcklbw %%xmm1,%%xmm0                   \n"
   1551     "punpcklbw %%xmm5,%%xmm2                   \n"
   1552     "movdqa    %%xmm0,%%xmm1                   \n"
   1553     "punpcklwd %%xmm2,%%xmm0                   \n"
   1554     "punpckhwd %%xmm2,%%xmm1                   \n"
   1555     "movdqa    %%xmm0,(%[argb_buf])            \n"
   1556     "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
   1557     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1558     "sub       $0x8,%[width]                   \n"
   1559     "jg        1b                              \n"
   1560   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1561     [uv_buf]"+r"(vu_buf),    // %[uv_buf]
   1562     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1563     [width]"+rm"(width)    // %[width]
   1564   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1565   : "memory", "cc"
   1566 #if defined(__SSE2__)
   1567     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1568 #endif
   1569   );
   1570 }
   1571 
   1572 void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1573                                           const uint8* u_buf,
   1574                                           const uint8* v_buf,
   1575                                           uint8* argb_buf,
   1576                                           int width) {
   1577   asm volatile (
   1578     "sub       %[u_buf],%[v_buf]               \n"
   1579     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1580     "pxor      %%xmm4,%%xmm4                   \n"
   1581     ".p2align  4                               \n"
   1582   "1:                                          \n"
   1583     READYUV444
   1584     YUVTORGB
   1585     "punpcklbw %%xmm1,%%xmm0                   \n"
   1586     "punpcklbw %%xmm5,%%xmm2                   \n"
   1587     "movdqa    %%xmm0,%%xmm1                   \n"
   1588     "punpcklwd %%xmm2,%%xmm0                   \n"
   1589     "punpckhwd %%xmm2,%%xmm1                   \n"
   1590     "movdqu    %%xmm0,(%[argb_buf])            \n"
   1591     "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
   1592     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1593     "sub       $0x8,%[width]                   \n"
   1594     "jg        1b                              \n"
   1595   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1596     [u_buf]"+r"(u_buf),    // %[u_buf]
   1597     [v_buf]"+r"(v_buf),    // %[v_buf]
   1598     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1599     [width]"+rm"(width)    // %[width]
   1600   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1601   : "memory", "cc"
   1602 #if defined(__SSE2__)
   1603     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1604 #endif
   1605   );
   1606 }
   1607 
   1608 void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1609                                           const uint8* u_buf,
   1610                                           const uint8* v_buf,
   1611                                           uint8* argb_buf,
   1612                                           int width) {
   1613   asm volatile (
   1614     "sub       %[u_buf],%[v_buf]               \n"
   1615     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1616     "pxor      %%xmm4,%%xmm4                   \n"
   1617     ".p2align  4                               \n"
   1618   "1:                                          \n"
   1619     READYUV422
   1620     YUVTORGB
   1621     "punpcklbw %%xmm1,%%xmm0                   \n"
   1622     "punpcklbw %%xmm5,%%xmm2                   \n"
   1623     "movdqa    %%xmm0,%%xmm1                   \n"
   1624     "punpcklwd %%xmm2,%%xmm0                   \n"
   1625     "punpckhwd %%xmm2,%%xmm1                   \n"
   1626     "movdqu    %%xmm0,(%[argb_buf])            \n"
   1627     "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
   1628     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1629     "sub       $0x8,%[width]                   \n"
   1630     "jg        1b                              \n"
   1631   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1632     [u_buf]"+r"(u_buf),    // %[u_buf]
   1633     [v_buf]"+r"(v_buf),    // %[v_buf]
   1634     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1635     [width]"+rm"(width)    // %[width]
   1636   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1637   : "memory", "cc"
   1638 #if defined(__SSE2__)
   1639     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1640 #endif
   1641   );
   1642 }
   1643 
   1644 void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1645                                           const uint8* u_buf,
   1646                                           const uint8* v_buf,
   1647                                           uint8* argb_buf,
   1648                                           int width) {
   1649   asm volatile (
   1650     "sub       %[u_buf],%[v_buf]               \n"
   1651     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1652     "pxor      %%xmm4,%%xmm4                   \n"
   1653     ".p2align  4                               \n"
   1654   "1:                                          \n"
   1655     READYUV411
   1656     YUVTORGB
   1657     "punpcklbw %%xmm1,%%xmm0                   \n"
   1658     "punpcklbw %%xmm5,%%xmm2                   \n"
   1659     "movdqa    %%xmm0,%%xmm1                   \n"
   1660     "punpcklwd %%xmm2,%%xmm0                   \n"
   1661     "punpckhwd %%xmm2,%%xmm1                   \n"
   1662     "movdqu    %%xmm0,(%[argb_buf])            \n"
   1663     "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
   1664     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1665     "sub       $0x8,%[width]                   \n"
   1666     "jg        1b                              \n"
   1667   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1668     [u_buf]"+r"(u_buf),    // %[u_buf]
   1669     [v_buf]"+r"(v_buf),    // %[v_buf]
   1670     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1671     [width]"+rm"(width)    // %[width]
   1672   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1673   : "memory", "cc"
   1674 #if defined(__SSE2__)
   1675     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1676 #endif
   1677   );
   1678 }
   1679 
   1680 void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1681                                           const uint8* uv_buf,
   1682                                           uint8* argb_buf,
   1683                                           int width) {
   1684   asm volatile (
   1685     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1686     "pxor      %%xmm4,%%xmm4                   \n"
   1687     ".p2align  4                               \n"
   1688   "1:                                          \n"
   1689     READNV12
   1690     YUVTORGB
   1691     "punpcklbw %%xmm1,%%xmm0                   \n"
   1692     "punpcklbw %%xmm5,%%xmm2                   \n"
   1693     "movdqa    %%xmm0,%%xmm1                   \n"
   1694     "punpcklwd %%xmm2,%%xmm0                   \n"
   1695     "punpckhwd %%xmm2,%%xmm1                   \n"
   1696     "movdqu    %%xmm0,(%[argb_buf])            \n"
   1697     "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
   1698     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1699     "sub       $0x8,%[width]                   \n"
   1700     "jg        1b                              \n"
   1701   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1702     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
   1703     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1704     [width]"+rm"(width)    // %[width]
   1705   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1706   : "memory", "cc"
   1707 #if defined(__SSE2__)
   1708     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1709 #endif
   1710   );
   1711 }
   1712 
   1713 void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   1714                                           const uint8* vu_buf,
   1715                                           uint8* argb_buf,
   1716                                           int width) {
   1717   asm volatile (
   1718     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1719     "pxor      %%xmm4,%%xmm4                   \n"
   1720     ".p2align  4                               \n"
   1721   "1:                                          \n"
   1722     READNV12
   1723     YVUTORGB
   1724     "punpcklbw %%xmm1,%%xmm0                   \n"
   1725     "punpcklbw %%xmm5,%%xmm2                   \n"
   1726     "movdqa    %%xmm0,%%xmm1                   \n"
   1727     "punpcklwd %%xmm2,%%xmm0                   \n"
   1728     "punpckhwd %%xmm2,%%xmm1                   \n"
   1729     "movdqu    %%xmm0,(%[argb_buf])            \n"
   1730     "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
   1731     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1732     "sub       $0x8,%[width]                   \n"
   1733     "jg        1b                              \n"
   1734   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1735     [uv_buf]"+r"(vu_buf),    // %[uv_buf]
   1736     [argb_buf]"+r"(argb_buf),  // %[argb_buf]
   1737     [width]"+rm"(width)    // %[width]
   1738   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1739   : "memory", "cc"
   1740 #if defined(__SSE2__)
   1741     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1742 #endif
   1743   );
   1744 }
   1745 
   1746 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
   1747                                 const uint8* u_buf,
   1748                                 const uint8* v_buf,
   1749                                 uint8* bgra_buf,
   1750                                 int width) {
   1751   asm volatile (
   1752     "sub       %[u_buf],%[v_buf]               \n"
   1753     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1754     "pxor      %%xmm4,%%xmm4                   \n"
   1755     ".p2align  4                               \n"
   1756   "1:                                          \n"
   1757     READYUV422
   1758     YUVTORGB
   1759     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1760     "punpcklbw %%xmm0,%%xmm1                   \n"
   1761     "punpcklbw %%xmm2,%%xmm5                   \n"
   1762     "movdqa    %%xmm5,%%xmm0                   \n"
   1763     "punpcklwd %%xmm1,%%xmm5                   \n"
   1764     "punpckhwd %%xmm1,%%xmm0                   \n"
   1765     "movdqa    %%xmm5,(%[argb_buf])            \n"
   1766     "movdqa    %%xmm0,0x10(%[argb_buf])        \n"
   1767     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1768     "sub       $0x8,%[width]                   \n"
   1769     "jg        1b                              \n"
   1770   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1771     [u_buf]"+r"(u_buf),    // %[u_buf]
   1772     [v_buf]"+r"(v_buf),    // %[v_buf]
   1773     [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
   1774     [width]"+rm"(width)    // %[width]
   1775   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1776   : "memory", "cc"
   1777 #if defined(__SSE2__)
   1778     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1779 #endif
   1780   );
   1781 }
   1782 
   1783 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
   1784                                 const uint8* u_buf,
   1785                                 const uint8* v_buf,
   1786                                 uint8* abgr_buf,
   1787                                 int width) {
   1788   asm volatile (
   1789     "sub       %[u_buf],%[v_buf]               \n"
   1790     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1791     "pxor      %%xmm4,%%xmm4                   \n"
   1792     ".p2align  4                               \n"
   1793   "1:                                          \n"
   1794     READYUV422
   1795     YUVTORGB
   1796     "punpcklbw %%xmm1,%%xmm2                   \n"
   1797     "punpcklbw %%xmm5,%%xmm0                   \n"
   1798     "movdqa    %%xmm2,%%xmm1                   \n"
   1799     "punpcklwd %%xmm0,%%xmm2                   \n"
   1800     "punpckhwd %%xmm0,%%xmm1                   \n"
   1801     "movdqa    %%xmm2,(%[argb_buf])            \n"
   1802     "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
   1803     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1804     "sub       $0x8,%[width]                   \n"
   1805     "jg        1b                              \n"
   1806   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1807     [u_buf]"+r"(u_buf),    // %[u_buf]
   1808     [v_buf]"+r"(v_buf),    // %[v_buf]
   1809     [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
   1810     [width]"+rm"(width)    // %[width]
   1811   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1812   : "memory", "cc"
   1813 #if defined(__SSE2__)
   1814     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1815 #endif
   1816   );
   1817 }
   1818 
   1819 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
   1820                                           const uint8* u_buf,
   1821                                           const uint8* v_buf,
   1822                                           uint8* bgra_buf,
   1823                                           int width) {
   1824   asm volatile (
   1825     "sub       %[u_buf],%[v_buf]               \n"
   1826     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1827     "pxor      %%xmm4,%%xmm4                   \n"
   1828     ".p2align  4                               \n"
   1829   "1:                                          \n"
   1830     READYUV422
   1831     YUVTORGB
   1832     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1833     "punpcklbw %%xmm0,%%xmm1                   \n"
   1834     "punpcklbw %%xmm2,%%xmm5                   \n"
   1835     "movdqa    %%xmm5,%%xmm0                   \n"
   1836     "punpcklwd %%xmm1,%%xmm5                   \n"
   1837     "punpckhwd %%xmm1,%%xmm0                   \n"
   1838     "movdqu    %%xmm5,(%[argb_buf])            \n"
   1839     "movdqu    %%xmm0,0x10(%[argb_buf])        \n"
   1840     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1841     "sub       $0x8,%[width]                   \n"
   1842     "jg        1b                              \n"
   1843   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1844     [u_buf]"+r"(u_buf),    // %[u_buf]
   1845     [v_buf]"+r"(v_buf),    // %[v_buf]
   1846     [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
   1847     [width]"+rm"(width)    // %[width]
   1848   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1849   : "memory", "cc"
   1850 #if defined(__SSE2__)
   1851     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1852 #endif
   1853   );
   1854 }
   1855 
   1856 void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
   1857                                           const uint8* u_buf,
   1858                                           const uint8* v_buf,
   1859                                           uint8* abgr_buf,
   1860                                           int width) {
   1861   asm volatile (
   1862     "sub       %[u_buf],%[v_buf]               \n"
   1863     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1864     "pxor      %%xmm4,%%xmm4                   \n"
   1865     ".p2align  4                               \n"
   1866   "1:                                          \n"
   1867     READYUV422
   1868     YUVTORGB
   1869     "punpcklbw %%xmm1,%%xmm2                   \n"
   1870     "punpcklbw %%xmm5,%%xmm0                   \n"
   1871     "movdqa    %%xmm2,%%xmm1                   \n"
   1872     "punpcklwd %%xmm0,%%xmm2                   \n"
   1873     "punpckhwd %%xmm0,%%xmm1                   \n"
   1874     "movdqu    %%xmm2,(%[argb_buf])            \n"
   1875     "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
   1876     "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
   1877     "sub       $0x8,%[width]                   \n"
   1878     "jg        1b                              \n"
   1879   : [y_buf]"+r"(y_buf),    // %[y_buf]
   1880     [u_buf]"+r"(u_buf),    // %[u_buf]
   1881     [v_buf]"+r"(v_buf),    // %[v_buf]
   1882     [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
   1883     [width]"+rm"(width)    // %[width]
   1884   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   1885   : "memory", "cc"
   1886 #if defined(__SSE2__)
   1887     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1888 #endif
   1889   );
   1890 }
   1891 #endif  // HAS_I422TOARGBROW_SSSE3
   1892 
   1893 #ifdef HAS_YTOARGBROW_SSE2
   1894 void YToARGBRow_SSE2(const uint8* y_buf,
   1895                      uint8* rgb_buf,
   1896                      int width) {
   1897   asm volatile (
   1898     "pcmpeqb   %%xmm4,%%xmm4                   \n"
   1899     "pslld     $0x18,%%xmm4                    \n"
   1900     "mov       $0x10001000,%%eax               \n"
   1901     "movd      %%eax,%%xmm3                    \n"
   1902     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
   1903     "mov       $0x012a012a,%%eax               \n"
   1904     "movd      %%eax,%%xmm2                    \n"
   1905     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
   1906     ".p2align  4                               \n"
   1907   "1:                                          \n"
   1908     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   1909     "movq      (%0),%%xmm0                     \n"
   1910     "lea       0x8(%0),%0                      \n"
   1911     "punpcklbw %%xmm0,%%xmm0                   \n"
   1912     "psubusw   %%xmm3,%%xmm0                   \n"
   1913     "pmulhuw   %%xmm2,%%xmm0                   \n"
   1914     "packuswb  %%xmm0,%%xmm0                   \n"
   1915 
   1916     // Step 2: Weave into ARGB
   1917     "punpcklbw %%xmm0,%%xmm0                   \n"
   1918     "movdqa    %%xmm0,%%xmm1                   \n"
   1919     "punpcklwd %%xmm0,%%xmm0                   \n"
   1920     "punpckhwd %%xmm1,%%xmm1                   \n"
   1921     "por       %%xmm4,%%xmm0                   \n"
   1922     "por       %%xmm4,%%xmm1                   \n"
   1923     "movdqa    %%xmm0,(%1)                     \n"
   1924     "movdqa    %%xmm1,16(%1)                   \n"
   1925     "lea       32(%1),%1                       \n"
   1926 
   1927     "sub       $0x8,%2                         \n"
   1928     "jg        1b                              \n"
   1929   : "+r"(y_buf),    // %0
   1930     "+r"(rgb_buf),  // %1
   1931     "+rm"(width)    // %2
   1932   :
   1933   : "memory", "cc", "eax"
   1934 #if defined(__SSE2__)
   1935     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
   1936 #endif
   1937   );
   1938 }
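
// A scalar sketch (not compiled; the name is hypothetical) of the
// fixed-point luma scaling above: punpcklbw doubles each byte into a word
// (y * 257), psubusw clamps the 16 * 256 bias at zero, and pmulhuw keeps
// the high word of the product.
#if 0
static uint8 ScaleYToGray(uint8 y) {
  uint32 v = static_cast<uint32>(y) * 257;
  v = (v > 0x1000) ? (v - 0x1000) : 0;  // saturating subtract of 0x1000
  v = (v * 298) >> 16;                  // 298 == 0x012a ~= 1.164 * 256
  return static_cast<uint8>(v > 255 ? 255 : v);
}
#endif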
   1939 #endif  // HAS_YTOARGBROW_SSE2
   1940 
   1941 #ifdef HAS_MIRRORROW_SSSE3
   1942 // Shuffle table for reversing the bytes.
   1943 CONST uvec8 kShuffleMirror = {
   1944   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
   1945 };
   1946 
   1947 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   1948   intptr_t temp_width = static_cast<intptr_t>(width);
   1949   asm volatile (
   1950     "movdqa    %3,%%xmm5                       \n"
   1951     "lea       -0x10(%0),%0                    \n"
   1952     ".p2align  4                               \n"
   1953   "1:                                          \n"
   1954     "movdqa    (%0,%2),%%xmm0                  \n"
   1955     "pshufb    %%xmm5,%%xmm0                   \n"
   1956     "sub       $0x10,%2                        \n"
   1957     "movdqa    %%xmm0,(%1)                     \n"
   1958     "lea       0x10(%1),%1                     \n"
   1959     "jg        1b                              \n"
   1960   : "+r"(src),  // %0
   1961     "+r"(dst),  // %1
   1962     "+r"(temp_width)  // %2
   1963   : "m"(kShuffleMirror) // %3
   1964   : "memory", "cc"
   1965 #if defined(__SSE2__)
   1966     , "xmm0", "xmm5"
   1967 #endif
   1968   );
   1969 }
   1970 #endif  // HAS_MIRRORROW_SSSE3
   1971 
   1972 #ifdef HAS_MIRRORROW_SSE2
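// Reverses 16 bytes without SSSE3's pshufb, in three steps: psllw/psrlw/por
// swap the two bytes inside each 16-bit lane, pshuflw/pshufhw with 0x1b
// (0b00011011) reverse the four words in each 64-bit half, and pshufd with
// 0x4e swaps the two halves.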
   1973 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   1974   intptr_t temp_width = static_cast<intptr_t>(width);
   1975   asm volatile (
   1976     "lea       -0x10(%0),%0                    \n"
   1977     ".p2align  4                               \n"
   1978   "1:                                          \n"
   1979     "movdqu    (%0,%2),%%xmm0                  \n"
   1980     "movdqa    %%xmm0,%%xmm1                   \n"
   1981     "psllw     $0x8,%%xmm0                     \n"
   1982     "psrlw     $0x8,%%xmm1                     \n"
   1983     "por       %%xmm1,%%xmm0                   \n"
   1984     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
   1985     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
   1986     "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
   1987     "sub       $0x10,%2                        \n"
   1988     "movdqu    %%xmm0,(%1)                     \n"
   1989     "lea       0x10(%1),%1                     \n"
   1990     "jg        1b                              \n"
   1991   : "+r"(src),  // %0
   1992     "+r"(dst),  // %1
   1993     "+r"(temp_width)  // %2
   1994   :
   1995   : "memory", "cc"
   1996 #if defined(__SSE2__)
   1997     , "xmm0", "xmm1"
   1998 #endif
   1999   );
   2000 }
   2001 #endif  // HAS_MIRRORROW_SSE2
   2002 
   2003 #ifdef HAS_MIRRORROW_UV_SSSE3
    2004 // Shuffle table for reversing and de-interleaving the UV channels.
   2005 CONST uvec8 kShuffleMirrorUV = {
   2006   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
   2007 };
   2008 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
   2009                        int width) {
   2010   intptr_t temp_width = static_cast<intptr_t>(width);
   2011   asm volatile (
   2012     "movdqa    %4,%%xmm1                       \n"
   2013     "lea       -16(%0,%3,2),%0                 \n"
   2014     "sub       %1,%2                           \n"
   2015     ".p2align  4                               \n"
   2016   "1:                                          \n"
   2017     "movdqa    (%0),%%xmm0                     \n"
   2018     "lea       -16(%0),%0                      \n"
   2019     "pshufb    %%xmm1,%%xmm0                   \n"
   2020     "sub       $8,%3                           \n"
   2021     "movlpd    %%xmm0,(%1)                     \n"
   2022     "movhpd    %%xmm0,(%1,%2)                  \n"
   2023     "lea       8(%1),%1                        \n"
   2024     "jg        1b                              \n"
   2025   : "+r"(src),      // %0
   2026     "+r"(dst_u),    // %1
   2027     "+r"(dst_v),    // %2
   2028     "+r"(temp_width)  // %3
   2029   : "m"(kShuffleMirrorUV)  // %4
   2030   : "memory", "cc"
   2031 #if defined(__SSE2__)
   2032     , "xmm0", "xmm1"
   2033 #endif
   2034   );
   2035 }
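
// A scalar sketch (not compiled; the name is hypothetical) of the row above:
// kShuffleMirrorUV reverses the UV pairs and de-interleaves them in a single
// pshufb, leaving U in the low 8 bytes and V in the high 8.
#if 0
static void MirrorUVScalar(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                           int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[(width - 1 - i) * 2 + 0];
    dst_v[i] = src_uv[(width - 1 - i) * 2 + 1];
  }
}
#endif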
   2036 #endif  // HAS_MIRRORROW_UV_SSSE3
   2037 
   2038 #ifdef HAS_ARGBMIRRORROW_SSSE3
    2039 // Shuffle table for reversing ARGB pixels (4 bytes each).
   2040 CONST uvec8 kARGBShuffleMirror = {
   2041   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
   2042 };
   2043 
   2044 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   2045   intptr_t temp_width = static_cast<intptr_t>(width);
   2046   asm volatile (
   2047     "movdqa    %3,%%xmm5                       \n"
   2048     "lea       -0x10(%0),%0                    \n"
   2049     ".p2align  4                               \n"
   2050   "1:                                          \n"
   2051     "movdqa    (%0,%2,4),%%xmm0                \n"
   2052     "pshufb    %%xmm5,%%xmm0                   \n"
   2053     "sub       $0x4,%2                         \n"
   2054     "movdqa    %%xmm0,(%1)                     \n"
   2055     "lea       0x10(%1),%1                     \n"
   2056     "jg        1b                              \n"
   2057   : "+r"(src),  // %0
   2058     "+r"(dst),  // %1
   2059     "+r"(temp_width)  // %2
   2060   : "m"(kARGBShuffleMirror)  // %3
   2061   : "memory", "cc"
   2062 #if defined(__SSE2__)
   2063     , "xmm0", "xmm5"
   2064 #endif
   2065   );
   2066 }
   2067 #endif  // HAS_ARGBMIRRORROW_SSSE3
   2068 
   2069 #ifdef HAS_SPLITUV_SSE2
   2070 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   2071   asm volatile (
   2072     "pcmpeqb    %%xmm5,%%xmm5                    \n"
   2073     "psrlw      $0x8,%%xmm5                      \n"
   2074     "sub        %1,%2                            \n"
   2075     ".p2align  4                               \n"
   2076   "1:                                            \n"
   2077     "movdqa     (%0),%%xmm0                      \n"
   2078     "movdqa     0x10(%0),%%xmm1                  \n"
   2079     "lea        0x20(%0),%0                      \n"
   2080     "movdqa     %%xmm0,%%xmm2                    \n"
   2081     "movdqa     %%xmm1,%%xmm3                    \n"
   2082     "pand       %%xmm5,%%xmm0                    \n"
   2083     "pand       %%xmm5,%%xmm1                    \n"
   2084     "packuswb   %%xmm1,%%xmm0                    \n"
   2085     "psrlw      $0x8,%%xmm2                      \n"
   2086     "psrlw      $0x8,%%xmm3                      \n"
   2087     "packuswb   %%xmm3,%%xmm2                    \n"
   2088     "movdqa     %%xmm0,(%1)                      \n"
   2089     "movdqa     %%xmm2,(%1,%2)                   \n"
   2090     "lea        0x10(%1),%1                      \n"
   2091     "sub        $0x10,%3                         \n"
   2092     "jg         1b                               \n"
   2093   : "+r"(src_uv),     // %0
   2094     "+r"(dst_u),      // %1
   2095     "+r"(dst_v),      // %2
   2096     "+r"(pix)         // %3
   2097   :
   2098   : "memory", "cc"
   2099 #if defined(__SSE2__)
   2100     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   2101 #endif
   2102   );
   2103 }
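
// A scalar sketch (not compiled; the name is hypothetical) of SplitUV_SSE2:
// pand with 0x00ff keeps the even (U) bytes, psrlw 8 exposes the odd (V)
// bytes, and packuswb repacks each set into 16 contiguous bytes.
#if 0
static void SplitUVScalar(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                          int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[i * 2 + 0];
    dst_v[i] = src_uv[i * 2 + 1];
  }
}
#endif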
   2104 #endif  // HAS_SPLITUV_SSE2
   2105 
   2106 #ifdef HAS_COPYROW_SSE2
   2107 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   2108   asm volatile (
   2109     "sub        %0,%1                          \n"
   2110     ".p2align  4                               \n"
   2111   "1:                                          \n"
   2112     "movdqa    (%0),%%xmm0                     \n"
   2113     "movdqa    0x10(%0),%%xmm1                 \n"
   2114     "movdqa    %%xmm0,(%0,%1)                  \n"
   2115     "movdqa    %%xmm1,0x10(%0,%1)              \n"
   2116     "lea       0x20(%0),%0                     \n"
   2117     "sub       $0x20,%2                        \n"
   2118     "jg        1b                              \n"
   2119   : "+r"(src),   // %0
   2120     "+r"(dst),   // %1
   2121     "+r"(count)  // %2
   2122   :
   2123   : "memory", "cc"
   2124 #if defined(__SSE2__)
   2125     , "xmm0", "xmm1"
   2126 #endif
   2127   );
   2128 }
   2129 #endif  // HAS_COPYROW_SSE2
   2130 
   2131 #ifdef HAS_COPYROW_X86
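// "rep movsl" copies width / 4 dwords, so any 1 to 3 byte remainder of an
// unaligned width is left uncopied; callers are expected to pass a width
// that is a multiple of 4.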
   2132 void CopyRow_X86(const uint8* src, uint8* dst, int width) {
   2133   size_t width_tmp = static_cast<size_t>(width);
   2134   asm volatile (
   2135     "shr       $0x2,%2                         \n"
   2136     "rep movsl                                 \n"
   2137   : "+S"(src),  // %0
   2138     "+D"(dst),  // %1
   2139     "+c"(width_tmp) // %2
   2140   :
   2141   : "memory", "cc"
   2142   );
   2143 }
   2144 #endif  // HAS_COPYROW_X86
   2145 
   2146 #ifdef HAS_SETROW_X86
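// SetRow8 stores v32 width / 4 times, so it fills width bytes only when
// width is a multiple of 4 and v32 holds the fill byte replicated into all
// four lanes. SetRows32 instead treats width as a count of 32-bit pixels.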
   2147 void SetRow8_X86(uint8* dst, uint32 v32, int width) {
   2148   size_t width_tmp = static_cast<size_t>(width);
   2149   asm volatile (
   2150     "shr       $0x2,%1                         \n"
   2151     "rep stosl                                 \n"
   2152     : "+D"(dst),       // %0
   2153       "+c"(width_tmp)  // %1
   2154     : "a"(v32)         // %2
   2155     : "memory", "cc");
   2156 }
   2157 
   2158 void SetRows32_X86(uint8* dst, uint32 v32, int width,
   2159                    int dst_stride, int height) {
   2160   for (int y = 0; y < height; ++y) {
   2161     size_t width_tmp = static_cast<size_t>(width);
   2162     uint32* d = reinterpret_cast<uint32*>(dst);
   2163     asm volatile (
   2164       "rep stosl                               \n"
   2165       : "+D"(d),         // %0
   2166         "+c"(width_tmp)  // %1
   2167       : "a"(v32)         // %2
   2168       : "memory", "cc");
   2169     dst += dst_stride;
   2170   }
   2171 }
   2172 #endif  // HAS_SETROW_X86
   2173 
   2174 #ifdef HAS_YUY2TOYROW_SSE2
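// YUY2 packs two pixels into four bytes: Y0 U Y1 V. Luma therefore sits in
// the even bytes (extracted below with pand and 0x00ff) and chroma in the
// odd bytes (psrlw 8). A scalar sketch (not compiled; name hypothetical):
#if 0
static void YUY2ToYScalar(const uint8* src_yuy2, uint8* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_y[i] = src_yuy2[i * 2];  // even bytes hold Y
  }
}
#endif
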
   2175 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
   2176   asm volatile (
   2177     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   2178     "psrlw     $0x8,%%xmm5                     \n"
   2179     ".p2align  4                               \n"
   2180   "1:                                          \n"
   2181     "movdqa    (%0),%%xmm0                     \n"
   2182     "movdqa    0x10(%0),%%xmm1                 \n"
   2183     "lea       0x20(%0),%0                     \n"
   2184     "pand      %%xmm5,%%xmm0                   \n"
   2185     "pand      %%xmm5,%%xmm1                   \n"
   2186     "packuswb  %%xmm1,%%xmm0                   \n"
   2187     "movdqa    %%xmm0,(%1)                     \n"
   2188     "lea       0x10(%1),%1                     \n"
   2189     "sub       $0x10,%2                        \n"
   2190     "jg        1b                              \n"
   2191   : "+r"(src_yuy2),  // %0
   2192     "+r"(dst_y),     // %1
   2193     "+r"(pix)        // %2
   2194   :
   2195   : "memory", "cc"
   2196 #if defined(__SSE2__)
   2197     , "xmm0", "xmm1", "xmm5"
   2198 #endif
   2199   );
   2200 }
   2201 
   2202 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   2203                       uint8* dst_u, uint8* dst_v, int pix) {
   2204   asm volatile (
   2205     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   2206     "psrlw     $0x8,%%xmm5                     \n"
   2207     "sub       %1,%2                           \n"
   2208     ".p2align  4                               \n"
   2209   "1:                                          \n"
   2210     "movdqa    (%0),%%xmm0                     \n"
   2211     "movdqa    0x10(%0),%%xmm1                 \n"
   2212     "movdqa    (%0,%4,1),%%xmm2                \n"
   2213     "movdqa    0x10(%0,%4,1),%%xmm3            \n"
   2214     "lea       0x20(%0),%0                     \n"
   2215     "pavgb     %%xmm2,%%xmm0                   \n"
   2216     "pavgb     %%xmm3,%%xmm1                   \n"
   2217     "psrlw     $0x8,%%xmm0                     \n"
   2218     "psrlw     $0x8,%%xmm1                     \n"
   2219     "packuswb  %%xmm1,%%xmm0                   \n"
   2220     "movdqa    %%xmm0,%%xmm1                   \n"
   2221     "pand      %%xmm5,%%xmm0                   \n"
   2222     "packuswb  %%xmm0,%%xmm0                   \n"
   2223     "psrlw     $0x8,%%xmm1                     \n"
   2224     "packuswb  %%xmm1,%%xmm1                   \n"
   2225     "movq      %%xmm0,(%1)                     \n"
   2226     "movq      %%xmm1,(%1,%2)                  \n"
   2227     "lea       0x8(%1),%1                      \n"
   2228     "sub       $0x10,%3                        \n"
   2229     "jg        1b                              \n"
   2230   : "+r"(src_yuy2),    // %0
   2231     "+r"(dst_u),       // %1
   2232     "+r"(dst_v),       // %2
   2233     "+r"(pix)          // %3
   2234   : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
   2235   : "memory", "cc"
   2236 #if defined(__SSE2__)
   2237     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   2238 #endif
   2239   );
   2240 }
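
// A scalar sketch (not compiled; the name is hypothetical) of the row above:
// chroma from two adjacent rows is combined with a pavgb-style rounding
// average, then split into U and V planes.
#if 0
static void YUY2ToUVScalar(const uint8* src_yuy2, int stride_yuy2,
                           uint8* dst_u, uint8* dst_v, int pix) {
  for (int i = 0; i < pix; i += 2) {  // one U,V pair per two pixels
    const uint8* p = src_yuy2 + i * 2;
    dst_u[i / 2] = static_cast<uint8>((p[1] + p[1 + stride_yuy2] + 1) >> 1);
    dst_v[i / 2] = static_cast<uint8>((p[3] + p[3 + stride_yuy2] + 1) >> 1);
  }
}
#endif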
   2241 
   2242 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   2243                          uint8* dst_u, uint8* dst_v, int pix) {
   2244   asm volatile (
   2245     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   2246     "psrlw     $0x8,%%xmm5                     \n"
   2247     "sub       %1,%2                           \n"
   2248     ".p2align  4                               \n"
   2249   "1:                                          \n"
   2250     "movdqa    (%0),%%xmm0                     \n"
   2251     "movdqa    0x10(%0),%%xmm1                 \n"
   2252     "lea       0x20(%0),%0                     \n"
   2253     "psrlw     $0x8,%%xmm0                     \n"
   2254     "psrlw     $0x8,%%xmm1                     \n"
   2255     "packuswb  %%xmm1,%%xmm0                   \n"
   2256     "movdqa    %%xmm0,%%xmm1                   \n"
   2257     "pand      %%xmm5,%%xmm0                   \n"
   2258     "packuswb  %%xmm0,%%xmm0                   \n"
   2259     "psrlw     $0x8,%%xmm1                     \n"
   2260     "packuswb  %%xmm1,%%xmm1                   \n"
   2261     "movq      %%xmm0,(%1)                     \n"
   2262     "movq      %%xmm1,(%1,%2)                  \n"
   2263     "lea       0x8(%1),%1                      \n"
   2264     "sub       $0x10,%3                        \n"
   2265     "jg        1b                              \n"
   2266   : "+r"(src_yuy2),    // %0
   2267     "+r"(dst_u),       // %1
   2268     "+r"(dst_v),       // %2
   2269     "+r"(pix)          // %3
   2270   :
   2271   : "memory", "cc"
   2272 #if defined(__SSE2__)
   2273     , "xmm0", "xmm1", "xmm5"
   2274 #endif
   2275   );
   2276 }
   2277 
   2278 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
   2279                                uint8* dst_y, int pix) {
   2280   asm volatile (
   2281     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   2282     "psrlw     $0x8,%%xmm5                     \n"
   2283     ".p2align  4                               \n"
   2284   "1:                                          \n"
   2285     "movdqu    (%0),%%xmm0                     \n"
   2286     "movdqu    0x10(%0),%%xmm1                 \n"
   2287     "lea       0x20(%0),%0                     \n"
   2288     "pand      %%xmm5,%%xmm0                   \n"
   2289     "pand      %%xmm5,%%xmm1                   \n"
   2290     "packuswb  %%xmm1,%%xmm0                   \n"
   2291     "sub       $0x10,%2                        \n"
   2292     "movdqu    %%xmm0,(%1)                     \n"
   2293     "lea       0x10(%1),%1                     \n"
   2294     "jg        1b                              \n"
   2295   : "+r"(src_yuy2),  // %0
   2296     "+r"(dst_y),     // %1
   2297     "+r"(pix)        // %2
   2298   :
   2299   : "memory", "cc"
   2300 #if defined(__SSE2__)
   2301     , "xmm0", "xmm1", "xmm5"
   2302 #endif
   2303   );
   2304 }
   2305 
   2306 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
   2307                                 int stride_yuy2,
   2308                                 uint8* dst_u, uint8* dst_v, int pix) {
   2309   asm volatile (
   2310     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   2311     "psrlw     $0x8,%%xmm5                     \n"
   2312     "sub       %1,%2                           \n"
   2313     ".p2align  4                               \n"
   2314   "1:                                          \n"
   2315     "movdqu    (%0),%%xmm0                     \n"
   2316     "movdqu    0x10(%0),%%xmm1                 \n"
   2317     "movdqu    (%0,%4,1),%%xmm2                \n"
   2318     "movdqu    0x10(%0,%4,1),%%xmm3            \n"
   2319     "lea       0x20(%0),%0                     \n"
   2320     "pavgb     %%xmm2,%%xmm0                   \n"
   2321     "pavgb     %%xmm3,%%xmm1                   \n"
   2322     "psrlw     $0x8,%%xmm0                     \n"
   2323     "psrlw     $0x8,%%xmm1                     \n"
   2324     "packuswb  %%xmm1,%%xmm0                   \n"
   2325     "movdqa    %%xmm0,%%xmm1                   \n"
   2326     "pand      %%xmm5,%%xmm0                   \n"
   2327     "packuswb  %%xmm0,%%xmm0                   \n"
   2328     "psrlw     $0x8,%%xmm1                     \n"
   2329     "packuswb  %%xmm1,%%xmm1                   \n"
   2330     "movq      %%xmm0,(%1)                     \n"
   2331     "movq      %%xmm1,(%1,%2)                  \n"
   2332     "lea       0x8(%1),%1                      \n"
   2333     "sub       $0x10,%3                        \n"
   2334     "jg        1b                              \n"
   2335   : "+r"(src_yuy2),    // %0
   2336     "+r"(dst_u),       // %1
   2337     "+r"(dst_v),       // %2
   2338     "+r"(pix)          // %3
   2339   : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
   2340   : "memory", "cc"
   2341 #if defined(__SSE2__)
   2342     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   2343 #endif
   2344   );
   2345 }

void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    (%0,%4,1),%%xmm2                \n"
    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
    "lea       0x20(%0),%0                     \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    (%0,%4,1),%%xmm2                \n"
    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
    "lea       0x20(%0),%0                     \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "41:                                         \n"
    "movdqu    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    (%1),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    (%1),%%xmm1                     \n"
    "lea       0x10(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
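
// Scalar sketch of the blend math above (a hypothetical helper, not the
// library's ARGBBlendRow_C): each channel is src + dst * (256 - src_alpha) /
// 256 with unsigned saturation (paddusb), and destination alpha is forced
// to 255 (the por of xmm4).
static uint32 ARGBBlendPixelRef(uint32 src, uint32 dst) {
  uint32 f = 256 - (src >> 24);  // Inverted alpha plus the paddw of 1.
  uint32 b = (src & 0xff) + (((dst & 0xff) * f) >> 8);
  uint32 g = ((src >> 8) & 0xff) + ((((dst >> 8) & 0xff) * f) >> 8);
  uint32 r = ((src >> 16) & 0xff) + ((((dst >> 16) & 0xff) * f) >> 8);
  if (b > 255) b = 255;
  if (g > 255) g = 255;
  if (r > 255) r = 255;
  return 0xff000000u | (r << 16) | (g << 8) | b;
}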
#endif  // HAS_ARGBBLENDROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
CONST uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
// Blend 4 pixels at a time.
// Same as SSE2, but replaces:
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
//    pshuflw    xmm3, xmm3,0F5h
// with:
//    pshufb     xmm3, kShuffleAlpha // alpha

void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%0                         \n"
    "jne       41f                             \n"
    "test      $0xf,%1                         \n"
    "jne       41f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqa    (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqa    (%1),%%xmm1                     \n"
    "lea       0x10(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"
    "jmp       49f                             \n"

    // 4 pixel unaligned loop.
    ".p2align  2                               \n"
  "41:                                         \n"
    "movdqu    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    (%1),%%xmm1                     \n"
    "lea       0x10(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
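
// Scalar sketch of attenuation (a hypothetical helper): each color channel
// is scaled by alpha; the asm computes (c * 257 * a * 257) >> 24, a close
// fixed-point approximation of c * a / 255, so rounding differs slightly.
static uint32 ARGBAttenuatePixelRef(uint32 argb) {
  uint32 a = argb >> 24;
  uint32 b = (argb & 0xff) * a / 255;
  uint32 g = ((argb >> 8) & 0xff) * a / 255;
  uint32 r = ((argb >> 16) & 0xff) * a / 255;
  return (a << 24) | (r << 16) | (g << 8) | b;
}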
#endif  // HAS_ARGBATTENUATE_SSE2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// aligned to 16 bytes
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movzb     0x3(%0),%3                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0x7(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "movzb     0xb(%0),%3                      \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0xf(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
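
// Scalar sketch of unattenuation (a hypothetical helper): the inverse of
// attenuate, scaling each channel by 255 / alpha with saturation. The table
// fixed_invtbl8, defined elsewhere in the library, holds fixed-point
// reciprocals of alpha so the SIMD path can multiply instead of divide, so
// its rounding differs slightly from this sketch.
static uint32 ARGBUnattenuatePixelRef(uint32 argb) {
  uint32 a = argb >> 24;
  if (a == 0) return argb & 0xff000000u;  // Nothing to rescale.
  uint32 b = (argb & 0xff) * 255 / a;
  uint32 g = ((argb >> 8) & 0xff) * 255 / a;
  uint32 r = ((argb >> 16) & 0xff) * 255 / a;
  if (b > 255) b = 255;
  if (g > 255) g = 255;
  if (r > 255) r = 255;
  return (a << 24) | (r << 16) | (g << 8) | b;
}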
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "sub       %0,%1                           \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "movdqa    0x10(%0),%%xmm3                 \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToGray)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
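
// Scalar sketch of the gray conversion (a hypothetical helper): luma is
// (B * 14 + G * 76 + R * 38) >> 7, the pmaddubsw/phaddw/psrlw sequence
// above, replicated into B, G and R; alpha is preserved.
static uint32 ARGBGrayPixelRef(uint32 argb) {
  uint32 b = argb & 0xff;
  uint32 g = (argb >> 8) & 0xff;
  uint32 r = (argb >> 16) & 0xff;
  uint32 y = (b * 14 + g * 76 + r * 38) >> 7;  // Weights sum to 128.
  return (argb & 0xff000000u) | (y << 16) | (y << 8) | y;
}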
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
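
// Scalar sketch of the sepia transform (a hypothetical helper), matching
// the formulas above; packuswb saturates each result to 255.
static uint32 ARGBSepiaPixelRef(uint32 argb) {
  uint32 b = argb & 0xff;
  uint32 g = (argb >> 8) & 0xff;
  uint32 r = (argb >> 16) & 0xff;
  uint32 sb = (r * 35 + g * 68 + b * 17) >> 7;
  uint32 sg = (r * 45 + g * 88 + b * 22) >> 7;
  uint32 sr = (r * 50 + g * 98 + b * 24) >> 7;
  if (sb > 255) sb = 255;
  if (sg > 255) sg = 255;
  if (sr > 255) sr = 255;
  return (argb & 0xff000000u) | (sr << 16) | (sg << 8) | sb;
}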
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    "movd      (%2),%%xmm2                     \n"
    "movd      0x4(%2),%%xmm3                  \n"
    "movd      0x8(%2),%%xmm4                  \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm6,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm0                     \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "r"(matrix_argb)     // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
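
// Scalar sketch of the color matrix transform (a hypothetical helper).
// matrix_argb holds three rows of signed byte coefficients in B,G,R,A
// order; pmaddubsw includes the alpha byte in each dot product, phaddsw
// sums it, psraw scales by >> 7 and packuswb clamps to 0..255. Source
// alpha is copied through unchanged.
static uint32 ARGBColorMatrixPixelRef(uint32 argb, const int8* m) {
  int b = argb & 0xff;
  int g = (argb >> 8) & 0xff;
  int r = (argb >> 16) & 0xff;
  int a = argb >> 24;
  uint32 out = argb & 0xff000000u;
  for (int i = 0; i < 3; ++i) {  // Output B, G, R channels.
    int v = b * m[i * 4 + 0] + g * m[i * 4 + 1] +
            r * m[i * 4 + 2] + a * m[i * 4 + 3];
    v >>= 7;  // Arithmetic shift, as psraw.
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    out |= static_cast<uint32>(v) << (8 * i);
  }
  return out;
}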
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// aligned to 16 bytes
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqa    (%0),%%xmm7                     \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
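
// Scalar sketch of the quantizer (a hypothetical helper): each color
// channel snaps to a bucket via ((c * scale) >> 16) * interval_size +
// interval_offset, the pmulhuw/pmullw/paddw sequence above; alpha is
// untouched.
static uint8 ARGBQuantizeChannelRef(uint8 c, int scale, int interval_size,
                                    int interval_offset) {
  return static_cast<uint8>(((c * scale) >> 16) * interval_size +
                            interval_offset);
}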
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "sub       %1,%2                           \n"
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqu    (%0),%%xmm2                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2,(%1)                     \n"
    "movdqa    %%xmm3,0x10(%1)                 \n"
    "movdqa    %%xmm4,0x20(%1)                 \n"
    "movdqa    %%xmm5,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movd      (%0),%%xmm2                     \n"
    "lea       0x4(%0),%0                      \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
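
// Scalar sketch of the integral-image row (a hypothetical helper): each
// output int32 is the byte value plus the running sum of this row plus the
// same column of the previous cumulative row, for all 4 ARGB channels.
static void ComputeCumulativeSumRowRef(const uint8* row, int32* cumsum,
                                       const int32* previous_cumsum,
                                       int width) {
  int32 sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}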
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  asm volatile (
    "movd      %5,%%xmm4                       \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "rcpss     %%xmm4,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "psubd     0x10(%0,%4,4),%%xmm1            \n"
    "psubd     0x20(%0,%4,4),%%xmm2            \n"
    "psubd     0x30(%0,%4,4),%%xmm3            \n"
    "lea       0x40(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "psubd     0x10(%1),%%xmm1                 \n"
    "psubd     0x20(%1),%%xmm2                 \n"
    "psubd     0x30(%1),%%xmm3                 \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "paddd     0x10(%1,%4,4),%%xmm1            \n"
    "paddd     0x20(%1,%4,4),%%xmm2            \n"
    "paddd     0x30(%1,%4,4),%%xmm3            \n"
    "lea       0x40(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "lea       0x10(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "lea       0x10(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
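
// Scalar sketch of the box-filter average (a hypothetical helper): with
// cumulative sums, a box's total is topleft - topright - botleft +
// botright; the asm multiplies by an rcpss approximation of 1 / area
// rather than dividing, so low bits may differ.
static void CumulativeSumToAverageRef(const int32* topleft,
                                      const int32* botleft,
                                      int width, int area, uint8* dst,
                                      int count) {
  for (int x = 0; x < count; ++x) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[x * 4 + c] - topleft[(x + width) * 4 + c] -
                  botleft[x * 4 + c] + botleft[(x + width) * 4 + c];
      dst[x * 4 + c] = static_cast<uint8>(sum / area);
    }
  }
}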
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2

#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    "sub       %0,%1                           \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
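
// Scalar sketch of the shade above (a hypothetical helper): each channel is
// scaled by the matching byte of 'value'; pmulhuw of the two byte-doubled
// words followed by psrlw 8 is approximately c * v / 255, truncated.
static uint32 ARGBShadePixelRef(uint32 argb, uint32 value) {
  uint32 out = 0;
  for (int c = 0; c < 4; ++c) {
    uint32 s = (argb >> (8 * c)) & 0xff;
    uint32 v = (value >> (8 * c)) & 0xff;
    uint32 p = (((s * 0x101) * (v * 0x101)) >> 16) >> 8;  // As the asm does.
    out |= p << (8 * c);
  }
  return out;
}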
   3479 #endif  // HAS_ARGBSHADE_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find a 64-bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from a source image with a slope to a row of destination.
// Caveat - in 64 bit, movd is used with a 64-bit gpr because Mac gcc produces
// an error if movq is used; hence the "movd %%xmm0,%1" sequences below.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    "movq      (%3),%%xmm2                     \n"  // (u, v) start.
    "movq      0x8(%3),%%xmm7                  \n"  // (du, dv) step.
    "shl       $0x10,%1                        \n"  // %1 = (stride << 16) | 4.
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
  // 4 pixel loop.
   3514     ".p2align  4                               \n"
   3515   "40:                                         \n"
   3516     "cvttps2dq %%xmm2,%%xmm0                   \n"
   3517     "cvttps2dq %%xmm3,%%xmm1                   \n"
   3518     "packssdw  %%xmm1,%%xmm0                   \n"
   3519     "pmaddwd   %%xmm5,%%xmm0                   \n"
   3520 #if defined(__x86_64__)
   3521     "movd      %%xmm0,%1                       \n"
   3522     "mov       %1,%5                           \n"
   3523     "and       $0x0fffffff,%1                  \n"
   3524     "shr       $32,%5                          \n"
   3525     "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
   3526 #else
   3527     "movd      %%xmm0,%1                       \n"
   3528     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
   3529     "movd      %%xmm0,%5                       \n"
   3530     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
   3531 #endif
   3532     "movd      (%0,%1,1),%%xmm1                \n"
   3533     "movd      (%0,%5,1),%%xmm6                \n"
   3534     "punpckldq %%xmm6,%%xmm1                   \n"
   3535     "addps     %%xmm4,%%xmm2                   \n"
   3536     "movq      %%xmm1,(%2)                     \n"
   3537 #if defined(__x86_64__)
   3538     "movd      %%xmm0,%1                       \n"
   3539     "mov       %1,%5                           \n"
   3540     "and       $0x0fffffff,%1                  \n"
   3541     "shr       $32,%5                          \n"
   3542 #else
   3543     "movd      %%xmm0,%1                       \n"
   3544     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
   3545     "movd      %%xmm0,%5                       \n"
   3546 #endif
   3547     "movd      (%0,%1,1),%%xmm0                \n"
   3548     "movd      (%0,%5,1),%%xmm6                \n"
   3549     "punpckldq %%xmm6,%%xmm0                   \n"
   3550     "addps     %%xmm4,%%xmm3                   \n"
   3551     "sub       $0x4,%4                         \n"
   3552     "movq      %%xmm0,0x08(%2)                 \n"
   3553     "lea       0x10(%2),%2                     \n"
   3554     "jge       40b                             \n"
   3555 
   3556   "49:                                         \n"
   3557     "add       $0x3,%4                         \n"
   3558     "jl        19f                             \n"
   3559 
  // 1 pixel loop.
   3561     ".p2align  4                               \n"
   3562   "10:                                         \n"
   3563     "cvttps2dq %%xmm2,%%xmm0                   \n"
   3564     "packssdw  %%xmm0,%%xmm0                   \n"
   3565     "pmaddwd   %%xmm5,%%xmm0                   \n"
   3566     "addps     %%xmm7,%%xmm2                   \n"
   3567     "movd      %%xmm0,%1                       \n"
   3568 #if defined(__x86_64__)
   3569     "and       $0x0fffffff,%1                  \n"
   3570 #endif
   3571     "movd      (%0,%1,1),%%xmm0                \n"
   3572     "sub       $0x1,%4                         \n"
   3573     "movd      %%xmm0,(%2)                     \n"
   3574     "lea       0x4(%2),%2                      \n"
   3575     "jge       10b                             \n"
   3576   "19:                                         \n"
   3577   : "+r"(src_argb),  // %0
   3578     "+r"(src_argb_stride_temp),  // %1
   3579     "+r"(dst_argb),  // %2
   3580     "+r"(uv_dudv),   // %3
   3581     "+rm"(width),    // %4
   3582     "+r"(temp)   // %5
   3583   :
   3584   : "memory", "cc"
   3585 #if defined(__SSE2__)
   3586     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   3587 #endif
   3588   );
   3589 }
   3590 #endif  // HAS_ARGBAFFINEROW_SSE2
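
// In the loops above, pmaddwd multiplies the truncated (u, v) pair by the
// packed constants (4, stride) to form the source byte offset u*4 + v*stride
// in one instruction.  A scalar sketch of the whole row (illustrative only;
// the helper name is hypothetical and distinct from libyuv's real C path):
static void ARGBAffineRow_C_sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0];  // Starting source position.
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = static_cast<int>(u);  // cvttps2dq truncates toward zero.
    int y = static_cast<int>(v);
    *reinterpret_cast<uint32*>(dst_argb) =
        *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
                                         x * 4);
    dst_argb += 4;
    u += uv_dudv[2];  // Step (du, dv) per destination pixel.
    v += uv_dudv[3];
  }
}
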

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        2f                              \n"
    "cmp       $0x40,%3                        \n"
    "je        3f                              \n"
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"

    // General case: blend the two rows with weights (128 - f, f).
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
    "movdqa    (%1,%4,1),%%xmm2                \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
    "jmp       4f                              \n"

    // Fraction == 0: copy the first row unchanged.
    ".p2align  4                               \n"
  "2:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        2b                              \n"
    "jmp       4f                              \n"

    // Fraction == half: average the two rows with pavgb.
    ".p2align  4                               \n"
  "3:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
    "pavgb     (%1,%4,1),%%xmm0                \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        3b                              \n"
  "4:                                          \n"
    ".p2align  4                               \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(source_y_fraction)  // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
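
// ARGBInterpolateRow_SSSE3 halves the 8-bit fraction so the per-byte-pair
// weights (128 - f, f) fit pmaddubsw, computing
//   dst = (row0 * (128 - f) + row1 * f) >> 7
// with fast paths for f == 0 (plain copy) and f == 64 (pavgb).  A scalar
// sketch (illustrative only; the helper name is hypothetical):
static void ARGBInterpolateRow_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                        ptrdiff_t src_stride, int dst_width,
                                        int source_y_fraction) {
  int f = source_y_fraction >> 1;  // Matches "shr %3": 0..128 scale.
  const uint8* src0 = src_ptr;
  const uint8* src1 = src_ptr + src_stride;
  for (int i = 0; i < dst_width * 4; ++i) {  // 4 bytes per ARGB pixel.
    dst_ptr[i] = static_cast<uint8>((src0[i] * (128 - f) + src1[i] * f) >> 7);
  }
}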

#endif  // !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif