// Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 
     13 #ifdef __cplusplus
     14 namespace libyuv {
     15 extern "C" {
     16 #endif
     17 
     18 // This module is for GCC x86 and x64.
     19 #if !defined(LIBYUV_DISABLE_X86) && \
     20     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
     21 
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// Luma coefficients in memory byte order (B,G,R,A); the alpha weight is 0 so
// alpha never contributes to Y.  Presumably 7-bit fixed-point BT.601 weights
// for limited-range Y -- TODO confirm against the pmaddubsw-based row code.
static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                        13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
// Same layout as kARGBToY, rescaled for full-range (JPEG) Y.
static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                         15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
     32 
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Chroma (U) coefficients in ARGB memory byte order (B,G,R,A); alpha weight
// is always 0.  The *J variants below are the full-range (JPEG) versions.
static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                        112, -74, -38, 0, 112, -74, -38, 0};

static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                         127, -84, -43, 0, 127, -84, -43, 0};

// Chroma (V) coefficients, same layout as kARGBToU.
static vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                         -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
// Same coefficient values as the ARGB tables above, permuted to match the
// BGRA byte order in memory.
static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                        0, 33, 65, 13, 0, 33, 65, 13};

static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                        0, -38, -74, 112, 0, -38, -74, 112};

static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                        0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
// ARGB coefficients permuted for ABGR (R,G,B,A in memory) byte order.
static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                        33, 65, 13, 0, 33, 65, 13, 0};

static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                        -38, -74, 112, 0, -38, -74, 112, 0};

static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                        112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
// ARGB coefficients permuted for RGBA (A,B,G,R in memory) byte order.
static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                        0, 13, 65, 33, 0, 13, 65, 33};

static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                        0, 112, -74, -38, 0, 112, -74, -38};

static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                        0, -18, -94, 112, 0, -18, -94, 112};

// +16 bias added per Y byte (limited-range Y offset).
static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                        16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

// +128 bias per U/V byte to recenter signed chroma into unsigned bytes.
static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                          128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// 16-bit lanes: 128 bias in both bytes (0x8080), used by the JPEG path.
static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                            0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
     90 
#ifdef HAS_RGB24TOARGBROW_SSSE3

// pshufb index tables.  An index value of 128 has its high bit set, which
// makes pshufb write 0 for that destination byte.

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u,  5u,  13u,
                                        6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
// Same expansion as above but with R and B swapped (2,1,0 vs 0,1,2).
static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                      8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24 (drops every 4th byte: alpha).
static uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW (drops alpha and swaps R/B).
static uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.  (Y bytes sit at even offsets; each is duplicated.)
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.  (U/V bytes sit at odd offsets.)
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.  (Y bytes sit at odd offsets.)
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.  (U/V bytes sit at even offsets.)
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.  (Swaps each V,U byte pair while duplicating.)
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
    154 
    155 #ifdef HAS_J400TOARGBROW_SSE2
// Convert a row of grey (J400) pixels to ARGB: each Y byte is replicated
// into B, G and R, and alpha is forced to 0xff.  Processes 8 pixels per
// iteration (loop uses "sub $0x8"); presumably the caller guarantees a
// suitable width -- TODO confirm remainder handling lives in the *_Any wrapper.
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    // xmm5 = 0xff000000 in each dword lane: the alpha mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
    "1:                                        \n"
    // Load 8 Y bytes and widen each byte to 4 identical bytes via unpacks.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    // OR in alpha = 0xff, then store 8 ARGB pixels (32 bytes).
    "por       %%xmm5,%%xmm0                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
    181 #endif  // HAS_J400TOARGBROW_SSE2
    182 
    183 #ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert a row of RGB24 (3 bytes/pixel) to ARGB (4 bytes/pixel), forcing
// alpha to 0xff.  Processes 16 pixels per iteration: 48 source bytes are
// loaded into xmm0/xmm1/xmm3, realigned with palignr so each register holds
// four whole pixels, expanded via the pshufb mask, then stored as 64 bytes.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kShuffleMaskRGB24ToARGB
    LABELALIGN
    "1:                                        \n"
    // Load 48 bytes = 16 RGB24 pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    // Realign 3-byte pixels, expand to 4 bytes, OR in alpha, store.
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    221 
// Convert a row of RAW (RGB24 with R and B swapped) to ARGB with alpha
// forced to 0xff.  Identical structure to RGB24ToARGBRow_SSSE3; only the
// pshufb mask differs (kShuffleMaskRAWToARGB also swaps R/B).
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kShuffleMaskRAWToARGB
    LABELALIGN
    "1:                                        \n"
    // Load 48 bytes = 16 RAW pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    // Realign 3-byte pixels, expand/swap to ARGB, OR in alpha, store.
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    259 
// Convert a row of RAW to RGB24 (swap R and B, both 3 bytes/pixel).
// Processes 8 pixels (24 bytes) per iteration: three overlapping 16-byte
// loads at offsets 0/4/8 each contain one group of 8 source bytes, which the
// three shuffle masks reorder; only the low 8 bytes of each result are kept.
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
   "movdqa     %3,%%xmm3                       \n"
   "movdqa     %4,%%xmm4                       \n"
   "movdqa     %5,%%xmm5                       \n"
    LABELALIGN
    "1:                                        \n"
    // Overlapping loads so each register sees the bytes its mask needs.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
    "lea       " MEMLEA(0x18,0) ",%0           \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    // Store 3 x 8 = 24 output bytes.
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_raw),    // %0
    "+r"(dst_rgb24),  // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
    "m"(kShuffleMaskRAWToRGB24_1),  // %4
    "m"(kShuffleMaskRAWToRGB24_2)   // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    289 
// Convert a row of RGB565 (16 bits/pixel) to ARGB with alpha = 0xff.
// Processes 8 pixels per iteration.  The 5/6-bit fields are widened to
// 8 bits with pmulhuw scale factors so the low bits are replicated from the
// high bits (e.g. 0x0108 per word for the 5-bit fields, 0x2080 for green).
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // xmm5: 5-bit -> 8-bit scale
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"  // xmm6: 6-bit -> 8-bit scale
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800 (red field)
    "psllw     $0xb,%%xmm3                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x07e0 (green field)
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 (alpha byte)
    "psllw     $0x8,%%xmm7                     \n"
    // %1 = dst - 2*src so that (%1,%0,2) addresses dst while only the
    // source pointer is advanced in the loop.
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 RGB565 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    // Expand red (masked in place) and blue (shifted to the top) to 8 bits.
    "pand      %%xmm3,%%xmm1                   \n"
    "psllw     $0xb,%%xmm2                     \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // xmm1 = R:B byte pairs
    // Expand green and OR in the constant alpha byte.
    "pand      %%xmm4,%%xmm0                   \n"
    "pmulhuw   %%xmm6,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"  // xmm0 = A:G byte pairs
    // Interleave into B,G,R,A byte order and store 8 ARGB pixels.
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
    337 
// Convert a row of ARGB1555 (16 bits/pixel, 1-bit alpha in bit 15) to ARGB.
// Processes 8 pixels per iteration.  The 5-bit channels are widened to 8
// bits via pmulhuw scaling; the 1-bit alpha is replicated to 8 bits by
// arithmetic right shift (sign-extends bit 15 across the high byte).
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // xmm5: 5-bit -> 8-bit scale
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"  // xmm6: green scale factor
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800
    "psllw     $0xb,%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm4                   \n"  // xmm4 = 0x03e0 (green field)
    "psrlw     $0x6,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 (alpha byte)
    "psllw     $0x8,%%xmm7                     \n"
    // %1 = dst - 2*src so (%1,%0,2) addresses dst; only src advances.
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB1555 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    // Expand red (<<1 aligns it with the 0xf800 mask) and blue to 8 bits.
    "psllw     $0x1,%%xmm1                     \n"
    "psllw     $0xb,%%xmm2                     \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // xmm1 = R:B byte pairs
    "movdqa    %%xmm0,%%xmm2                   \n"
    // Expand green; replicate the 1-bit alpha into the high byte.
    "pand      %%xmm4,%%xmm0                   \n"
    "psraw     $0x8,%%xmm2                     \n"
    "pmulhuw   %%xmm6,%%xmm0                   \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // xmm0 = A:G byte pairs
    // Interleave into B,G,R,A byte order and store 8 ARGB pixels.
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
    388 
// Convert a row of ARGB4444 (16 bits/pixel, 4 bits per channel) to ARGB.
// Processes 8 pixels per iteration.  Each 4-bit nibble n is widened to the
// 8-bit value (n << 4) | n by splitting low/high nibbles, shifting, and
// OR-ing the copies back together.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"  // xmm4 = low-nibble mask
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // xmm5 = high-nibble mask
    "pslld     $0x4,%%xmm5                     \n"
    // %1 = dst - 2*src so (%1,%0,2) addresses dst; only src advances.
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB4444 pixels
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // xmm0 = low nibbles
    "pand      %%xmm5,%%xmm2                   \n"  // xmm2 = high nibbles
    // Duplicate each nibble into both halves of its byte.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    // Interleave expanded bytes and store 8 ARGB pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    426 
// Convert a row of ARGB to RGB24 by dropping the alpha byte.  Processes 16
// pixels per iteration: each 16-byte register is shuffled down to 12 valid
// bytes (mask zeroes the top 4), then shift/OR packs the four 12-byte
// results into 48 contiguous output bytes.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = kShuffleMaskARGBToRGB24
    LABELALIGN
    "1:                                        \n"
    // Load 16 ARGB pixels (64 bytes).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Strip alpha: 12 valid bytes per register.
    "pshufb    %%xmm6,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    // Pack 4 x 12 bytes into 3 x 16 bytes via byte shifts and ORs.
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
    464 
// Convert a row of ARGB to RAW (RGB24 with R and B swapped), dropping alpha.
// Identical packing structure to ARGBToRGB24Row_SSSE3; only the pshufb mask
// (kShuffleMaskARGBToRAW) differs, swapping R/B while it strips alpha.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = kShuffleMaskARGBToRAW
    LABELALIGN
    "1:                                        \n"
    // Load 16 ARGB pixels (64 bytes).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Strip alpha and swap R/B: 12 valid bytes per register.
    "pshufb    %%xmm6,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    // Pack 4 x 12 bytes into 3 x 16 bytes via byte shifts and ORs.
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
    502 
// Convert a row of ARGB to RGB565.  Processes 4 pixels per iteration:
// each 32-bit pixel is shifted so B/G/R land in the 5/6/5 bit positions,
// masked, OR-ed together, then packssdw narrows the 4 dwords to 4 words.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0x1f per dword (blue)
    "psrld     $0x1b,%%xmm3                    \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x7e0 per dword (green)
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5: red-field mask
    "pslld     $0xb,%%xmm5                     \n"
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    // Align each channel with its 565 field, then mask and merge.
    "pslld     $0x8,%%xmm0                     \n"
    "psrld     $0x3,%%xmm1                     \n"
    "psrld     $0x5,%%xmm2                     \n"
    "psrad     $0x10,%%xmm0                    \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    // Narrow 4 dwords to 4 RGB565 words and store 8 bytes.
    "packssdw  %%xmm0,%%xmm0                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    538 
// Convert a row of ARGB to RGB565, adding a 4-byte dither pattern before
// quantization.  dither4 holds one dither byte per pixel in a group of 4;
// it is expanded so every byte of each pixel gets its dither value, added
// with unsigned saturation (paddusb), then packed exactly as in
// ARGBToRGB565Row_SSE2.  Processes 4 pixels per iteration.
void ARGBToRGB565DitherRow_SSE2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      // Broadcast each dither byte across the 4 bytes of its pixel.
      "movd       %3,%%xmm6                      \n"
      "punpcklbw  %%xmm6,%%xmm6                  \n"
      "movdqa     %%xmm6,%%xmm7                  \n"
      "punpcklwd  %%xmm6,%%xmm6                  \n"
      // NOTE(review): xmm7 is expanded here but never used in the loop
      // below -- looks like leftover from a wider variant; confirm.
      "punpckhwd  %%xmm7,%%xmm7                  \n"
      "pcmpeqb    %%xmm3,%%xmm3                  \n"  // xmm3 = 0x1f (blue)
      "psrld      $0x1b,%%xmm3                   \n"
      "pcmpeqb    %%xmm4,%%xmm4                  \n"  // xmm4 = 0x7e0 (green)
      "psrld      $0x1a,%%xmm4                   \n"
      "pslld      $0x5,%%xmm4                    \n"
      "pcmpeqb    %%xmm5,%%xmm5                  \n"  // xmm5: red-field mask
      "pslld      $0xb,%%xmm5                    \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu     (%0),%%xmm0                    \n"  // 4 ARGB pixels
      "paddusb    %%xmm6,%%xmm0                  \n"  // add dither, saturating
      "movdqa     %%xmm0,%%xmm1                  \n"
      "movdqa     %%xmm0,%%xmm2                  \n"
      // Align each channel with its 565 field, mask, and merge.
      "pslld      $0x8,%%xmm0                    \n"
      "psrld      $0x3,%%xmm1                    \n"
      "psrld      $0x5,%%xmm2                    \n"
      "psrad      $0x10,%%xmm0                   \n"
      "pand       %%xmm3,%%xmm1                  \n"
      "pand       %%xmm4,%%xmm2                  \n"
      "pand       %%xmm5,%%xmm0                  \n"
      "por        %%xmm2,%%xmm1                  \n"
      "por        %%xmm1,%%xmm0                  \n"
      // Narrow to 4 RGB565 words and store 8 bytes.
      "packssdw   %%xmm0,%%xmm0                  \n"
      "lea        0x10(%0),%0                    \n"
      "movq       %%xmm0,(%1)                    \n"
      "lea        0x8(%1),%1                     \n"
      "sub        $0x4,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
    585 
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 version of ARGBToRGB565DitherRow: 8 pixels per iteration.  The 4
// dither bytes are broadcast to all 8 pixels, added with saturation, then
// the channels are shifted/masked into 565 fields and narrowed with
// vpackusdw (vpermq fixes the 128-bit lane order after the pack).
void ARGBToRGB565DitherRow_AVX2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      // Broadcast dither4 and widen each dither byte across its pixel.
      "vbroadcastss %3,%%xmm6                    \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"  // ymm3 = 0x1f (blue)
      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = 0x7e0 (green)
      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
      "vpslld     $0x5,%%ymm4,%%ymm4             \n"
      "vpslld     $0xb,%%ymm3,%%ymm5             \n"  // ymm5 = red-field mask

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"  // 8 ARGB pixels
      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"  // add dither, saturating
      // Align each channel with its 565 field, mask, and merge.
      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
      // Narrow dwords to words; vpermq undoes the in-lane pack order.
      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
      "lea        0x20(%0),%0                    \n"
      "vmovdqu    %%xmm0,(%1)                    \n"  // store 8 RGB565 words
      "lea        0x10(%1),%1                    \n"
      "sub        $0x8,%2                        \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
    631 
// Convert 4 ARGB pixels (16 bytes) per loop iteration to ARGB1555
// (1-bit alpha, 5 bits per color channel).
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Masks: xmm4 = 5-bit blue mask, xmm5 = green mask (bits 5..9),
    // xmm6 = red mask (bits 10..14), xmm7 = alpha bit (bit 15).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1b,%%xmm4                    \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x5,%%xmm5                     \n"
    "movdqa    %%xmm4,%%xmm6                   \n"
    "pslld     $0xa,%%xmm6                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "pslld     $0xf,%%xmm7                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    // Shift each channel into its 1555 position (arithmetic shift on xmm0
    // propagates the alpha sign bit into bit 15), mask, then combine.
    "psrad     $0x10,%%xmm0                    \n"
    "psrld     $0x3,%%xmm1                     \n"
    "psrld     $0x6,%%xmm2                     \n"
    "psrld     $0x9,%%xmm3                     \n"
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
    673 
// Convert 4 ARGB pixels (16 bytes) per loop iteration to ARGB4444
// (4 bits per channel), keeping the high nibble of each 8-bit channel.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm4 = 0xf000 per word (high-nibble mask), xmm3 = 0x00f0 per word.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xc,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    // Select the two nibble sets, align them, merge, and pack to bytes.
    "pand      %%xmm3,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
    702 #endif  // HAS_RGB24TOARGBROW_SSSE3
    703 
    704 #ifdef HAS_ARGBTOYROW_SSSE3
    705 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// 16 pixels per iteration: pmaddubsw with kARGBToY (B=13,G=65,R=33 per the
// table at the top of this file), horizontal add, >>7, then add the kAddY16
// bias (presumably +16 per byte for limited-range Y — declared elsewhere).
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    740 #endif  // HAS_ARGBTOYROW_SSSE3
    741 
    742 #ifdef HAS_ARGBTOYJROW_SSSE3
    743 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
    744 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// 16 pixels per iteration with the full-range JPeg coefficients kARGBToYJ
// (B=15,G=75,R=38); kAddYJ64 is added *before* the >>7 shift, i.e. it acts
// as a rounding term rather than a +16 bias.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    780 #endif  // HAS_ARGBTOYJROW_SSSE3
    781 
    782 #ifdef HAS_ARGBTOYROW_AVX2
// vpermd indices to restore pixel order after vphaddw + vpackuswb, which
// operate within 128-bit lanes and therefore mutate element order.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
    785 
    786 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 variant of ARGBToYRow_SSSE3: 32 pixels (128 bytes) per iteration.
// Uses kPermdARGBToY_AVX to undo the per-lane mutation of vphaddw/vpackuswb.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"
    "vbroadcastf128 %4,%%ymm5                  \n"
    "vmovdqu    %5,%%ymm6                      \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
    825 #endif  // HAS_ARGBTOYROW_AVX2
    826 
    827 #ifdef HAS_ARGBTOYJROW_AVX2
    828 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 variant of ARGBToYJRow_SSSE3: 32 pixels per iteration, full-range
// JPeg coefficients, kAddYJ64 added before the >>7 shift for rounding.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"
    "vbroadcastf128 %4,%%ymm5                  \n"
    "vmovdqu    %5,%%ymm6                      \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
    868 #endif  // HAS_ARGBTOYJROW_AVX2
    869 
    870 #ifdef HAS_ARGBTOUVROW_SSSE3
    871 void ARGBToUVRow_SSSE3(const uint8* src_argb0,
    872                        int src_stride_argb,
    873                        uint8* dst_u,
    874                        uint8* dst_v,
    875                        int width) {
    876   asm volatile (
    877     "movdqa    %5,%%xmm3                       \n"
    878     "movdqa    %6,%%xmm4                       \n"
    879     "movdqa    %7,%%xmm5                       \n"
    880     "sub       %1,%2                           \n"
    881 
    882     LABELALIGN
    883     "1:                                        \n"
    884     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    885     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    886     "pavgb     %%xmm7,%%xmm0                   \n"
    887     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    888     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    889     "pavgb     %%xmm7,%%xmm1                   \n"
    890     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    891     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    892     "pavgb     %%xmm7,%%xmm2                   \n"
    893     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    894     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    895     "pavgb     %%xmm7,%%xmm6                   \n"
    896 
    897     "lea       " MEMLEA(0x40,0) ",%0           \n"
    898     "movdqa    %%xmm0,%%xmm7                   \n"
    899     "shufps    $0x88,%%xmm1,%%xmm0             \n"
    900     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    901     "pavgb     %%xmm7,%%xmm0                   \n"
    902     "movdqa    %%xmm2,%%xmm7                   \n"
    903     "shufps    $0x88,%%xmm6,%%xmm2             \n"
    904     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    905     "pavgb     %%xmm7,%%xmm2                   \n"
    906     "movdqa    %%xmm0,%%xmm1                   \n"
    907     "movdqa    %%xmm2,%%xmm6                   \n"
    908     "pmaddubsw %%xmm4,%%xmm0                   \n"
    909     "pmaddubsw %%xmm4,%%xmm2                   \n"
    910     "pmaddubsw %%xmm3,%%xmm1                   \n"
    911     "pmaddubsw %%xmm3,%%xmm6                   \n"
    912     "phaddw    %%xmm2,%%xmm0                   \n"
    913     "phaddw    %%xmm6,%%xmm1                   \n"
    914     "psraw     $0x8,%%xmm0                     \n"
    915     "psraw     $0x8,%%xmm1                     \n"
    916     "packsswb  %%xmm1,%%xmm0                   \n"
    917     "paddb     %%xmm5,%%xmm0                   \n"
    918     "movlps    %%xmm0," MEMACCESS(1) "         \n"
    919     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    920     "lea       " MEMLEA(0x8,1) ",%1            \n"
    921     "sub       $0x10,%3                        \n"
    922     "jg        1b                              \n"
    923   : "+r"(src_argb0),       // %0
    924     "+r"(dst_u),           // %1
    925     "+r"(dst_v),           // %2
    926     "+rm"(width)           // %3
    927   : "r"((intptr_t)(src_stride_argb)), // %4
    928     "m"(kARGBToV),  // %5
    929     "m"(kARGBToU),  // %6
    930     "m"(kAddUV128)  // %7
    931   : "memory", "cc", NACL_R14
    932     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
    933   );
    934 }
    935 #endif  // HAS_ARGBTOUVROW_SSSE3
    936 
    937 #ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb control used after vphaddw + vpacksswb to restore byte order
// within each 128-bit lane (those instructions pack per-lane and mutate
// element order).
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
// AVX2 variant of ARGBToUVRow_SSSE3: 32 ARGB pixels from two adjacent rows
// produce 16 U and 16 V values per iteration. dst_v is addressed as an
// offset from dst_u ("sub %1,%2").
void ARGBToUVRow_AVX2(const uint8* src_argb0,
                      int src_stride_argb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"
    "vbroadcastf128 %6,%%ymm6                  \n"
    "vbroadcastf128 %7,%%ymm7                  \n"
    "sub        %1,%2                          \n"

    LABELALIGN
    "1:                                        \n"
    // Load current row and average with the row at src_stride below.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    // Average horizontally-adjacent pixel pairs.
    "lea        " MEMLEA(0x80,0) ",%0          \n"
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    // U (ymm7 = kARGBToU) and V (ymm6 = kARGBToV) coefficients, >>8, bias.
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpshufb    %8,%%ymm0,%%ymm0               \n"
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"

    // Low half -> U plane, high half -> V plane (at dst_u + offset).
    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),   // %6
    "m"(kARGBToU),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   1003 #endif  // HAS_ARGBTOUVROW_AVX2
   1004 
   1005 #ifdef HAS_ARGBTOUVJROW_AVX2
// JPeg (full-range) variant of ARGBToUVRow_AVX2: uses kARGBToUJ/kARGBToVJ
// coefficients and adds kAddUVJ128 *before* the >>8 shift (rounding) rather
// than adding a bias after packing.
void ARGBToUVJRow_AVX2(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"
    "vbroadcastf128 %6,%%ymm6                  \n"
    "vbroadcastf128 %7,%%ymm7                  \n"
    "sub        %1,%2                          \n"

    LABELALIGN
    "1:                                        \n"
    // Average two adjacent rows, then horizontally-adjacent pixel pairs.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpshufb    %8,%%ymm0,%%ymm0               \n"

    // Low half -> U plane, high half -> V plane (at dst_u + offset).
    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUVJ128),  // %5
    "m"(kARGBToVJ),  // %6
    "m"(kARGBToUJ),  // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   1068 #endif  // HAS_ARGBTOUVJROW_AVX2
   1069 
   1070 #ifdef HAS_ARGBTOUVJROW_SSSE3
   1071 void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
   1072                         int src_stride_argb,
   1073                         uint8* dst_u,
   1074                         uint8* dst_v,
   1075                         int width) {
   1076   asm volatile (
   1077     "movdqa    %5,%%xmm3                       \n"
   1078     "movdqa    %6,%%xmm4                       \n"
   1079     "movdqa    %7,%%xmm5                       \n"
   1080     "sub       %1,%2                           \n"
   1081 
   1082     LABELALIGN
   1083     "1:                                        \n"
   1084     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1085     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1086     "pavgb     %%xmm7,%%xmm0                   \n"
   1087     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1088     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1089     "pavgb     %%xmm7,%%xmm1                   \n"
   1090     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1091     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1092     "pavgb     %%xmm7,%%xmm2                   \n"
   1093     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1094     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1095     "pavgb     %%xmm7,%%xmm6                   \n"
   1096 
   1097     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1098     "movdqa    %%xmm0,%%xmm7                   \n"
   1099     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1100     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1101     "pavgb     %%xmm7,%%xmm0                   \n"
   1102     "movdqa    %%xmm2,%%xmm7                   \n"
   1103     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1104     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1105     "pavgb     %%xmm7,%%xmm2                   \n"
   1106     "movdqa    %%xmm0,%%xmm1                   \n"
   1107     "movdqa    %%xmm2,%%xmm6                   \n"
   1108     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1109     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1110     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1111     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1112     "phaddw    %%xmm2,%%xmm0                   \n"
   1113     "phaddw    %%xmm6,%%xmm1                   \n"
   1114     "paddw     %%xmm5,%%xmm0                   \n"
   1115     "paddw     %%xmm5,%%xmm1                   \n"
   1116     "psraw     $0x8,%%xmm0                     \n"
   1117     "psraw     $0x8,%%xmm1                     \n"
   1118     "packsswb  %%xmm1,%%xmm0                   \n"
   1119     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1120     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1121     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1122     "sub       $0x10,%3                        \n"
   1123     "jg        1b                              \n"
   1124   : "+r"(src_argb0),       // %0
   1125     "+r"(dst_u),           // %1
   1126     "+r"(dst_v),           // %2
   1127     "+rm"(width)           // %3
   1128   : "r"((intptr_t)(src_stride_argb)), // %4
   1129     "m"(kARGBToVJ),  // %5
   1130     "m"(kARGBToUJ),  // %6
   1131     "m"(kAddUVJ128)  // %7
   1132   : "memory", "cc", NACL_R14
   1133     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1134   );
   1135 }
   1136 #endif  // HAS_ARGBTOUVJROW_SSSE3
   1137 
   1138 #ifdef HAS_ARGBTOUV444ROW_SSSE3
   1139 void ARGBToUV444Row_SSSE3(const uint8* src_argb,
   1140                           uint8* dst_u,
   1141                           uint8* dst_v,
   1142                           int width) {
   1143   asm volatile (
   1144     "movdqa    %4,%%xmm3                       \n"
   1145     "movdqa    %5,%%xmm4                       \n"
   1146     "movdqa    %6,%%xmm5                       \n"
   1147     "sub       %1,%2                           \n"
   1148 
   1149     LABELALIGN
   1150     "1:                                        \n"
   1151     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1152     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1153     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1154     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1155     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1156     "pmaddubsw %%xmm4,%%xmm1                   \n"
   1157     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1158     "pmaddubsw %%xmm4,%%xmm6                   \n"
   1159     "phaddw    %%xmm1,%%xmm0                   \n"
   1160     "phaddw    %%xmm6,%%xmm2                   \n"
   1161     "psraw     $0x8,%%xmm0                     \n"
   1162     "psraw     $0x8,%%xmm2                     \n"
   1163     "packsswb  %%xmm2,%%xmm0                   \n"
   1164     "paddb     %%xmm5,%%xmm0                   \n"
   1165     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   1166     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1167     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1168     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1169     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1170     "pmaddubsw %%xmm3,%%xmm0                   \n"
   1171     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1172     "pmaddubsw %%xmm3,%%xmm2                   \n"
   1173     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1174     "phaddw    %%xmm1,%%xmm0                   \n"
   1175     "phaddw    %%xmm6,%%xmm2                   \n"
   1176     "psraw     $0x8,%%xmm0                     \n"
   1177     "psraw     $0x8,%%xmm2                     \n"
   1178     "packsswb  %%xmm2,%%xmm0                   \n"
   1179     "paddb     %%xmm5,%%xmm0                   \n"
   1180     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1181     MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
   1182     "lea       " MEMLEA(0x10,1) ",%1           \n"
   1183     "sub       $0x10,%3                        \n"
   1184     "jg        1b                              \n"
   1185   : "+r"(src_argb),        // %0
   1186     "+r"(dst_u),           // %1
   1187     "+r"(dst_v),           // %2
   1188     "+rm"(width)           // %3
   1189   : "m"(kARGBToV),  // %4
   1190     "m"(kARGBToU),  // %5
   1191     "m"(kAddUV128)  // %6
   1192   : "memory", "cc", NACL_R14
   1193     "xmm0", "xmm1", "xmm2", "xmm6"
   1194   );
   1195 }
   1196 #endif  // HAS_ARGBTOUV444ROW_SSSE3
   1197 
// Same structure as ARGBToYRow_SSSE3 but for BGRA byte order: 16 pixels per
// iteration using the kBGRAToY coefficients, >>7, plus the kAddY16 bias.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1232 
   1233 void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
   1234                        int src_stride_bgra,
   1235                        uint8* dst_u,
   1236                        uint8* dst_v,
   1237                        int width) {
   1238   asm volatile (
   1239     "movdqa    %5,%%xmm3                       \n"
   1240     "movdqa    %6,%%xmm4                       \n"
   1241     "movdqa    %7,%%xmm5                       \n"
   1242     "sub       %1,%2                           \n"
   1243 
   1244     LABELALIGN
   1245     "1:                                        \n"
   1246     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1247     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1248     "pavgb     %%xmm7,%%xmm0                   \n"
   1249     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1250     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1251     "pavgb     %%xmm7,%%xmm1                   \n"
   1252     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1253     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1254     "pavgb     %%xmm7,%%xmm2                   \n"
   1255     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1256     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1257     "pavgb     %%xmm7,%%xmm6                   \n"
   1258 
   1259     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1260     "movdqa    %%xmm0,%%xmm7                   \n"
   1261     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1262     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1263     "pavgb     %%xmm7,%%xmm0                   \n"
   1264     "movdqa    %%xmm2,%%xmm7                   \n"
   1265     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1266     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1267     "pavgb     %%xmm7,%%xmm2                   \n"
   1268     "movdqa    %%xmm0,%%xmm1                   \n"
   1269     "movdqa    %%xmm2,%%xmm6                   \n"
   1270     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1271     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1272     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1273     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1274     "phaddw    %%xmm2,%%xmm0                   \n"
   1275     "phaddw    %%xmm6,%%xmm1                   \n"
   1276     "psraw     $0x8,%%xmm0                     \n"
   1277     "psraw     $0x8,%%xmm1                     \n"
   1278     "packsswb  %%xmm1,%%xmm0                   \n"
   1279     "paddb     %%xmm5,%%xmm0                   \n"
   1280     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1281     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1282     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1283     "sub       $0x10,%3                        \n"
   1284     "jg        1b                              \n"
   1285   : "+r"(src_bgra0),       // %0
   1286     "+r"(dst_u),           // %1
   1287     "+r"(dst_v),           // %2
   1288     "+rm"(width)           // %3
   1289   : "r"((intptr_t)(src_stride_bgra)), // %4
   1290     "m"(kBGRAToV),  // %5
   1291     "m"(kBGRAToU),  // %6
   1292     "m"(kAddUV128)  // %7
   1293   : "memory", "cc", NACL_R14
   1294     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1295   );
   1296 }
   1297 
// Convert 16 ABGR pixels (64 bytes) per loop iteration to 16 Y bytes:
// weighted byte sums with kABGRToY, shift out the coefficient fraction
// bits, then add the kAddY16 bias.  The loop has no tail handling —
// assumes width is a positive multiple of 16 (TODO confirm callers).
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    // xmm5 = kAddY16 bias, xmm4 = kABGRToY coefficients (constant per loop).
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"

    LABELALIGN
    "1:                                        \n"
    // Load 16 4-byte pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Per-pixel weighted sums of the 4 channel bytes -> 16-bit words.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Drop the 7 fraction bits of the fixed-point coefficients.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    // Pack 16 words to 16 bytes and add the luma bias.
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1332 
// Convert 16 RGBA pixels (64 bytes) per loop iteration to 16 Y bytes.
// Identical structure to ABGRToYRow_SSSE3; only the coefficient vector
// (kRGBAToY) differs to account for the channel order.  No tail handling —
// assumes width is a positive multiple of 16 (TODO confirm callers).
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    // xmm5 = kAddY16 bias, xmm4 = kRGBAToY coefficients.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"

    LABELALIGN
    "1:                                        \n"
    // Load 16 4-byte pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted channel sums per pixel -> 16-bit words.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Shift out the 7 coefficient fraction bits.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1367 
   1368 void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
   1369                        int src_stride_abgr,
   1370                        uint8* dst_u,
   1371                        uint8* dst_v,
   1372                        int width) {
   1373   asm volatile (
   1374     "movdqa    %5,%%xmm3                       \n"
   1375     "movdqa    %6,%%xmm4                       \n"
   1376     "movdqa    %7,%%xmm5                       \n"
   1377     "sub       %1,%2                           \n"
   1378 
   1379     LABELALIGN
   1380     "1:                                        \n"
   1381     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1382     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1383     "pavgb     %%xmm7,%%xmm0                   \n"
   1384     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1385     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1386     "pavgb     %%xmm7,%%xmm1                   \n"
   1387     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1388     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1389     "pavgb     %%xmm7,%%xmm2                   \n"
   1390     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1391     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1392     "pavgb     %%xmm7,%%xmm6                   \n"
   1393 
   1394     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1395     "movdqa    %%xmm0,%%xmm7                   \n"
   1396     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1397     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1398     "pavgb     %%xmm7,%%xmm0                   \n"
   1399     "movdqa    %%xmm2,%%xmm7                   \n"
   1400     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1401     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1402     "pavgb     %%xmm7,%%xmm2                   \n"
   1403     "movdqa    %%xmm0,%%xmm1                   \n"
   1404     "movdqa    %%xmm2,%%xmm6                   \n"
   1405     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1406     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1407     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1408     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1409     "phaddw    %%xmm2,%%xmm0                   \n"
   1410     "phaddw    %%xmm6,%%xmm1                   \n"
   1411     "psraw     $0x8,%%xmm0                     \n"
   1412     "psraw     $0x8,%%xmm1                     \n"
   1413     "packsswb  %%xmm1,%%xmm0                   \n"
   1414     "paddb     %%xmm5,%%xmm0                   \n"
   1415     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1416     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1417     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1418     "sub       $0x10,%3                        \n"
   1419     "jg        1b                              \n"
   1420   : "+r"(src_abgr0),       // %0
   1421     "+r"(dst_u),           // %1
   1422     "+r"(dst_v),           // %2
   1423     "+rm"(width)           // %3
   1424   : "r"((intptr_t)(src_stride_abgr)), // %4
   1425     "m"(kABGRToV),  // %5
   1426     "m"(kABGRToU),  // %6
   1427     "m"(kAddUV128)  // %7
   1428   : "memory", "cc", NACL_R14
   1429     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1430   );
   1431 }
   1432 
   1433 void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
   1434                        int src_stride_rgba,
   1435                        uint8* dst_u,
   1436                        uint8* dst_v,
   1437                        int width) {
   1438   asm volatile (
   1439     "movdqa    %5,%%xmm3                       \n"
   1440     "movdqa    %6,%%xmm4                       \n"
   1441     "movdqa    %7,%%xmm5                       \n"
   1442     "sub       %1,%2                           \n"
   1443 
   1444     LABELALIGN
   1445     "1:                                        \n"
   1446     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1447     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1448     "pavgb     %%xmm7,%%xmm0                   \n"
   1449     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1450     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1451     "pavgb     %%xmm7,%%xmm1                   \n"
   1452     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1453     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1454     "pavgb     %%xmm7,%%xmm2                   \n"
   1455     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1456     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1457     "pavgb     %%xmm7,%%xmm6                   \n"
   1458 
   1459     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1460     "movdqa    %%xmm0,%%xmm7                   \n"
   1461     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1462     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1463     "pavgb     %%xmm7,%%xmm0                   \n"
   1464     "movdqa    %%xmm2,%%xmm7                   \n"
   1465     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1466     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1467     "pavgb     %%xmm7,%%xmm2                   \n"
   1468     "movdqa    %%xmm0,%%xmm1                   \n"
   1469     "movdqa    %%xmm2,%%xmm6                   \n"
   1470     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1471     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1472     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1473     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1474     "phaddw    %%xmm2,%%xmm0                   \n"
   1475     "phaddw    %%xmm6,%%xmm1                   \n"
   1476     "psraw     $0x8,%%xmm0                     \n"
   1477     "psraw     $0x8,%%xmm1                     \n"
   1478     "packsswb  %%xmm1,%%xmm0                   \n"
   1479     "paddb     %%xmm5,%%xmm0                   \n"
   1480     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1481     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1482     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1483     "sub       $0x10,%3                        \n"
   1484     "jg        1b                              \n"
   1485   : "+r"(src_rgba0),       // %0
   1486     "+r"(dst_u),           // %1
   1487     "+r"(dst_v),           // %2
   1488     "+rm"(width)           // %3
   1489   : "r"((intptr_t)(src_stride_rgba)), // %4
   1490     "m"(kRGBAToV),  // %5
   1491     "m"(kRGBAToU),  // %6
   1492     "m"(kAddUV128)  // %7
   1493   : "memory", "cc", NACL_R14
   1494     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1495   );
   1496 }
   1497 
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444 (one U/V per pixel): xmm0 <- interleaved U,V bytes,
// xmm4 <- 8 Y bytes with each byte duplicated into a 16-bit word.
// Assumes the caller has rebiased v_buf to (v_buf - u_buf) so V is read
// via u_buf + v_buf indexing (see the "sub %[u_buf],%[v_buf]" in callers).
#define READYUV444 \
  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
   1509 
// Read 4 UV from 422, upsample to 8 UV: interleave U/V bytes then duplicate
// each UV pair (punpcklwd) so every two pixels share one chroma sample.
// xmm4 <- 8 Y bytes duplicated into words.  Requires v_buf rebiased to
// (v_buf - u_buf) by the caller.
#define READYUV422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
   1520 
// Read 4 UV from 422, upsample to 8 UV (as READYUV422), plus load 8 alpha
// bytes from a_buf into xmm5 for STOREARGB.
#define READYUVA422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
   1533 
// Read 4 UV from NV12 (UV already interleaved in memory), upsample to 8 UV
// by duplicating each 16-bit UV pair; xmm4 <- 8 Y duplicated into words.
#define READNV12 \
  "movq       " MEMACCESS([uv_buf]) ",%%xmm0                    \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
   1542 
// Read 4 VU from NV21, upsample to 8 UV.  The pshufb with kShuffleNV21
// (defined elsewhere in this file) presumably both swaps V/U order and
// duplicates the pairs — confirm against the mask's definition.
#define READNV21 \
  "movq       " MEMACCESS([vu_buf]) ",%%xmm0                    \n"            \
    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
   1551 
// Read 16 bytes of packed YUY2: one pshufb mask (kShuffleYUY2Y) extracts the
// 8 Y samples into xmm4, a second (kShuffleYUY2UV) extracts/duplicates the
// 4 UV pairs into xmm0.  Masks are defined elsewhere in this file.
#define READYUY2 \
  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                  \n"            \
    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
   1559 
// Read 16 bytes of packed UYVY: same scheme as READYUY2 but with the
// UYVY-ordered shuffle masks (kShuffleUYVYY / kShuffleUYVYUV).
#define READUYVY \
  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                  \n"            \
    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
   1567 
#if defined(__x86_64__)
// x86_64: preload the seven 16-byte constant vectors of YuvConstants into
// xmm8-xmm14 once per function, outside the pixel loop.
#define YUVTORGB_SETUP(yuvconstants) \
  "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8              \n"            \
    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
// Convert 8 pixels: 8 UV and 8 Y.
// In: xmm0 = interleaved UV, xmm4 = duplicated Y.  Out: three packed 8-byte
// color planes in xmm0/xmm1/xmm2 (each pmaddubsw'ed with one coefficient
// vector, biased, Y-scaled via pmulhuw, >>6, saturated).  Clobbers xmm3.
#define YUVTORGB(yuvconstants)                                    \
  "movdqa     %%xmm0,%%xmm1                                   \n" \
  "movdqa     %%xmm0,%%xmm2                                   \n" \
  "movdqa     %%xmm0,%%xmm3                                   \n" \
  "movdqa     %%xmm11,%%xmm0                                  \n" \
  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
  "psubw      %%xmm1,%%xmm0                                   \n" \
  "movdqa     %%xmm12,%%xmm1                                  \n" \
  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
  "psubw      %%xmm2,%%xmm1                                   \n" \
  "movdqa     %%xmm13,%%xmm2                                  \n" \
  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
  "psubw      %%xmm3,%%xmm2                                   \n" \
  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm1                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"
// Extra registers the 64-bit variant clobbers.  The trailing comma is
// intentional: this token is pasted into clobber lists.
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
// i386: only xmm0-xmm7 exist, so there is no room to cache the constants;
// setup is empty and each use reads them from [yuvconstants] memory.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y (memory-operand variant of the 64-bit
// macro above; same math, same register outputs, clobbers xmm3).
#define YUVTORGB(yuvconstants) \
  "movdqa     %%xmm0,%%xmm1                                     \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "movdqa     %%xmm0,%%xmm3                                   \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
    "psubw      %%xmm1,%%xmm0                                   \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
    "psubw      %%xmm2,%%xmm1                                   \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
    "psubw      %%xmm3,%%xmm2                                   \n"            \
    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
    "paddsw     %%xmm4,%%xmm0                                   \n"            \
    "paddsw     %%xmm4,%%xmm1                                   \n"            \
    "paddsw     %%xmm4,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
#define YUVTORGB_REGS
#endif
   1632 
// Store 8 ARGB values: interleave the three color planes in xmm0/xmm1/xmm2
// plus the alpha bytes in xmm5 into eight 4-byte pixels (32 bytes written,
// unaligned) and advance dst_argb.  Clobbers xmm1.
#define STOREARGB \
  "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
    "movdqa     %%xmm0,%%xmm1                                    \n"           \
    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
   1643 
// Store 8 RGBA values: generates its own opaque alpha (pcmpeqb -> all 0xff
// in xmm5), interleaves with the color planes in RGBA byte order, writes
// 32 unaligned bytes and advances dst_rgba.  Clobbers xmm0, xmm5.
#define STORERGBA \
  "pcmpeqb   %%xmm5,%%xmm5                                       \n"           \
    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
    "movdqa    %%xmm5,%%xmm0                                     \n"           \
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
   1655 
// Convert 8 I444 pixels (planar Y/U/V, no chroma subsampling) per loop
// iteration to 8 ARGB pixels with opaque alpha.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Rebias v_buf so READYUV444 can address V relative to u_buf.
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff: opaque alpha for STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1684 
// Convert 8 I422 pixels per loop iteration to 24-bit RGB24 (no alpha).
// Converts to ARGB layout in registers, then uses two shuffle masks plus
// palignr to squeeze out the alpha bytes, writing 24 bytes per iteration.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5/xmm6 = ARGB->RGB24 shuffle masks (defined elsewhere in file).
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
    // Rebias v_buf so READYUV422 can address V relative to u_buf.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    // Interleave planes into ARGB in xmm0/xmm1, then shuffle to RGB24.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    // 8 + 16 bytes = 24 bytes (8 RGB24 pixels) out.
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    // subl: width may be a 32-bit memory operand on i386 (see constraints).
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    // NOTE(review): forced to memory on i386, presumably due to register
    // pressure with only 8 GPRs — confirm before changing.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   1730 
// Convert 8 I422 pixels (planar Y with 2x1-subsampled U/V) per loop
// iteration to 8 ARGB pixels with opaque alpha.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Rebias v_buf so READYUV422 can address V relative to u_buf.
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all 0xff: opaque alpha for STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1759 
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Convert 8 I422 pixels per loop iteration to 8 ARGB pixels, taking the
// alpha channel from a separate a_buf plane (READYUVA422 loads it into
// xmm5, which STOREARGB interleaves as A).
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Rebias v_buf so READYUVA422 can address V relative to u_buf.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    // subl: width may be a 32-bit memory operand on i386 (see constraints).
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // NOTE(review): forced to memory on i386, presumably register pressure.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3
   1797 
// Convert 8 NV12 pixels (planar Y plus interleaved UV plane) per loop
// iteration to 8 ARGB pixels with opaque alpha.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5 = all 0xff: opaque alpha for STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
   1825 
// Convert 8 NV21 pixels (planar Y plus interleaved VU plane) per loop
// iteration to 8 ARGB pixels with opaque alpha.  READNV21 uses the
// kShuffleNV21 mask to reorder VU for YUVTORGB.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5 = all 0xff: opaque alpha for STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
   1854 
// Convert one row of packed YUY2 to ARGB, 8 pixels per loop iteration,
// using SSSE3.  kShuffleYUY2Y / kShuffleYUY2UV are the pshufb tables used
// by READYUY2 to split Y and UV out of the packed samples.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff bytes (alpha)

    LABELALIGN
    "1:                                        \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
   1882 
// Convert one row of packed UYVY to ARGB, 8 pixels per loop iteration,
// using SSSE3.  kShuffleUYVYY / kShuffleUYVYUV are the pshufb tables used
// by READUYVY to split Y and UV out of the packed samples.
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff bytes (alpha)

    LABELALIGN
    "1:                                        \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
   1910 
// Convert one row of planar I422 (Y + separate U and V planes) to RGBA,
// 8 pixels per loop iteration, using SSSE3.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Turn v_buf into the (v - u) delta so READYUV422 can address both
    // planes from the single u_buf index register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff bytes (alpha)

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1939 
   1940 #endif  // HAS_I422TOARGBROW_SSSE3
   1941 
// Read 16 UV from 444.  Loads 16 U bytes and 16 V bytes (v_buf is expected
// to hold the v-u delta computed by the caller's "sub u_buf,v_buf"),
// interleaves them into ymm0, and loads 16 Y bytes duplicated into words
// in ymm4.  Advances u_buf by 16 and y_buf by 16.
#define READYUV444_AVX2 \
  "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
   1954 
// Read 8 UV from 422, upsample to 16 UV.  Interleaves 8 U with 8 V bytes
// into ymm0 and duplicates each UV pair (vpunpcklwd) so 16 pixels share
// 8 chroma samples; 16 Y bytes are duplicated into words in ymm4.
// Advances u_buf by 8 and y_buf by 16.
#define READYUV422_AVX2 \
  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
   1967 
// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.  Same layout as
// READYUV422_AVX2, plus 16 alpha bytes loaded into ymm5 from a_buf.
// Advances u_buf by 8, y_buf by 16 and a_buf by 16.
#define READYUVA422_AVX2 \
  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
   1983 
// Read 8 UV from NV12, upsample to 16 UV.  The source bytes are already
// interleaved U/V; each pair is duplicated (vpunpcklwd) into ymm0, and
// 16 Y bytes are duplicated into words in ymm4.  Advances uv_buf and
// y_buf by 16 each.
#define READNV12_AVX2 \
  "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                        \n"        \
    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
   1994 
// Read 8 VU from NV21, upsample to 16 UV.  Unlike READNV12_AVX2, the pair
// duplication is done with a vpshufb through kShuffleNV21 — presumably the
// table also swaps V/U into U/V order; confirm against the kShuffleNV21
// definition.  Advances vu_buf and y_buf by 16 each.
#define READNV21_AVX2 \
  "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                        \n"        \
    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
   2005 
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.  The same 32 packed
// bytes are loaded twice and shuffled two ways: kShuffleYUY2Y extracts the
// Y samples into ymm4 and kShuffleYUY2UV extracts/duplicates the UV
// samples into ymm0.  Advances yuy2_buf by 32.
#define READYUY2_AVX2 \
  "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                      \n"        \
    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
   2013 
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.  Mirror of
// READYUY2_AVX2 for the swapped byte order: kShuffleUYVYY extracts Y into
// ymm4 and kShuffleUYVYUV extracts/duplicates UV into ymm0.  Advances
// uyvy_buf by 32.
#define READUYVY_AVX2 \
  "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                     \n"        \
    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
   2021 
// Convert 16 pixels: 16 UV (ymm0) and 16 Y (ymm4) to three planar color
// channels in ymm0/ymm1/ymm2 (packed to bytes in both ymm lanes).
// On x86_64 the seven conversion-constant vectors are pre-loaded into
// ymm8..ymm14 once by YUVTORGB_SETUP_AVX2, and those registers are listed
// in YUVTORGB_REGS_AVX2 for the clobber list.  On ia32 there are only 8
// xmm registers, so YUVTORGB_SETUP_AVX2 is empty and YUVTORGB_AVX2 reads
// the constants from [yuvconstants] memory on every iteration.
#if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
  "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8              \n"           \
    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"

#define YUVTORGB_AVX2(yuvconstants)                                   \
  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n" \
  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"

#define YUVTORGB_REGS_AVX2 \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else  // Convert 16 pixels: 16 UV and 16 Y.

#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB_AVX2(yuvconstants) \
  "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2     \n"        \
    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
#define YUVTORGB_REGS_AVX2
#endif
   2078 
// Store 16 ARGB values.  Interleaves the three channel registers
// ymm0/ymm1/ymm2 produced by YUVTORGB_AVX2 with the alpha bytes in ymm5
// into 64 output bytes, then advances dst_argb by 64.
#define STOREARGB_AVX2 \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                  \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
   2090 
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// Converts planar I444 (full-resolution U and V) to ARGB using AVX2.
void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the (v - u) delta so both chroma planes are addressed
    // via the single u_buf index register.
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff bytes (alpha)

    LABELALIGN
    "1:                                        \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I444TOARGBROW_AVX2
   2124 
#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Converts planar I422 (half-width U and V) to ARGB using AVX2.
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the (v - u) delta so both chroma planes are addressed
    // via the single u_buf index register.
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff bytes (alpha)

    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"

    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2
   2159 
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Unlike the non-alpha variants, there is no vpcmpeqb fill of ymm5 here:
// READYUVA422_AVX2 loads per-pixel alpha from a_buf into ymm5 instead.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    const uint8* a_buf,
                                    uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the (v - u) delta so both chroma planes are addressed
    // via the single u_buf index register.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // 'subl' spells out the 32-bit operand size explicitly — needed because
    // on i386 [width] is constrained to memory (see below), where plain
    // 'sub' would be ambiguous.
    "subl      $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // i386 has too few general registers for five pointers plus width;
    // force width to stay in memory.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2
   2200 
#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Same pipeline as I422ToARGBRow_AVX2, but the store step interleaves the
// channels in RGBA byte order inline instead of using STOREARGB_AVX2.
void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the (v - u) delta so both chroma planes are addressed
    // via the single u_buf index register.
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff bytes (alpha)

    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TORGBAROW_AVX2
   2244 
   2245 #if defined(HAS_NV12TOARGBROW_AVX2)
   2246 // 16 pixels.
   2247 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2248 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
   2249                                const uint8* uv_buf,
   2250                                uint8* dst_argb,
   2251                                const struct YuvConstants* yuvconstants,
   2252                                int width) {
   2253   // clang-format off
   2254   asm volatile (
   2255     YUVTORGB_SETUP_AVX2(yuvconstants)
   2256     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   2257 
   2258     LABELALIGN
   2259     "1:                                        \n"
   2260     READNV12_AVX2
   2261     YUVTORGB_AVX2(yuvconstants)
   2262     STOREARGB_AVX2
   2263     "sub       $0x10,%[width]                  \n"
   2264     "jg        1b                              \n"
   2265     "vzeroupper                                \n"
   2266   : [y_buf]"+r"(y_buf),    // %[y_buf]
   2267     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
   2268     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
   2269     [width]"+rm"(width)    // %[width]
   2270   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   2271     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
   2272     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   2273   );
   2274   // clang-format on
   2275 }
   2276 #endif  // HAS_NV12TOARGBROW_AVX2
   2277 
   2278 #if defined(HAS_NV21TOARGBROW_AVX2)
   2279 // 16 pixels.
   2280 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2281 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
   2282                                const uint8* vu_buf,
   2283                                uint8* dst_argb,
   2284                                const struct YuvConstants* yuvconstants,
   2285                                int width) {
   2286   // clang-format off
   2287   asm volatile (
   2288     YUVTORGB_SETUP_AVX2(yuvconstants)
   2289     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   2290 
   2291     LABELALIGN
   2292     "1:                                        \n"
   2293     READNV21_AVX2
   2294     YUVTORGB_AVX2(yuvconstants)
   2295     STOREARGB_AVX2
   2296     "sub       $0x10,%[width]                  \n"
   2297     "jg        1b                              \n"
   2298     "vzeroupper                                \n"
   2299   : [y_buf]"+r"(y_buf),    // %[y_buf]
   2300     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
   2301     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
   2302     [width]"+rm"(width)    // %[width]
   2303   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
   2304     [kShuffleNV21]"m"(kShuffleNV21)
   2305     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
   2306       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   2307   );
   2308   // clang-format on
   2309 }
   2310 #endif  // HAS_NV21TOARGBROW_AVX2
   2311 
   2312 #if defined(HAS_YUY2TOARGBROW_AVX2)
   2313 // 16 pixels.
   2314 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
   2315 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
   2316                                uint8* dst_argb,
   2317                                const struct YuvConstants* yuvconstants,
   2318                                int width) {
   2319   // clang-format off
   2320   asm volatile (
   2321     YUVTORGB_SETUP_AVX2(yuvconstants)
   2322     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   2323 
   2324     LABELALIGN
   2325     "1:                                        \n"
   2326     READYUY2_AVX2
   2327     YUVTORGB_AVX2(yuvconstants)
   2328     STOREARGB_AVX2
   2329     "sub       $0x10,%[width]                  \n"
   2330     "jg        1b                              \n"
   2331     "vzeroupper                                \n"
   2332   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
   2333     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
   2334     [width]"+rm"(width)    // %[width]
   2335   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
   2336     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
   2337     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
   2338     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
   2339       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   2340   );
   2341   // clang-format on
   2342 }
   2343 #endif  // HAS_YUY2TOARGBROW_AVX2
   2344 
#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Converts one row of packed UYVY to ARGB using AVX2; READUYVY_AVX2 splits
// Y and UV out of the packed samples via the kShuffleUYVY* tables.
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff bytes (alpha)

    LABELALIGN
    "1:                                        \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_UYVYTOARGBROW_AVX2
   2377 
#ifdef HAS_I400TOARGBROW_SSE2
// Convert 8 grey (I400 / Y-only) pixels to ARGB per loop iteration.
// Each output pixel has R=G=B=(y-16)*1.164 clamped, with opaque alpha.
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
    "movd      %%eax,%%xmm2                    \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // xmm2 = scale, broadcast to all words
    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
    "movd      %%eax,%%xmm3                    \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // xmm3 = bias, broadcast to all words
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 alpha mask

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "psubusw   %%xmm3,%%xmm0                   \n"  // unsigned saturating sub clamps at 0
    "psrlw     $6, %%xmm0                      \n"
    "packuswb  %%xmm0,%%xmm0                   \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm4,%%xmm0                   \n"  // set alpha bytes
    "por       %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"

    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_I400TOARGBROW_SSE2
   2423 
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// AVX2 twin of I400ToARGBRow_SSE2: R=G=B=(y-16)*1.164, opaque alpha.
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    // Comments below were previously swapped between the two constants;
    // corrected to match the immediates (and the SSE2 version above).
    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
    "vmovd      %%eax,%%xmm2                   \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"  // ymm2 = scale, broadcast
    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
    "vmovd      %%eax,%%xmm3                   \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"  // ymm3 = bias, broadcast
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpslld     $0x18,%%ymm4,%%ymm4            \n"  // ymm4 = 0xff000000 alpha mask

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
    "lea        " MEMLEA(0x10,0) ",%0          \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"  // unsigned saturating sub clamps at 0
    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"  // set alpha bytes
    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub        $0x10,%2                       \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_I400TOARGBROW_AVX2
   2470 
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                               7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Reverse a row of bytes: dst[i] = src[width - 1 - i].
// Reads 16 bytes at a time from the tail of src while writing dst forward.
// Processes 16 pixels per iteration (callers handle remainders).
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"  // byte-reversal shuffle mask

    LABELALIGN
    "1:                                        \n"
    // Load the last unprocessed 16 bytes: src[temp_width - 16].
    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"  // reverse the 16 bytes
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_SSSE3
   2498 
#ifdef HAS_MIRRORROW_AVX2
// Reverse a row of bytes, 32 at a time.  Same contract as MirrorRow_SSSE3.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Reuse the 16-byte reversal table in both 128-bit lanes.
    "vbroadcastf128 %3,%%ymm5                  \n"

    LABELALIGN
    "1:                                        \n"
    // Load the last unprocessed 32 bytes: src[temp_width - 32].
    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // reverse within each lane
    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"  // swap the two 128-bit lanes
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_AVX2
   2524 
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Low 8 result bytes gather the even (U) source bytes in reverse order;
// high 8 gather the odd (V) source bytes in reverse order.
static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
// Mirror and de-interleave a row of UV pairs into separate U and V planes.
// width is in UV pairs; 8 pairs (16 source bytes) per iteration.
void MirrorUVRow_SSSE3(const uint8* src,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"  // mirror + deinterleave mask
    // Point src at its last 16 bytes: src + width * 2 - 16.
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
    // %2 becomes (dst_v - dst_u) so V stores can address off dst_u.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"  // walk src backwards
    "pshufb    %%xmm1,%%xmm0                   \n"
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes -> dst_u
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $8,%3                           \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_MIRRORUVROW_SSSE3
   2559 
#ifdef HAS_ARGBMIRRORROW_SSE2

// Reverse a row of ARGB pixels (32-bit units), 4 pixels per iteration.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Point src at its last 16 bytes: src + width * 4 - 16.
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"  // reverse the 4 dwords
    "lea       " MEMLEA(-0x10,0) ",%0          \n"  // walk src backwards
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif  // HAS_ARGBMIRRORROW_SSE2
   2585 
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
// Dword permutation {7..0} reverses 8 ARGB pixels in one vpermd.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// Reverse a row of ARGB pixels, 8 pixels per iteration.
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "vmovdqu    %3,%%ymm5                      \n"  // dword reversal indices

    LABELALIGN
    "1:                                        \n"
    // Load the last unprocessed 8 pixels and reverse them in one op.
    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x8,%2                        \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror_AVX2) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_ARGBMIRRORROW_AVX2
   2611 
#ifdef HAS_SPLITUVROW_AVX2
// De-interleave a UV row into separate U and V planes.
// width is in UV pairs; 32 pairs (64 source bytes) per iteration.
void SplitUVRow_AVX2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile (
    // ymm5 = 0x00ff in each word: keeps the even (U) bytes.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    // %2 becomes (dst_v - dst_u) so V stores can address off dst_u.
    "sub        %1,%2                          \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"  // odd bytes (V) to low byte
    "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // isolate even bytes (U)
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // mutates lane order
    "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // unmutate
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)           //  vmovdqu %%ymm2,(%1,%2)
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(width)         // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SPLITUVROW_AVX2
   2651 
#ifdef HAS_SPLITUVROW_SSE2
// De-interleave a UV row into separate U and V planes.
// width is in UV pairs; 16 pairs (32 source bytes) per iteration.
void SplitUVRow_SSE2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile (
    // xmm5 = 0x00ff in each word: keeps the even (U) bytes.
    "pcmpeqb    %%xmm5,%%xmm5                  \n"
    "psrlw      $0x8,%%xmm5                    \n"
    // %2 becomes (dst_v - dst_u) so V stores can address off dst_u.
    "sub        %1,%2                          \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "movdqa     %%xmm0,%%xmm2                  \n"  // keep copies for V
    "movdqa     %%xmm1,%%xmm3                  \n"
    "pand       %%xmm5,%%xmm0                  \n"  // isolate even bytes (U)
    "pand       %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "psrlw      $0x8,%%xmm2                    \n"  // odd bytes (V) to low byte
    "psrlw      $0x8,%%xmm3                    \n"
    "packuswb   %%xmm3,%%xmm2                  \n"
    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)           //  movdqu     %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%3                       \n"
    "jg         1b                             \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(width)         // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SPLITUVROW_SSE2
   2690 
#ifdef HAS_MERGEUVROW_AVX2
// Interleave separate U and V planes into a single UV row.
// width is in UV pairs; 32 pairs (64 output bytes) per iteration.
void MergeUVRow_AVX2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile (
    // %1 becomes (src_v - src_u) so V loads can address off src_u.
    "sub       %0,%1                           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)           //  vmovdqu (%0,%1,1),%%ymm1
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Interleave per 128-bit lane; the four 128-bit stores below write the
    // halves in the right order, undoing the lane mutation.
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_MERGEUVROW_AVX2
   2724 
#ifdef HAS_MERGEUVROW_SSE2
// Interleave separate U and V planes into a single UV row.
// width is in UV pairs; 16 pairs (32 output bytes) per iteration.
void MergeUVRow_SSE2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile (
    // %1 becomes (src_v - src_u) so V loads can address off src_u.
    "sub       %0,%1                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm1,%%xmm0                   \n"  // low 8 UV pairs
    "punpckhbw %%xmm1,%%xmm2                   \n"  // high 8 UV pairs
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_MERGEUVROW_SSE2
   2756 
#ifdef HAS_COPYROW_SSE2
// Copy count bytes, 32 per iteration.  Uses aligned loads/stores (movdqa)
// when BOTH src and dst are 16-byte aligned, otherwise the unaligned path.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    // Fall through to the aligned loop only if src and dst are both aligned.
    "test       $0xf,%0                        \n"
    "jne        2f                             \n"
    "test       $0xf,%1                        \n"
    "jne        2f                             \n"

    LABELALIGN
    "1:                                        \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       9f                              \n"

    // Unaligned copy loop.
    LABELALIGN
  "2:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        2b                              \n"
  "9:                                          \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_COPYROW_SSE2
   2797 
#ifdef HAS_COPYROW_AVX
// Copy count bytes with unaligned 32-byte AVX moves, 64 bytes per iteration.
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x40,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_COPYROW_AVX
   2820 
#ifdef HAS_COPYROW_ERMS
// Multiple of 1.
// Byte copy via "rep movsb"; fast on CPUs with enhanced rep move/store.
// src/dst/width are pinned to rsi/rdi/rcx as the string instruction requires.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile("rep movsb " MEMMOVESTRING(0, 1) "          \n"
               : "+S"(src),       // %0
                 "+D"(dst),       // %1
                 "+c"(width_tmp)  // %2
               :
               : "memory", "cc");
}
#endif  // HAS_COPYROW_ERMS
   2833 
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copy only the alpha channel from src ARGB to dst ARGB, preserving
// dst's RGB bytes.  8 pixels per iteration.
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask), xmm1 = 0x00ffffff (RGB mask).
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"  // src alpha
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"  // dst RGB
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"  // combine
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2
   2870 
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// Copy only the alpha channel from src ARGB to dst ARGB, preserving
// dst's RGB bytes.  16 pixels per iteration.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel.  vpblendvb takes each byte from its
    // memory (dst) operand where the mask byte's high bit is set, i.e.
    // dst RGB is kept and src alpha passes through.
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2
   2900 
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Extract the alpha byte of each ARGB pixel into dst_a.
// 8 pixels (32 source bytes -> 8 alpha bytes) per iteration.
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
    "lea       " MEMLEA(0x20, 0) ", %0         \n"
    "psrld     $0x18, %%xmm0                   \n"  // alpha to low byte of dword
    "psrld     $0x18, %%xmm1                   \n"
    "packssdw  %%xmm1, %%xmm0                  \n"  // dwords -> words
    "packuswb  %%xmm0, %%xmm0                  \n"  // words -> bytes
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8, 1) ", %1          \n"
    "sub       $0x8, %2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_a),     // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
   2927 
#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// Moves each pixel's alpha (byte 3 of the dword) to the dword's low byte;
// 128 means "insert zero" for pshufb.
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

// Extract the alpha byte of each ARGB pixel into dst_a.
// 32 pixels (128 source bytes -> 32 alpha bytes) per iteration.
void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
    // kPermdARGBToY_AVX (defined earlier in this file) supplies the dword
    // permutation that undoes the in-lane mutation of the pack ops below.
    "vmovdqa    %3,%%ymm4                      \n"
    "vbroadcastf128 %4,%%ymm5                  \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ", %%ymm0        \n"
    "vmovdqu   " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n" // vpsrld $0x18, %%ymm0
    "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
    "vmovdqu   " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
    "vmovdqu   " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
    "lea       " MEMLEA(0x80, 0) ", %0         \n"
    "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub        $0x20, %2                      \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_a),     // %1
    "+rm"(width)     // %2
  : "m"(kPermdARGBToY_AVX),  // %3
    "m"(kShuffleAlphaShort_AVX2)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
   2968 
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Copy a row of Y bytes into the alpha channel of dst ARGB, keeping
// dst's RGB bytes.  8 pixels (8 Y bytes) per iteration.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask), xmm1 = 0x00ffffff (RGB mask).
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"

    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm2         \n"  // 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    // Spread each Y into the high byte of a dword.  Note: punpckhwd reads
    // whatever was in xmm3, but those bytes land only in the low word of
    // each dword and are cleared by the pand with the alpha mask below.
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpckhwd %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm2,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"  // Y in alpha position
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"  // dst RGB
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"  // combine
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
   3007 
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// Copy a row of Y bytes into the alpha channel of dst ARGB, keeping
// dst's RGB bytes.  16 pixels (16 Y bytes) per iteration.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: vpblendvb keeps dst's RGB bytes
    // (mask high bit set) and lets the shifted Y through as alpha.
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"

    LABELALIGN
    "1:                                        \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"  // 8 Y bytes -> dwords
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"  // Y to alpha position
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
   3039 
#ifdef HAS_SETROW_X86
// Fill memory with byte v8 using dword "rep stosl" stores.
// Stores width >> 2 dwords, so width is effectively rounded down to a
// multiple of 4 bytes.
void SetRow_X86(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
               : "+D"(dst),       // %0
                 "+c"(width_tmp)  // %1
               : "a"(v32)         // %2
               : "memory", "cc");
}

// Fill width bytes with v8, one byte at a time ("rep stosb";
// fast on CPUs with enhanced rep move/store).
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile("rep stosb " MEMSTORESTRING(al, 0) "        \n"
               : "+D"(dst),       // %0
                 "+c"(width_tmp)  // %1
               : "a"(v8)          // %2
               : "memory", "cc");
}

// Fill width ARGB pixels (32-bit stores) with the value v32.
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
               : "+D"(dst_argb),  // %0
                 "+c"(width_tmp)  // %1
               : "a"(v32)         // %2
               : "memory", "cc");
}
#endif  // HAS_SETROW_X86
   3069 
#ifdef HAS_YUY2TOYROW_SSE2
// Extract the Y plane from a YUY2 (Y0 U Y1 V) row.
// 16 pixels (32 source bytes -> 16 Y bytes) per iteration.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    // xmm5 = 0x00ff in each word: keeps the even bytes, which are Y.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
   3096 
// Extract U and V planes from a YUY2 row, averaging two source rows
// vertically (this row and the one stride_yuy2 bytes below).
// Per 16 pixels: writes 8 U bytes and 8 V bytes.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
                      int stride_yuy2,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    // xmm5 = 0x00ff words: used below to separate U (even) from V (odd).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // %2 becomes (dst_v - dst_u) so V stores can address off dst_u.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes (U/V)
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = UVUV...
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // even bytes = U
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes = V
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   3138 
// Extract U and V planes from a single YUY2 row (no vertical averaging,
// for 4:2:2 output).  Per 16 pixels: writes 8 U bytes and 8 V bytes.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    // xmm5 = 0x00ff words: used below to separate U (even) from V (odd).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // %2 becomes (dst_v - dst_u) so V stores can address off dst_u.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes (U/V)
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = UVUV...
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // even bytes = U
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes = V
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
   3175 
// Extracts the Y channel from one row of UYVY (byte order U0 Y0 V0 Y1 ...).
// Y lives in the odd bytes, so each 16-bit lane is shifted right by 8.
// Processes 16 pixels (32 source bytes -> 16 Y bytes) per iteration.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 UYVY pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes (Y)
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 Y bytes
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   3198 
// Extracts U and V from UYVY (U0 Y0 V0 Y1 ...), vertically averaging two
// rows (this row and the row at stride_uyvy) to produce 420 chroma.
// Each iteration consumes 32 bytes from each row and emits 8 U and 8 V bytes.
void UYVYToUVRow_SSE2(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word mask
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep even bytes (U/V)
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = UVUV... (16 bytes)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // even bytes = U
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes = V
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   3240 
// Splits a single row of UYVY (U0 Y0 V0 Y1 ...) into separate U and V rows,
// with no vertical averaging (422 sampling from one row).
// Each iteration consumes 32 bytes (16 pixels) and emits 8 U and 8 V bytes.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word mask
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 UYVY pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep even bytes (U/V)
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = UVUV... (16 bytes)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // even bytes = U
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes = V
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
   3277 #endif  // HAS_YUY2TOYROW_SSE2
   3278 
   3279 #ifdef HAS_YUY2TOYROW_AVX2
// Extracts the Y channel from one row of YUY2 (Y0 U0 Y1 V0 ...).
// Y lives in the even bytes, so each 16-bit lane is masked with 0x00ff.
// Processes 32 pixels (64 source bytes -> 32 Y bytes) per iteration.
// vpermq after vpackuswb fixes the 128-bit lane interleave of AVX2 packs.
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word mask

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // load 32 YUY2 pixels
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep even bytes (Y)
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // restore lane order
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
   3307 
// Extracts U and V from YUY2 (Y0 U0 Y1 V0 ...), vertically averaging two
// rows (this row and the row at stride_yuy2) to produce 420 chroma.
// Each iteration consumes 64 bytes from each row and emits 16 U and 16 V bytes.
void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
                      int stride_yuy2,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word mask
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // row 0
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep odd bytes (U/V)
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // ymm0 = UVUV...
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // restore lane order
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // even bytes = U
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // odd bytes = V
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // store 16 U bytes
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
   3350 
// Splits a single row of YUY2 (Y0 U0 Y1 V0 ...) into separate U and V rows,
// with no vertical averaging (422 sampling from one row).
// Each iteration consumes 64 bytes (32 pixels) and emits 16 U and 16 V bytes.
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word mask
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // load 32 YUY2 pixels
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep odd bytes (U/V)
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // ymm0 = UVUV...
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // restore lane order
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // even bytes = U
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // odd bytes = V
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // store 16 U bytes
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
   3390 
// Extracts the Y channel from one row of UYVY (U0 Y0 V0 Y1 ...).
// Y lives in the odd bytes, so each 16-bit lane is shifted right by 8.
// Processes 32 pixels (64 source bytes -> 32 Y bytes) per iteration.
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // load 32 UYVY pixels
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep odd bytes (Y)
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // restore lane order
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
// Extracts U and V from UYVY (U0 Y0 V0 Y1 ...), vertically averaging two
// rows (this row and the row at stride_uyvy) to produce 420 chroma.
// Each iteration consumes 64 bytes from each row and emits 16 U and 16 V bytes.
void UYVYToUVRow_AVX2(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word mask
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // row 0
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep even bytes (U/V)
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // ymm0 = UVUV...
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // restore lane order
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // even bytes = U
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // odd bytes = V
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // store 16 U bytes
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
   3457 
// Splits a single row of UYVY (U0 Y0 V0 Y1 ...) into separate U and V rows,
// with no vertical averaging (422 sampling from one row).
// Each iteration consumes 64 bytes (32 pixels) and emits 16 U and 16 V bytes.
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0x00ff per word mask
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // load 32 UYVY pixels
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep even bytes (U/V)
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // ymm0 = UVUV...
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // restore lane order
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // even bytes = U
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // odd bytes = V
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // store 16 U bytes
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
   3497 #endif  // HAS_YUY2TOYROW_AVX2
   3498 
   3499 #ifdef HAS_ARGBBLENDROW_SSSE3
   3500 // Shuffle table for isolating alpha.
// pshufb control: copies each pixel's alpha byte (offsets 3/7/11/15) into the
// low byte of word lanes; 0x80 entries zero the high byte, yielding four
// 16-bit alpha values duplicated per pixel.
static uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                              11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
   3503 
   3504 // Blend 8 pixels at a time
// Alpha-blends src_argb0 over src_argb1 into dst_argb:
//   dst = src0 + src1 * (256 - alpha(src0)) / 256, with dst alpha forced to 255.
// Main loop handles 4 pixels; a trailing 1-pixel loop covers width % 4.
void ARGBBlendRow_SSSE3(const uint8* src_argb0,
                        const uint8* src_argb1,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"  // xmm7 = 0x0001 words (round)
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"  // xmm6 = 0x00ff low-byte mask
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"  // xmm5 = 0xff00 high-byte mask
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 alpha mask
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"  // fewer than 4 pixels left

    // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"  // src0 pixels
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"  // invert alpha: 255 - a
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"  // src1 pixels
    "pshufb    %4,%%xmm3                       \n"  // words of (255 - a)
    "pand      %%xmm6,%%xmm2                   \n"  // src1 even channels
    "paddw     %%xmm7,%%xmm3                   \n"  // 256 - a
    "pmullw    %%xmm3,%%xmm2                   \n"  // src1 * (256 - a)
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"  // src1 odd channels
    "por       %%xmm4,%%xmm0                   \n"  // force result alpha = 255
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"  // /256
    "paddusb   %%xmm2,%%xmm0                   \n"  // add even channels
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // add odd channels
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // restore remainder count
    "jl        99f                             \n"

    // 1 pixel loop: same math as above on a single pixel at a time.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   3585 #endif  // HAS_ARGBBLENDROW_SSSE3
   3586 
   3587 #ifdef HAS_BLENDPLANEROW_SSSE3
   3588 // Blend 8 pixels at a time.
   3589 // unsigned version of math
   3590 // =((A2*C2)+(B2*(255-C2))+255)/256
   3591 // signed version of math
   3592 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Blends two planar rows by a per-pixel alpha row (see signed-math formula
// above): dst = (src0*a + src1*(255-a) + 255) / 256, 8 pixels per iteration.
// All three pointers are rebased relative to `alpha` so a single counter
// register advances every stream.
// NOTE(review): loop counts down by 8, so width is presumably a multiple
// of 8 (caller contract) — confirm against the dispatch code.
void BlendPlaneRow_SSSE3(const uint8* src0,
                         const uint8* src1,
                         const uint8* alpha,
                         uint8* dst,
                         int width) {
  asm volatile(
      "pcmpeqb    %%xmm5,%%xmm5                  \n"
      "psllw      $0x8,%%xmm5                    \n"  // xmm5 = 0xff00 words
      "mov        $0x80808080,%%eax              \n"
      "movd       %%eax,%%xmm6                   \n"
      "pshufd     $0x0,%%xmm6,%%xmm6             \n"  // xmm6 = 0x80 bias bytes
      "mov        $0x807f807f,%%eax              \n"
      "movd       %%eax,%%xmm7                   \n"
      "pshufd     $0x0,%%xmm7,%%xmm7             \n"  // xmm7 = 0x807f rounding words
      "sub        %2,%0                          \n"  // rebase src0 on alpha
      "sub        %2,%1                          \n"  // rebase src1 on alpha
      "sub        %2,%3                          \n"  // rebase dst on alpha

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq       (%2),%%xmm0                    \n"  // 8 alpha bytes
      "punpcklbw  %%xmm0,%%xmm0                  \n"  // a duplicated per word
      "pxor       %%xmm5,%%xmm0                  \n"  // (a, 255-a) pairs
      "movq       (%0,%2,1),%%xmm1               \n"  // 8 src0 bytes
      "movq       (%1,%2,1),%%xmm2               \n"  // 8 src1 bytes
      "punpcklbw  %%xmm2,%%xmm1                  \n"  // interleave src0/src1
      "psubb      %%xmm6,%%xmm1                  \n"  // bias to signed range
      "pmaddubsw  %%xmm1,%%xmm0                  \n"  // a*s0 + (255-a)*s1
      "paddw      %%xmm7,%%xmm0                  \n"  // undo bias + round
      "psrlw      $0x8,%%xmm0                    \n"  // /256
      "packuswb   %%xmm0,%%xmm0                  \n"
      "movq       %%xmm0,(%3,%2,1)               \n"
      "lea        0x8(%2),%2                     \n"
      "sub        $0x8,%4                        \n"
      "jg        1b                              \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
   3637 #endif  // HAS_BLENDPLANEROW_SSSE3
   3638 
   3639 #ifdef HAS_BLENDPLANEROW_AVX2
   3640 // Blend 32 pixels at a time.
   3641 // unsigned version of math
   3642 // =((A2*C2)+(B2*(255-C2))+255)/256
   3643 // signed version of math
   3644 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// AVX2 version of BlendPlaneRow: same signed-math blend as the SSSE3 row
// (see formula above), but 32 pixels per iteration using two unpack halves.
// All three pointers are rebased relative to `alpha` so a single counter
// register advances every stream.
// NOTE(review): loop counts down by 32, so width is presumably a multiple
// of 32 (caller contract) — confirm against the dispatch code.
void BlendPlaneRow_AVX2(const uint8* src0,
                        const uint8* src1,
                        const uint8* alpha,
                        uint8* dst,
                        int width) {
  asm volatile(
      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
      "vpsllw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0xff00 words
      "mov        $0x80808080,%%eax              \n"
      "vmovd      %%eax,%%xmm6                   \n"
      "vbroadcastss %%xmm6,%%ymm6                \n"  // ymm6 = 0x80 bias bytes
      "mov        $0x807f807f,%%eax              \n"
      "vmovd      %%eax,%%xmm7                   \n"
      "vbroadcastss %%xmm7,%%ymm7                \n"  // ymm7 = 0x807f rounding words
      "sub        %2,%0                          \n"  // rebase src0 on alpha
      "sub        %2,%1                          \n"  // rebase src1 on alpha
      "sub        %2,%3                          \n"  // rebase dst on alpha

      // 32 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%2),%%ymm0                    \n"  // 32 alpha bytes
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"  // duplicate a per word
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
      "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"  // (a, 255-a) pairs
      "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
      "vmovdqu    (%0,%2,1),%%ymm1               \n"  // 32 src0 bytes
      "vmovdqu    (%1,%2,1),%%ymm2               \n"  // 32 src1 bytes
      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"  // interleave src0/src1
      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
      "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"  // bias to signed range
      "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"  // a*s0 + (255-a)*s1
      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
      "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"  // undo bias + round
      "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
      "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"  // /256
      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
      "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,(%3,%2,1)               \n"
      "lea        0x20(%2),%2                    \n"
      "sub        $0x20,%4                       \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
   3698 #endif  // HAS_BLENDPLANEROW_AVX2
   3699 
   3700 #ifdef HAS_ARGBATTENUATEROW_SSSE3
   3701 // Shuffle table duplicating alpha
// pshufb controls for attenuation: broadcast each pixel's alpha byte across
// its B/G/R word lanes while zeroing (128u) the alpha lane so alpha itself
// is not attenuated. kShuffleAlpha0 covers pixels 0-1, kShuffleAlpha1
// covers pixels 2-3 of a 4-pixel register.
static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                               7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                               15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
   3706 // Attenuate 4 pixels at a time.
// Multiplies each pixel's B/G/R by its alpha (premultiplied alpha), keeping
// the original alpha byte. 4 pixels per iteration.
// Uses pmulhuw on (channel duplicated into both bytes of a word) so the
// high 16 bits approximate channel * alpha / 255.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"  // xmm3 = 0xff000000 alpha mask
    "movdqa    %3,%%xmm4                       \n"  // alpha shuffle, pixels 0-1
    "movdqa    %4,%%xmm5                       \n"  // alpha shuffle, pixels 2-3

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm4,%%xmm0                   \n"  // alpha words, pixels 0-1
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"  // channels duplicated to words
    "pmulhuw   %%xmm1,%%xmm0                   \n"  // channel * alpha
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "pshufb    %%xmm5,%%xmm1                   \n"  // alpha words, pixels 2-3
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pand      %%xmm3,%%xmm2                   \n"  // keep original alpha
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // merge alpha back in
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   3747 #endif  // HAS_ARGBATTENUATEROW_SSSE3
   3748 
   3749 #ifdef HAS_ARGBATTENUATEROW_AVX2
   3750 // Shuffle table duplicating alpha.
// vpshufb control applied after vpunpck{l,h}bw: selects the duplicated alpha
// word (bytes 6-7 / 14-15) for each pixel's B/G/R lanes and zeroes (128u)
// the alpha lane.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
   3754 // Attenuate 8 pixels at a time.
// AVX2 version of ARGBAttenuateRow: multiplies B/G/R by alpha, keeps the
// original alpha byte. 8 pixels per iteration; dst is addressed as an
// offset from src ("sub %0,%1") so one pointer increments.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // alpha shuffle control
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpslld     $0x18,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff000000 alpha mask
    "sub        %0,%1                          \n"  // %1 = dst - src offset

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"  // 8 ARGB pixels
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"  // channels duplicated to words
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"  // alpha words per pixel
    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // channel * alpha
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"  // keep original alpha
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"  // merge alpha back in
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub        $0x8,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha_AVX2)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   3790 #endif  // HAS_ARGBATTENUATEROW_AVX2
   3791 
   3792 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
   3793 // Unattenuate 4 pixels at a time.
// Reverses premultiplied alpha: scales each pixel's channels by a
// fixed-point reciprocal of its alpha, looked up per pixel from
// fixed_invtbl8 (indexed by the alpha byte). 4 pixels per iteration.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register for the per-pixel alpha byte
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"  // alpha of pixel 0
    "punpcklbw %%xmm0,%%xmm0                   \n"  // pixels 0-1 to words
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"  // alpha of pixel 1
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // broadcast reciprocal
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // channel * (1/alpha)
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"  // alpha of pixel 2
    "punpckhbw %%xmm1,%%xmm1                   \n"  // pixels 2-3 to words
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"  // alpha of pixel 3
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "+r"(width),        // %2
    "=&r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   3837 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
   3838 
   3839 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Byte-pair pattern: replicates word 0 (bytes 0,1) into the first three word
// lanes and word 4 (bytes 8,9) into lanes 4-6, keeping words 3 and 7 (bytes
// 6,7 and 14,15) in place; used with vpshufb to spread each pixel's
// unattenuate factor across its B, G and R words.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 ARGB pixels (32 bytes) at a time.
// Multiplies each pixel's channels by a per-pixel factor from fixed_invtbl8
// indexed by the alpha byte (entries are 4 bytes; presumably a fixed-point
// reciprocal of alpha -- confirm against the table's definition). The gather
// is done with scalar loads rather than VPGATHER (see comments below).
void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;  // Scratch register (%3) holding the current alpha byte.
  asm volatile (
    // dst is addressed as (src + (dst - src)) so one pointer increment
    // advances both rows.
    "sub        %0,%1                          \n"
    "vbroadcastf128 %5,%%ymm5                  \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // replace VPGATHER
    // Scalar gather: fetch the 4-byte table entry for each of the 8 alpha
    // bytes (offsets 0x03,0x07,...,0x1f) and pack all eight into ymm3.
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
    // end of VPGATHER

    // Widen pixels to words, spread each factor across its B/G/R words with
    // the shuffle table, multiply, then repack and store.
    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub        $0x8,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width),         // %2
    "=&r"(alpha)         // %3
  : "r"(fixed_invtbl8),  // %4
    "m"(kUnattenShuffleAlpha_AVX2)  // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   3906 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
   3907 
   3908 #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Luma is a weighted sum of B,G,R (pmaddubsw with kARGBToYJ) plus the
// kAddYJ64 bias before the >>7, then replicated into B, G and R while the
// original alpha channel is preserved.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = luma coefficients.
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = rounding bias.

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Compute 8 luma bytes into the low half of xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Extract the 8 alpha bytes (shift each dword right by 24).
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave to Y,Y and Y,A pairs, then to Y,Y,Y,A pixels.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   3953 #endif  // HAS_ARGBGRAYROW_SSSE3
   3954 
   3955 #ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Coefficients are stored per pixel in memory order B,G,R,A (alpha weight 0)
// so pmaddubsw + phaddw yields one channel per pixel.
static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                             17, 68, 35, 0, 17, 68, 35, 0};

static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                             22, 88, 45, 0, 22, 88, 45, 0};

static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                             24, 98, 50, 0, 24, 98, 50, 0};
   3968 
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place transform: each of B, G, R is a weighted sum of the source
// B,G,R (tables above, >>7); the alpha channel is carried through unchanged.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"  // xmm2 = B coefficients.
    "movdqa    %3,%%xmm3                       \n"  // xmm3 = G coefficients.
    "movdqa    %4,%%xmm4                       \n"  // xmm4 = R coefficients.

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // New B channel for 8 pixels -> low half of xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New G channel, interleaved with B into xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // New R channel.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha bytes (dword >> 24), interleaved with R into xmm5.
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Interleave BG with RA pairs into full B,G,R,A pixels and store back.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   4025 #endif  // HAS_ARGBSEPIAROW_SSSE3
   4026 
   4027 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb supplies 16 signed int8 coefficients: four dwords, one per
// output channel (B,G,R,A), each broadcast to all pixels via pshufd.
// Sums use phaddsw (saturating) and a >>6 arithmetic shift.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
                              uint8* dst_argb,
                              const int8* matrix_argb,
                              int width) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"  // xmm2 = B row of matrix.
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"  // xmm3 = G row.
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"  // xmm4 = R row.
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"  // xmm5 = A row.

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Output B channel (xmm0) and G channel (xmm6), interleaved to BG pairs.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"
    // Output R channel (xmm1) and A channel (xmm6), interleaved to RA pairs.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"
    // Interleave BG with RA into full B,G,R,A pixels and store.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4091 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
   4092 
   4093 #ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// In-place: each color channel becomes
//   ((channel * scale) >> 16) * interval_size + interval_offset
// (pmulhuw keeps the high 16 bits of the scale product). The alpha channel
// is preserved via the 0xff000000 mask built in xmm6.
void ARGBQuantizeRow_SSE2(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    // Broadcast the three scalar parameters to all 8 word lanes.
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for widening.
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"  // xmm6 = alpha mask.

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"  // xmm7 = original alphas.
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"  // Restore alpha channel.
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x4,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4144 #endif  // HAS_ARGBQUANTIZEROW_SSE2
   4145 
   4146 #ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Each byte of each pixel is multiplied by the corresponding byte of
// 'value': result = (pixel * value_byte * 257) >> 16, i.e. approximately
// pixel * value_byte / 255 (pmulhuw on byte-duplicated operands).
void ARGBShadeRow_SSE2(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    // Duplicate each byte of 'value' into a word and fill the register.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Widen pixels by byte duplication, multiply, then narrow back.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
   4182 #endif  // HAS_ARGBSHADEROW_SSE2
   4183 
   4184 #ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per byte: src0 is widened by byte duplication (value * 257) and src1 by
// zero extension, so pmulhuw yields (a * 257 * b) >> 16, approximately
// a * b / 255.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for widening.

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // src0 low: duplicate bytes.
    "punpckhbw %%xmm1,%%xmm1                   \n"  // src0 high.
    "punpcklbw %%xmm5,%%xmm2                   \n"  // src1 low: zero extend.
    "punpckhbw %%xmm5,%%xmm3                   \n"  // src1 high.
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   4222 #endif  // HAS_ARGBMULTIPLYROW_SSE2
   4223 
   4224 #ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Same math as the SSE2 version: per byte, approximately a * b / 255.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = zero for widening.

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"  // src0: duplicate bytes.
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // src1: zero extend.
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__AVX2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
   4263 #endif  // HAS_ARGBMULTIPLYROW_AVX2
   4264 
   4265 #ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturating add (paddusb); clamps at 255.
void ARGBAddRow_SSE2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   4293 #endif  // HAS_ARGBADDROW_SSE2
   4294 
   4295 #ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Per-byte saturating add (vpaddusb); clamps at 255.
void ARGBAddRow_AVX2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
   4323 #endif  // HAS_ARGBADDROW_AVX2
   4324 
   4325 #ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per-byte saturating subtract (psubusb); clamps at 0.
void ARGBSubtractRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   4353 #endif  // HAS_ARGBSUBTRACTROW_SSE2
   4354 
   4355 #ifdef HAS_ARGBSUBTRACTROW_AVX2
   4356 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
   4357 void ARGBSubtractRow_AVX2(const uint8* src_argb0,
   4358                           const uint8* src_argb1,
   4359                           uint8* dst_argb,
   4360                           int width) {
   4361   asm volatile (
   4362     // 4 pixel loop.
   4363     LABELALIGN
   4364     "1:                                        \n"
   4365     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
   4366     "lea        " MEMLEA(0x20,0) ",%0          \n"
   4367     "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
   4368     "lea        " MEMLEA(0x20,1) ",%1          \n"
   4369     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
   4370     "lea        " MEMLEA(0x20,2) ",%2          \n"
   4371     "sub        $0x8,%3                        \n"
   4372     "jg         1b                             \n"
   4373     "vzeroupper                                \n"
   4374   : "+r"(src_argb0),  // %0
   4375     "+r"(src_argb1),  // %1
   4376     "+r"(dst_argb),   // %2
   4377     "+r"(width)       // %3
   4378   :
   4379   : "memory", "cc"
   4380     , "xmm0"
   4381   );
   4382 }
   4383 #endif  // HAS_ARGBSUBTRACTROW_AVX2
   4384 
   4385 #ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// Produces 8 bytes of |gradient| per iteration: each row contributes
// (left - right) widened to 16 bits; the middle row (src_y1) is added twice
// for its weight of 2; the absolute value is taken via pmaxsw against the
// negation, then saturated to bytes.
void SobelXRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
  asm volatile (
    // Convert the other pointers into offsets relative to src_y0 so a
    // single pointer increment advances all four rows.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for widening.

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Row 0: a[0] - a[2].
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // Row 1: b[0] - b[2] (applied with weight 2 below).
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // Row 2: c[0] - c[2].
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    // Sum with middle row counted twice, then absolute value.
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   4440 #endif  // HAS_SOBELXROW_SSE2
   4441 
   4442 #ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Produces 8 bytes of |gradient| per iteration: each column offset (0,1,2)
// contributes (top - bottom) widened to 16 bits; the middle column is added
// twice for its weight of 2; absolute value via pmaxsw against the negation,
// then saturated to bytes.
void SobelYRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width) {
  asm volatile (
    // Convert src_y1 and dst into offsets relative to src_y0.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for widening.

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Column 0: a[0] - b[0].
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // Column 1: a[1] - b[1] (applied with weight 2 below).
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // Column 2: a[2] - b[2].
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    // Sum with middle column counted twice, then absolute value.
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   4494 #endif  // HAS_SOBELYROW_SSE2
   4495 
   4496 #ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Processes 16 pixels per iteration: reads 16 bytes from each sobel plane
// and writes 64 bytes (16 ARGB pixels) to dst_argb.
void SobelRow_SSE2(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
  asm volatile (
    // Make %1 the byte offset from src_sobelx to src_sobely; (%0,%1,1)
    // then addresses the sobely row while only %0 advances.
    "sub       %0,%1                           \n"
    // xmm5 = 0xff000000 in each 32-bit lane: the alpha = 255 mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // sobel = sobelx + sobely, saturated to 255.
    "paddusb   %%xmm1,%%xmm0                   \n"
    // Replicate each sobel byte into B, G and R, then OR in the alpha mask.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   4547 #endif  // HAS_SOBELROW_SSE2
   4548 
   4549 #ifdef HAS_SOBELTOPLANEROW_SSE2
   4550 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
   4551 void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
   4552                           const uint8* src_sobely,
   4553                           uint8* dst_y,
   4554                           int width) {
   4555   asm volatile (
   4556     "sub       %0,%1                           \n"
   4557     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   4558     "pslld     $0x18,%%xmm5                    \n"
   4559 
   4560     // 8 pixel loop.
   4561     LABELALIGN
   4562     "1:                                        \n"
   4563     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   4564     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
   4565     "lea       " MEMLEA(0x10,0) ",%0           \n"
   4566     "paddusb   %%xmm1,%%xmm0                   \n"
   4567     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
   4568     "lea       " MEMLEA(0x10,2) ",%2           \n"
   4569     "sub       $0x10,%3                        \n"
   4570     "jg        1b                              \n"
   4571   : "+r"(src_sobelx),  // %0
   4572     "+r"(src_sobely),  // %1
   4573     "+r"(dst_y),       // %2
   4574     "+r"(width)        // %3
   4575   :
   4576   : "memory", "cc", NACL_R14
   4577     "xmm0", "xmm1"
   4578   );
   4579 }
   4580 #endif  // HAS_SOBELTOPLANEROW_SSE2
   4581 
   4582 #ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Processes 16 pixels per iteration: reads 16 bytes from each sobel plane
// and writes 64 bytes (16 ARGB pixels) to dst_argb.
void SobelXYRow_SSE2(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // Make %1 the byte offset from src_sobelx to src_sobely; (%0,%1,1)
    // then addresses the sobely row while only %0 advances.
    "sub       %0,%1                           \n"
    // xmm5 = all 0xff: the alpha = 255 bytes.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // xmm2 = sobel = sobelx + sobely, saturated.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"
    // Interleave sobelx with alpha (R,A pairs)...
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "punpckhbw %%xmm5,%%xmm0                   \n"
    // ...and sobely with sobel (B,G pairs)...
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    // ...then word-interleave into B,G,R,A per pixel.
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4632 #endif  // HAS_SOBELXYROW_SSE2
   4633 
   4634 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// cumsum/previous_cumsum hold 4 int32 values (one per ARGB channel) per
// pixel. xmm0 carries the running per-channel sum of the current row;
// cumsum[i] = row running sum + previous_cumsum[i].
void ComputeCumulativeSumRow_SSE2(const uint8* row,
                                  int32* cumsum,
                                  const int32* previous_cumsum,
                                  int width) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"  // running row sum = 0.
    "pxor      %%xmm1,%%xmm1                   \n"  // zero, for unpacking.
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    // Only take the 4-pixel loop when cumsum is 16-byte aligned.
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    LABELALIGN
    "40:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // 4 ARGB pixels.
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Widen 16 bytes to four vectors of 4 int32s (one vector per pixel).
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Per pixel: row sum += pixel; cumsum = row sum + previous_cumsum.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

    "49:                                       \n"
    // Handle the remaining 0..3 pixels one at a time.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    LABELALIGN
    "10:                                       \n"
    "movd      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

    "19:                                       \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   4714 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
   4715 
   4716 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Computes box-filtered averages from cumulative-sum tables: for each pixel,
// sum = topleft - topright - botleft + botright (4 int32 channels), then
// dst = sum * (1/area). width (in %4) is the table column offset used to
// reach the "right" columns. When area <= 128 a 16-bit fixed-point
// reciprocal with pmulhuw is used; larger areas take the float path.
void CumulativeSumToAverageRow_SSE2(const int32* topleft,
                                    const int32* botleft,
                                    int width,
                                    int area,
                                    uint8* dst,
                                    int count) {
  asm volatile (
    // xmm4 = broadcast approximate 1.0f / area (rcpss).
    "movd      %5,%%xmm5                       \n"
    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
    "rcpss     %%xmm5,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "cmpl      $0x80,%5                        \n"
    "ja        40f                             \n"

    // Small-area path: xmm5 = (area + 65535) * (1/area), roughly
    // 65536/area, packed to 16-bit words for pmulhuw.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrld     $0x10,%%xmm6                    \n"
    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
    "addps     %%xmm6,%%xmm5                   \n"
    "mulps     %%xmm4,%%xmm5                   \n"
    "cvtps2dq  %%xmm5,%%xmm5                   \n"
    "packssdw  %%xmm5,%%xmm5                   \n"

    // 4 pixel small loop.
    LABELALIGN
  "4:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    // Divide by area via the 16-bit reciprocal (pmulhuw keeps the high word).
    "pmulhuw   %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       4b                              \n"
    "jmp       49f                             \n"

  // 4 pixel loop (float path for large areas).
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    // Multiply each sum by ~1/area in float, then convert back.
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Handle the remaining 0..3 pixels one at a time.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

  // 1 pixel loop
    LABELALIGN
  "10:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"((intptr_t)(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   4848 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
   4849 
   4850 #ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv holds 4 floats: the starting (u,v) source coordinate followed by
// the per-pixel (du,dv) step. For each destination pixel the source byte
// offset is computed as x*4 + y*stride via pmaddwd with a packed
// (4, stride) multiplier, so coordinates must fit in 16-bit shorts.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb,
                        int src_argb_stride,
                        uint8* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = start (u,v).
    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = (du,dv).
    // %1 = stride << 16 | 4: pmaddwd multiplier giving x*4 + y*stride.
    // %1 is reused as an offset scratch register afterwards.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    // Set up coordinates for 4 pixels: xmm2 = uv for pixels 0-1,
    // xmm3 = uv for pixels 2-3, xmm4 = step for 4 pixels.
    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"

  // 4 pixel loop
    LABELALIGN
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    // Gather 4 source pixels at the computed offsets, two at a time.
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1                   \n"
    "addps     %%xmm4,%%xmm2                   \n"  // advance coords 0-1.
    "movq      %%xmm1," MEMACCESS(2) "         \n"
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0                   \n"
    "addps     %%xmm4,%%xmm3                   \n"  // advance coords 2-3.
    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Handle the remaining 0..3 pixels one at a time.
    "add       $0x3,%4                         \n"
    "jl        19f                             \n"

  // 1 pixel loop
    LABELALIGN
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"
    "movd      %%xmm0,%k1                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x04,2) ",%2           \n"
    "sub       $0x1,%4                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),    // %4
    "=&r"(temp)      // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4937 #endif  // HAS_ARGBAFFINEROW_SSE2
   4938 
   4939 #ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
// Blends src_ptr with src_ptr[src_stride] 16 bytes at a time, weighted by
// source_y_fraction/256. Fractions 0 (copy) and 128 (average) take fast
// paths.
void InterpolateRow_SSSE3(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    // Make %0 the offset from src to dst so one register indexes both.
    "sub       %1,%0                           \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"

    // xmm5 = interleaved byte pairs (256 - fraction, fraction), the
    // pmaddubsw weights.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x100,%3                       \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    // xmm4 = 0x80 bytes: pixels are biased to signed range for
    // pmaddubsw, and the bias is compensated before the final >> 8.
    "mov       $0x80808080,%%eax               \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // General purpose row blend.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklbw  %%xmm2,%%xmm0                  \n"
    "punpckhbw  %%xmm2,%%xmm1                  \n"
    "psubb      %%xmm4,%%xmm0                  \n"
    "psubb      %%xmm4,%%xmm1                  \n"
    "movdqa     %%xmm5,%%xmm2                  \n"
    "movdqa     %%xmm5,%%xmm3                  \n"
    "pmaddubsw  %%xmm0,%%xmm2                  \n"
    "pmaddubsw  %%xmm1,%%xmm3                  \n"
    "paddw      %%xmm4,%%xmm2                  \n"
    "paddw      %%xmm4,%%xmm3                  \n"
    "psrlw      $0x8,%%xmm2                    \n"
    "psrlw      $0x8,%%xmm3                    \n"
    "packuswb   %%xmm3,%%xmm2                  \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+rm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   5020 #endif  // HAS_INTERPOLATEROW_SSSE3
   5021 
   5022 #ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
// AVX2 version of InterpolateRow: blends src_ptr with src_ptr[src_stride]
// 32 bytes at a time, weighted by source_y_fraction/256. Fraction 0 copies
// via rep movsb (which is why %0/%1/%2 are pinned to rdi/rsi/rcx) and
// fraction 128 averages with vpavgb.
void InterpolateRow_AVX2(uint8* dst_ptr,
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile (
    // Copy path is checked before %0 is turned into an offset, since
    // rep movsb needs absolute source/destination pointers.
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "sub       %1,%0                           \n"
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"

    // ymm5 = interleaved byte pairs (256 - fraction, fraction), the
    // vpmaddubsw weights; ymm4 = 0x80 bias bytes (see SSSE3 version).
    "vmovd      %3,%%xmm0                      \n"
    "neg        %3                             \n"
    "add        $0x100,%3                      \n"
    "vmovd      %3,%%xmm5                      \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
    "vbroadcastss %%xmm5,%%ymm5                \n"
    "mov        $0x80808080,%%eax              \n"
    "vmovd      %%eax,%%xmm4                   \n"
    "vbroadcastss %%xmm4,%%ymm4                \n"

    // General purpose row blend.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    // Copies dst_width bytes; no ymm registers touched, so skip vzeroupper.
    "rep movsb " MEMMOVESTRING(1,0) "          \n"
    "jmp       999f                            \n"

  "99:                                         \n"
    "vzeroupper                                \n"
  "999:                                        \n"
  : "+D"(dst_ptr),    // %0
    "+S"(src_ptr),    // %1
    "+cm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
   5098 #endif  // HAS_INTERPOLATEROW_AVX2
   5099 
   5100 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 channels of each pixel per the 16-byte pshufb control
// mask in shuffler. Processes 8 pixels (32 bytes) per iteration.
void ARGBShuffleRow_SSSE3(const uint8* src_argb,
                          uint8* dst_argb,
                          const uint8* shuffler,
                          int width) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // xmm5 = shuffle mask.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
   5128 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
   5129 
   5130 #ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version of ARGBShuffleRow: the 16-byte shuffler mask is broadcast
// to both 128-bit lanes (vpshufb operates per-lane), so the same per-pixel
// byte reorder applies across all 16 pixels (64 bytes) handled per
// iteration.
void ARGBShuffleRow_AVX2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  asm volatile (
    // Replicate the shuffle mask into both halves of ymm5.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    // Avoid AVX->SSE transition penalties for subsequent SSE code.
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
   5159 #endif  // HAS_ARGBSHUFFLEROW_AVX2
   5160 
   5161 #ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback (no pshufb available): the first 4 bytes of 'shuffler'
// are tested against four common byte orders, each handled by a fast
// vector path built from punpck + pshufhw/pshuflw + packuswb (4 pixels
// per iteration). Any other shuffler falls back to a scalar
// byte-at-a-time loop (1 pixel per iteration).
void ARGBShuffleRow_SSE2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  uintptr_t pixel_temp;
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"
    // Load the first 4 shuffler bytes as a little-endian dword and
    // dispatch to a specialized loop if it matches a known pattern.
    "mov       " MEMACCESS(4) ",%k2            \n"
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    // Generic scalar path: look up each of the 4 destination bytes via
    // the shuffler index into the current source pixel.
    LABELALIGN
    "1:                                        \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Shuffle {3,2,1,0}: reverse the bytes of each pixel.
    // Bytes are widened to words so pshufhw/pshuflw ($0x1b = 3,2,1,0)
    // can reorder them, then repacked to bytes.
    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    // Shuffle {1,2,3,0}: rotate bytes left by one ($0x39 = 0,3,2,1 order).
    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    // Shuffle {3,0,1,2}: rotate bytes right by one ($0x93).
    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    // Shuffle {2,1,0,3}: swap first and third bytes ($0xc6).
    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "=&d"(pixel_temp),  // %2
    "+r"(width)         // %3
  : "r"(shuffler)       // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
   5281 #endif  // HAS_ARGBSHUFFLEROW_SSE2
   5282 
   5283 #ifdef HAS_I422TOYUY2ROW_SSE2
// Interleaves planar I422 (Y plane + half-width U and V planes) into
// packed YUY2 (Y0 U0 Y1 V0 ...). Processes 16 Y pixels (8 UV pairs,
// 32 output bytes) per iteration.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    // Keep src_v as an offset from src_u so one register advance covers both.
    "sub       %1,%2                             \n"
    LABELALIGN
    "1:                                        \n"
    // xmm2 = interleaved U0 V0 U1 V1 ...
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"
    // Interleave 16 Y bytes with the UV pairs: Y U Y V ...
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm1                     \n"
    "punpcklbw %%xmm2,%%xmm0                     \n"
    "punpckhbw %%xmm2,%%xmm1                     \n"
    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
   5317 #endif  // HAS_I422TOYUY2ROW_SSE2
   5318 
   5319 #ifdef HAS_I422TOUYVYROW_SSE2
// Interleaves planar I422 into packed UYVY (U0 Y0 V0 Y1 ...). Same
// structure as I422ToYUY2Row_SSE2 but with the UV bytes leading in each
// punpck, so chroma comes first. Processes 16 Y pixels per iteration.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    // Keep src_v as an offset from src_u so one register advance covers both.
    "sub        %1,%2                            \n"
    LABELALIGN
    "1:                                        \n"
    // xmm2 = interleaved U0 V0 U1 V1 ...
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"
    // Interleave UV pairs ahead of the 16 Y bytes: U Y V Y ...
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "movdqa    %%xmm2,%%xmm1                     \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "punpcklbw %%xmm0,%%xmm1                     \n"
    "punpckhbw %%xmm0,%%xmm2                     \n"
    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
   5353 #endif  // HAS_I422TOUYVYROW_SSE2
   5354 
   5355 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial to each channel of each ARGB pixel:
//   dst = C0 + C1*x + C2*x^2 + C3*x^3
// where 'poly' holds four 4-float coefficient vectors (C0 at +0x00,
// C1 at +0x10, C2 at +0x20, C3 at +0x30), one lane per B/G/R/A channel.
// Results are truncated and saturated back to bytes. 2 pixels per loop.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"

    // 2 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Widen 8 bytes (2 pixels) to 8 ints, then convert to floats:
    // xmm0 = pixel 0 channels, xmm4 = pixel 1 channels.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep x in xmm1/xmm5; accumulate C0 + C1*x in xmm0/xmm4.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // xmm2/xmm6 = x^2; xmm1/xmm5 = x^3.
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    // Add the C2*x^2 and C3*x^3 terms.
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    // Truncate to ints and pack back down to 8 bytes with saturation.
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x2,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   5410 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
   5411 
   5412 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 version of ARGBPolynomialRow: evaluates
//   dst = C0 + C1*x + C2*x^2 + C3*x^3
// per channel using fused multiply-adds. The four 4-float coefficient
// vectors in 'poly' are broadcast across both lanes so 2 pixels
// (8 channels) are processed per iteration.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    // ymm4..ymm7 = C0..C3, each replicated to both 128-bit lanes.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    // Truncate to ints, then narrow 8 ints -> 8 bytes with saturation
    // (vpermq undoes the lane interleave introduced by vpackusdw).
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "sub         $0x2,%2                       \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   5451 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
   5452 
   5453 #ifdef HAS_HALFFLOATROW_SSE2
// kScaleBias is 2^-112. Multiplying a float by it rebiases the exponent
// so that shifting the single-precision bit pattern right by 13
// (psrld $0xd below) yields the IEEE half-float encoding directly.
static float kScaleBias = 1.9259299444e-34f;
// Converts 16-bit integer samples to half floats, scaled by 'scale'.
// The scale is pre-multiplied by kScaleBias once on the C side; the
// loop then only needs a multiply and a shift per value. 8 values per
// iteration; dst is addressed relative to src (sub %0,%1).
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    // Splat the combined scale*bias factor to all 4 lanes of xmm4.
    "pshufd      $0x0,%3,%%xmm4                \n"
    "pxor        %%xmm5,%%xmm5                 \n"
    "sub         %0,%1                         \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu      " MEMACCESS(0) ",%%xmm2       \n"  // 8 shorts
    "add         $0x10,%0                      \n"
    "movdqa      %%xmm2,%%xmm3                 \n"
    "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/1
    "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
    "punpckhwd   %%xmm5,%%xmm3                 \n"
    "cvtdq2ps    %%xmm3,%%xmm3                 \n"
    "mulps       %%xmm4,%%xmm2                 \n"
    "mulps       %%xmm4,%%xmm3                 \n"
    // Shift the biased float bits down into half-float position.
    "psrld       $0xd,%%xmm2                   \n"
    "psrld       $0xd,%%xmm3                   \n"
    "packssdw    %%xmm3,%%xmm2                 \n"
    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
    "sub         $0x8,%2                       \n"
    "jg          1b                            \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   5487 #endif  // HAS_HALFFLOATROW_SSE2
   5488 
   5489 #ifdef HAS_HALFFLOATROW_AVX2
// AVX2 version of HalfFloatRow: same scale*kScaleBias + shift-by-13
// half-float encoding trick as the SSE2 version, but 16 values per
// iteration. vpunpck mutates lane order and vpackssdw restores it, so
// output order matches input. dst is addressed relative to src.
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss  %3, %%ymm4                  \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
    "sub        %0,%1                          \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
    "add        $0x20,%0                       \n"
    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
    // Shift biased float bits into half-float position.
    "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
    "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"

    "vzeroupper                                \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   5523 #endif  // HAS_HALFFLOATROW_AVX2
   5524 
   5525 #ifdef HAS_HALFFLOATROW_F16C
// F16C version of HalfFloatRow: uses the hardware vcvtps2ph instruction
// (rounding mode 3 = truncate) instead of the bit-shift trick, so only
// the raw 'scale' is applied — no kScaleBias. 16 values per iteration;
// dst is addressed relative to src.
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss  %3, %%ymm4                  \n"
    "sub        %0,%1                          \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
    "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
    "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
    // Hardware float -> half conversion, truncating.
    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add         $0x20,%0                      \n"
    "sub         $0x10,%2                      \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4"
  );
}
   5556 #endif  // HAS_HALFFLOATROW_F16C
   5557 
   5558 #ifdef HAS_HALFFLOATROW_F16C
// Scale-of-1 specialization of HalfFloatRow_F16C: the unused float
// parameter keeps the signature compatible; the multiply is skipped
// entirely. 16 values per iteration; dst is addressed relative to src.
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    "sub        %0,%1                          \n"
    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
    // Hardware float -> half conversion, truncating.
    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add         $0x20,%0                      \n"
    "sub         $0x10,%2                      \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc",
    "xmm2", "xmm3"
  );
}
   5585 #endif  // HAS_HALFFLOATROW_F16C
   5586 
   5587 #ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
// In-place remap: each of the 4 bytes of every pixel is replaced by a
// lookup into table_argb, which is laid out as 4-byte entries so the
// B/G/R/A channels use byte offsets 0/1/2/3 within each entry.
// Scalar, 1 pixel per iteration.
void ARGBColorTableRow_X86(uint8* dst_argb,
                           const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Advance dst first, then read/write the current pixel at -0x4..-0x1.
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
   5618 #endif  // HAS_ARGBCOLORTABLEROW_X86
   5619 
   5620 #ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
// Same in-place table lookup as ARGBColorTableRow_X86, but only the
// B/G/R bytes of each 4-byte pixel are remapped; the alpha byte is left
// untouched. Scalar, 1 pixel per iteration.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Advance dst first, then read/write the current pixel at -0x4..-0x2.
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
   5646 #endif  // HAS_RGBCOLORTABLEROW_X86
   5647 
   5648 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
// For each pixel, a luma value is computed from its RGB channels via the
// pmaddubsw dot product with 'lumacoeff', masked to select a row of the
// 'luma' table; the pixel's B/G/R bytes are then remapped through that
// row (alpha is copied unchanged). xmm0 holds the four per-pixel table
// offsets, rotated out one at a time with pshufd. 4 pixels per loop.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_argb,
                                 int width,
                                 const uint8* luma,
                                 uint32 lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile (
    // xmm3 = lumacoeff splatted; xmm4 = 0xff00 word mask; xmm5 = zero.
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0x8,%%xmm4                     \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Compute the 4 masked luma offsets for these 4 pixels into xmm0.
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "phaddw    %%xmm0,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    // Pixel 0: table_temp = luma + offset; remap B/G/R, copy A.
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    // Pixel 1.
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    // Pixel 2.
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    // Pixel 3 (no pshufd needed after the last extract).
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
   5748 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
   5749 
   5750 #endif  // defined(__x86_64__) || defined(__i386__)
   5751 
   5752 #ifdef __cplusplus
   5753 }  // extern "C"
   5754 }  // namespace libyuv
   5755 #endif
   5756