// (removed code-search site navigation residue: "Home | History | Annotate | Download | only in source")
      1 // VERSION 2
      2 /*
      3  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      4  *
      5  *  Use of this source code is governed by a BSD-style license
      6  *  that can be found in the LICENSE file in the root of the source
      7  *  tree. An additional intellectual property rights grant can be found
      8  *  in the file PATENTS. All contributing project authors may
      9  *  be found in the AUTHORS file in the root of the source tree.
     10  */
     11 
     12 #include "libyuv/row.h"
     13 
     14 #ifdef __cplusplus
     15 namespace libyuv {
     16 extern "C" {
     17 #endif
     18 
     19 // This module is for GCC x86 and x64.
     20 #if !defined(LIBYUV_DISABLE_X86) && \
     21     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
     22 
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// Per-channel luma weights in memory byte order {B, G, R, A}, repeated for
// 4 pixels.  13 + 65 + 33 = 111 (~7-bit fixed point); these look like
// studio-range BT.601 Y weights — NOTE(review): confirm against the row
// functions that load this table.
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
// 15 + 75 + 38 = 128: full-range (JPEG) luma weights, same layout as kARGBToY.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
     35 
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// U coefficients for ARGB byte order {B, G, R, A}, 4 pixels per vector.
// Magnitudes (112, -74, -38) suggest studio-range chroma — NOTE(review):
// verify against the consuming row functions.
static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Full-range (JPEG) U coefficients for ARGB.
static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

// V coefficients for ARGB.
static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Full-range (JPEG) V coefficients for ARGB.
static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA
// Same weights as the ARGB tables above, permuted for {A, R, G, B} memory
// byte order.
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
// Weights permuted for {R, G, B, A} memory byte order.
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
// Weights permuted for {A, B, G, R} memory byte order.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

// Additive bias of 16 per byte, consistent with studio-range Y output.
static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
// Per-word rounding constant (64 = 0.5 in 7-bit fixed point) for the
// full-range (J) luma path.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// Bias of 128 per byte to recenter signed U/V into the 0..255 range.
static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// 16-bit 0x8080 per lane: looks like a combined 128 bias + 0.5 rounding for
// the full-range (J) chroma path — NOTE(review): confirm in the consumer.
static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
    111 
#ifdef HAS_RGB24TOARGBROW_SSSE3

// pshufb masks.  An index with the high bit set (128) makes pshufb write a
// zero byte; other indices select the source byte at that position.

// Shuffle table for converting RGB24 to ARGB.
// Spreads 12 packed RGB bytes into 4 ARGB pixels; positions 12-15 pick
// don't-care bytes for the alpha slots (alpha is OR-ed in separately).
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
// Same as above but swaps R and B within each pixel.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24.
// Drops every 4th (alpha) byte; last 4 output bytes are zeroed.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
// Drops alpha and swaps R/B within each pixel.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// YUY2 shuf 16 Y to 32 Y.
// Duplicates each even byte (Y) of a YUY2 pair; 32-byte table for AVX2.
static const lvec8 kShuffleYUY2Y = {
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};

// YUY2 shuf 8 UV to 16 UV.
// Duplicates each odd-byte U/V pair of YUY2.
static const lvec8 kShuffleYUY2UV = {
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};

// UYVY shuf 16 Y to 32 Y.
// Same as YUY2 but Y occupies the odd bytes.
static const lvec8 kShuffleUYVYY = {
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};

// NV21 shuf 8 VU to 16 UV.
// Swaps each V,U byte pair into U,V while duplicating it.
static const lvec8 kShuffleNV21 = {
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
    187 
#ifdef HAS_J400TOARGBROW_SSE2
// Expand a row of 8-bit grey (J400) to ARGB: each Y byte is replicated into
// B, G and R, and alpha is forced to 0xff.  Handles 8 pixels per iteration;
// assumes width is a positive multiple of 8 — TODO confirm caller contract.
//   src_y:    input grey bytes           (%0)
//   dst_argb: output ARGB, 4 bytes/pixel (%1)
//   width:    pixel count                (%2)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff000000 alpha mask
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // each Y doubled: YYYY...
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // low 4 pixels: Y in B,G,R,A
    "punpckhwd %%xmm1,%%xmm1                   \n"  // high 4 pixels
    "por       %%xmm5,%%xmm0                   \n"  // force alpha = 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_J400TOARGBROW_SSE2
    215 
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert 16 pixels of packed 24-bit RGB24 (48 bytes) to ARGB (64 bytes)
// per loop iteration.  Loads three 16-byte chunks, realigns pixel groups
// with palignr, expands to 4-byte pixels via kShuffleMaskRGB24ToARGB and
// ORs in opaque alpha.  Assumes width is a positive multiple of 16 —
// TODO confirm caller contract.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-4 (+ 1 byte)
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // xmm3 = pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    254 
// Convert 16 pixels of packed 24-bit RAW (B and R swapped vs RGB24) to ARGB.
// Identical structure to RGB24ToARGBRow_SSSE3; only the shuffle table
// (kShuffleMaskRAWToARGB) differs, performing the R/B swap during expansion.
// Assumes width is a positive multiple of 16 — TODO confirm caller contract.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = R/B-swapping shuffle
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // realign pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // realign pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // realign pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    292 
// Convert 8 pixels of RAW (24 bytes) to RGB24 (24 bytes) per iteration by
// swapping R and B.  Uses three overlapping 16-byte loads at offsets 0, 4
// and 8 so each pshufb can produce 8 swapped output bytes in its low half.
// Assumes width is a positive multiple of 8 and that reading up to 24 bytes
// past %0 per iteration is safe — TODO confirm caller/buffer contract.
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
   "movdqa     %3,%%xmm3                       \n"  // first-8 shuffle
   "movdqa     %4,%%xmm4                       \n"  // middle-8 shuffle
   "movdqa     %5,%%xmm5                       \n"  // last-8 shuffle
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
    "lea       " MEMLEA(0x18,0) ",%0           \n"  // advance 24 source bytes
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 3 x 8-byte stores = 24 bytes
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_raw),    // %0
    "+r"(dst_rgb24),  // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
    "m"(kShuffleMaskRAWToRGB24_1),  // %4
    "m"(kShuffleMaskRAWToRGB24_2)   // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    322 
// Convert 8 RGB565 pixels (16 bytes) to ARGB (32 bytes) per iteration.
// Field widening uses the pmulhuw trick: multiplying a left-aligned 5-bit
// field by 0x0108 (or a 6-bit field by 0x2080 for green) replicates the top
// bits into the low bits, approximating (x << 3) | (x >> 2) expansion —
// NOTE(review): exact rounding behavior inferred from the constants; verify.
// dst is pre-biased (dst -= 2 * src) so stores can use (%1,%0,2) addressing
// and only the source pointer needs advancing.  Assumes width is a positive
// multiple of 8 — TODO confirm caller contract.
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // 5-bit expansion multiplier
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"  // 6-bit (green) multiplier
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800 red mask
    "psllw     $0xb,%%xmm3                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x07e0 green mask
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 alpha bytes
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // dst -= 2*src so stores can
    "sub       %0,%1                           \n"  // address via (%1,%0,2)
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"  // isolate red
    "psllw     $0xb,%%xmm2                     \n"  // left-align blue
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // words = G?|.. -> B|R pairs
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "por       %%xmm7,%%xmm0                   \n"  // combine with alpha = 0xff
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to BGRA pixels
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
    370 
// Convert 8 ARGB1555 pixels (16 bytes) to ARGB8888 (32 bytes) per iteration.
// 5-bit color fields are widened with the pmulhuw-by-0x0108 trick (see
// RGB565ToARGBRow_SSE2); the 1-bit alpha is sign-extended to 0x00/0xff via
// psraw of the top bit.  dst is pre-biased (dst -= 2*src) so stores use
// (%1,%0,2) addressing.  Assumes width is a positive multiple of 8 —
// TODO confirm caller contract.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // 5-bit expansion multiplier
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"  // green-field multiplier
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800
    "psllw     $0xb,%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm4                   \n"  // xmm4 = 0x03e0 green mask
    "psrlw     $0x6,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 alpha bytes
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // dst -= 2*src for
    "sub       %0,%1                           \n"  // (%1,%0,2) addressing
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"  // align red like RGB565
    "psllw     $0xb,%%xmm2                     \n"  // left-align blue
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "psraw     $0x8,%%xmm2                     \n"  // replicate alpha bit
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green
    "pand      %%xmm7,%%xmm2                   \n"  // alpha -> 0x00 or 0xff
    "por       %%xmm2,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to BGRA pixels
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
    421 
// Convert 8 ARGB4444 pixels (16 bytes) to ARGB8888 (32 bytes) per iteration.
// Each 4-bit field is widened to 8 bits by replicating the nibble
// (x | (x << 4) for low nibbles, x | (x >> 4) for high nibbles).  dst is
// pre-biased (dst -= 2*src) so stores use (%1,%0,2) addressing.  Assumes
// width is a positive multiple of 8 — TODO confirm caller contract.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"  // xmm4 = low-nibble mask
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // xmm5 = high-nibble mask
    "pslld     $0x4,%%xmm5                     \n"
    "sub       %0,%1                           \n"  // dst -= 2*src for
    "sub       %0,%1                           \n"  // (%1,%0,2) addressing
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // low nibbles (B, R)
    "pand      %%xmm5,%%xmm2                   \n"  // high nibbles (G, A)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // replicate low nibble up
    "por       %%xmm3,%%xmm2                   \n"  // replicate high nibble down
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave to full pixels
    "punpckhbw %%xmm2,%%xmm1                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    459 
// Convert 16 ARGB pixels (64 bytes) to packed RGB24 (48 bytes) per loop.
// Each 16-byte register is pshufb'd down to 12 useful bytes (alpha dropped,
// top 4 bytes zeroed), then the four 12-byte groups are merged into three
// 16-byte stores with pslldq/psrldq/por.  Assumes width is a positive
// multiple of 16 — TODO confirm caller contract.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // alpha-dropping shuffle
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 RGB bytes + 4 zeros each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"  // low 4 bytes of group 1
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"  // store 0 = group0 + 4 bytes
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"  // store 1 = rest1 + 8 bytes
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"  // store 2 = rest2 + group3
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
    497 
// Convert 16 ARGB pixels (64 bytes) to packed RAW (48 bytes, R/B swapped
// RGB24) per loop.  Identical structure to ARGBToRGB24Row_SSSE3; only the
// shuffle table (kShuffleMaskARGBToRAW) differs.  Assumes width is a
// positive multiple of 16 — TODO confirm caller contract.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // alpha-drop + R/B swap shuffle
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 bytes + 4 zeros each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"  // pack 4 groups of 12 into
    "pslldq    $0x8,%%xmm5                     \n"  // 3 stores of 16
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
    535 
// Convert 4 ARGB pixels (16 bytes) to RGB565 (8 bytes) per loop.  Each
// 32-bit pixel is shifted/masked into the 5-6-5 field positions, OR-ed
// together, then packssdw compresses the dwords to words (field layout
// keeps values within signed range).  Assumes width is a positive multiple
// of 4 — TODO confirm caller contract.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0x0000001f blue mask
    "psrld     $0x1b,%%xmm3                    \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x000007e0 green mask
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xfffff800 red mask
    "pslld     $0xb,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"  // position red
    "psrld     $0x3,%%xmm1                     \n"  // position blue
    "psrld     $0x5,%%xmm2                     \n"  // position green
    "psrad     $0x10,%%xmm0                    \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"  // combine fields
    "por       %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> 16-bit pixels
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    571 
// Convert 4 ARGB pixels to RGB565 per loop, adding a 2x2 ordered dither
// before truncation.  dither4 packs 4 dither bytes; the preamble expands
// each byte to all 4 channels of its pixel (via punpcklbw/punpcklwd), and
// paddusb adds them with unsigned saturation so values clamp at 255.
// The 565 packing afterwards is identical to ARGBToRGB565Row_SSE2.
// Assumes width is a positive multiple of 4 — TODO confirm caller contract.
void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
                                const uint32 dither4, int width) {
  asm volatile (
    "movd       %3,%%xmm6                      \n"  // load 4 dither bytes
    "punpcklbw  %%xmm6,%%xmm6                  \n"  // duplicate each byte
    "movdqa     %%xmm6,%%xmm7                  \n"
    "punpcklwd  %%xmm6,%%xmm6                  \n"  // 4 copies per pixel
    "punpckhwd  %%xmm7,%%xmm7                  \n"  // (xmm7 unused below)
    "pcmpeqb    %%xmm3,%%xmm3                  \n"  // 0x0000001f blue mask
    "psrld      $0x1b,%%xmm3                   \n"
    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // 0x000007e0 green mask
    "psrld      $0x1a,%%xmm4                   \n"
    "pslld      $0x5,%%xmm4                    \n"
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // 0xfffff800 red mask
    "pslld      $0xb,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "paddusb    %%xmm6,%%xmm0                  \n"  // add dither, saturating
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "pslld      $0x8,%%xmm0                    \n"  // position red
    "psrld      $0x3,%%xmm1                    \n"  // position blue
    "psrld      $0x5,%%xmm2                    \n"  // position green
    "psrad      $0x10,%%xmm0                   \n"
    "pand       %%xmm3,%%xmm1                  \n"
    "pand       %%xmm4,%%xmm2                  \n"
    "pand       %%xmm5,%%xmm0                  \n"
    "por        %%xmm2,%%xmm1                  \n"
    "por        %%xmm1,%%xmm0                  \n"
    "packssdw   %%xmm0,%%xmm0                  \n"  // dwords -> 16-bit pixels
    "lea        0x10(%0),%0                    \n"
    "movq       %%xmm0,(%1)                    \n"
    "lea        0x8(%1),%1                     \n"
    "sub        $0x4,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(dither4) // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
    617 
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 variant of ARGBToRGB565DitherRow: 8 pixels per loop.  dither4 is
// broadcast and expanded so each pixel's 4 channels receive its dither
// byte; vpaddusb adds with unsigned saturation.  Uses vpackusdw (unsigned
// saturation) instead of SSE2's packssdw, then vpermq to fix the lane
// order before the 16-byte store.  Assumes width is a positive multiple
// of 8 — TODO confirm caller contract.
// NOTE(review): clobber list names xmm7 although the body never touches
// register 7 — harmless over-declaration.
void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
                                const uint32 dither4, int width) {
  asm volatile (
    "vbroadcastss %3,%%xmm6                    \n"  // dither4 in every dword
    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"  // duplicate each byte
    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"  // 4 copies per pixel
    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"  // 0x0000001f blue mask
    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // 0x000007e0 green mask
    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
    "vpslld     $0x5,%%ymm4,%%ymm4             \n"
    "vpslld     $0xb,%%ymm3,%%ymm5             \n"  // 0xfffff800 red mask

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    (%0),%%ymm0                    \n"
    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"  // add dither, saturating
    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"  // position green
    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"  // position blue
    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"  // position red
    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"  // dwords -> 16-bit pixels
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // fix cross-lane order
    "lea        0x20(%0),%0                    \n"
    "vmovdqu    %%xmm0,(%1)                    \n"
    "lea        0x10(%1),%1                    \n"
    "sub        $0x8,%2                        \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"  // avoid AVX->SSE penalty
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(dither4) // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
    662 
    663 
// Convert 4 ARGB pixels (16 bytes) to ARGB1555 (8 bytes) per loop.  Each
// channel is shifted into its 5-bit field (alpha into bit 15), masked,
// OR-ed together and compressed to words with packssdw.  Assumes width is
// a positive multiple of 4 — TODO confirm caller contract.
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x0000001f blue mask
    "psrld     $0x1b,%%xmm4                    \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // xmm5 = 0x000003e0 green mask
    "pslld     $0x5,%%xmm5                     \n"
    "movdqa    %%xmm4,%%xmm6                   \n"  // xmm6 = 0x00007c00 red mask
    "pslld     $0xa,%%xmm6                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xffff8000 alpha mask
    "pslld     $0xf,%%xmm7                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "psrad     $0x10,%%xmm0                    \n"  // position alpha
    "psrld     $0x3,%%xmm1                     \n"  // position blue
    "psrld     $0x6,%%xmm2                     \n"  // position green
    "psrld     $0x9,%%xmm3                     \n"  // position red
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"  // combine all four fields
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> 16-bit pixels
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
    704 
// Convert 4 ARGB pixels (16 bytes) to 4 ARGB4444 pixels (8 bytes).
// Keeps the top 4 bits of each 8-bit channel.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = all ones
    "psllw     $0xc,%%xmm4                     \n"  // 0xf000 per word: high-byte nibble
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"  // 0x00f0 per word: low-byte nibble
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // top nibble of even bytes
    "pand      %%xmm4,%%xmm1                   \n"  // top nibble of odd bytes
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // merge nibble pairs per word
    "packuswb  %%xmm0,%%xmm0                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 ARGB4444 pixels
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
    732 #endif  // HAS_RGB24TOARGBROW_SSSE3
    733 
    734 #ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Y = (13*B + 65*G + 33*R) >> 7 + kAddY16 bias, using kARGBToY (see top of
// file; coefficients are in B,G,R,A memory order) via pmaddubsw.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY coefficients
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // per word: B*13+G*65, R*33+A*0
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum word pairs per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // >> 7 (coeffs scaled by 128)
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16 (declared elsewhere)
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    769 #endif  // HAS_ARGBTOYROW_SSSE3
    770 
    771 #ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// YJ = (15*B + 75*G + 38*R + kAddYJ64) >> 7, JPeg full range (kARGBToYJ).
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ coefficients
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64 rounding term
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // per word: B*15+G*75, R*38+A*0
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum word pairs per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before shifting
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // >> 7 (coeffs scaled by 128)
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
    808 #endif  // HAS_ARGBTOYJROW_SSSE3
    809 
    810 #ifdef HAS_ARGBTOYROW_AVX2
// vpermd dword permutation that undoes the per-128-bit-lane interleave
// introduced by vphaddw + vpackuswb on AVX2 (see "unmutate" uses below),
// restoring linear output byte order.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};
    815 
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3: same kARGBToY weighting and kAddY16
// bias, with vpermd to undo per-lane mutation of vphaddw/vpackuswb.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToY in both lanes
    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddY16 in both lanes
    "vmovdqu    %5,%%ymm6                      \n"  // ymm6 = kPermdARGBToY_AVX
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 32 ARGB pixels
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // >> 7 (coeffs scaled by 128)
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y values
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
    854 #endif  // HAS_ARGBTOYROW_AVX2
    855 
    856 #ifdef HAS_ARGBTOYJROW_AVX2
    857 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
    858 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
    859   asm volatile (
    860     "vbroadcastf128 %3,%%ymm4                  \n"
    861     "vbroadcastf128 %4,%%ymm5                  \n"
    862     "vmovdqu    %5,%%ymm6                      \n"
    863     LABELALIGN
    864   "1:                                          \n"
    865     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    866     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    867     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    868     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    869     "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    870     "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    871     "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    872     "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    873     "lea       " MEMLEA(0x80,0) ",%0           \n"
    874     "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    875     "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    876     "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
    877     "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
    878     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    879     "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    880     "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    881     "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    882     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    883     "lea       " MEMLEA(0x20,1) ",%1           \n"
    884     "sub       $0x20,%2                        \n"
    885     "jg        1b                              \n"
    886     "vzeroupper                                \n"
    887   : "+r"(src_argb),  // %0
    888     "+r"(dst_y),     // %1
    889     "+r"(width)        // %2
    890   : "m"(kARGBToYJ),   // %3
    891     "m"(kAddYJ64),    // %4
    892     "m"(kPermdARGBToY_AVX)  // %5
    893   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
    894   );
    895 }
    896 #endif  // HAS_ARGBTOYJROW_AVX2
    897 
    898 #ifdef HAS_ARGBTOUVROW_SSSE3
    899 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    900                        uint8* dst_u, uint8* dst_v, int width) {
    901   asm volatile (
    902     "movdqa    %5,%%xmm3                       \n"
    903     "movdqa    %6,%%xmm4                       \n"
    904     "movdqa    %7,%%xmm5                       \n"
    905     "sub       %1,%2                           \n"
    906     LABELALIGN
    907   "1:                                          \n"
    908     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    909     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    910     "pavgb     %%xmm7,%%xmm0                   \n"
    911     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    912     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    913     "pavgb     %%xmm7,%%xmm1                   \n"
    914     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    915     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    916     "pavgb     %%xmm7,%%xmm2                   \n"
    917     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    918     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    919     "pavgb     %%xmm7,%%xmm6                   \n"
    920 
    921     "lea       " MEMLEA(0x40,0) ",%0           \n"
    922     "movdqa    %%xmm0,%%xmm7                   \n"
    923     "shufps    $0x88,%%xmm1,%%xmm0             \n"
    924     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    925     "pavgb     %%xmm7,%%xmm0                   \n"
    926     "movdqa    %%xmm2,%%xmm7                   \n"
    927     "shufps    $0x88,%%xmm6,%%xmm2             \n"
    928     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    929     "pavgb     %%xmm7,%%xmm2                   \n"
    930     "movdqa    %%xmm0,%%xmm1                   \n"
    931     "movdqa    %%xmm2,%%xmm6                   \n"
    932     "pmaddubsw %%xmm4,%%xmm0                   \n"
    933     "pmaddubsw %%xmm4,%%xmm2                   \n"
    934     "pmaddubsw %%xmm3,%%xmm1                   \n"
    935     "pmaddubsw %%xmm3,%%xmm6                   \n"
    936     "phaddw    %%xmm2,%%xmm0                   \n"
    937     "phaddw    %%xmm6,%%xmm1                   \n"
    938     "psraw     $0x8,%%xmm0                     \n"
    939     "psraw     $0x8,%%xmm1                     \n"
    940     "packsswb  %%xmm1,%%xmm0                   \n"
    941     "paddb     %%xmm5,%%xmm0                   \n"
    942     "movlps    %%xmm0," MEMACCESS(1) "         \n"
    943     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    944     "lea       " MEMLEA(0x8,1) ",%1            \n"
    945     "sub       $0x10,%3                        \n"
    946     "jg        1b                              \n"
    947   : "+r"(src_argb0),       // %0
    948     "+r"(dst_u),           // %1
    949     "+r"(dst_v),           // %2
    950     "+rm"(width)           // %3
    951   : "r"((intptr_t)(src_stride_argb)), // %4
    952     "m"(kARGBToV),  // %5
    953     "m"(kARGBToU),  // %6
    954     "m"(kAddUV128)  // %7
    955   : "memory", "cc", NACL_R14
    956     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
    957   );
    958 }
    959 #endif  // HAS_ARGBTOUVROW_SSSE3
    960 
    961 #ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb control for vphaddw + vpackuswb results packed to shorts:
// reorders the word pairs within each 128-bit lane back to linear order
// (used after vpermq in the AVX2 UV row functions below).
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// Convert 32 ARGB pixels from each of two rows (2x2 subsampled) to
// 16 U and 16 V values, using kARGBToU / kARGBToV weights and the
// kAddUV128 unsigned bias.
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUV128 bias
    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToV
    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToU
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset
    LABELALIGN
  "1:                                          \n"
    // Load 32 pixels from this row and the next row; average vertically.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    // Average horizontal pixel pairs.
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    // Weighted sums: ymm1/ymm3 accumulate U, ymm0/ymm2 accumulate V.
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"  // >> 8 (coeffs scaled by 256)
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"  // pack U (low) and V (high)
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // undo phaddw/pack interleave
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // bias to unsigned range

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // store 16 U
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),   // %6
    "m"(kARGBToU),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   1024 #endif  // HAS_ARGBTOUVROW_AVX2
   1025 
   1026 #ifdef HAS_ARGBTOUVJROW_AVX2
// Convert 32 ARGB pixels from each of two rows (2x2 subsampled) to
// 16 U and 16 V values. JPeg full-range variant: kARGBToUJ / kARGBToVJ
// weights, rounds with kAddUVJ128 before the shift, no vpaddb bias after.
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUVJ128 rounding
    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToVJ
    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToUJ
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u offset
    LABELALIGN
  "1:                                          \n"
    // Load 32 pixels from this row and the next row; average vertically.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    // Average horizontal pixel pairs.
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    // Weighted sums: ymm1/ymm3 accumulate U, ymm0/ymm2 accumulate V.
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // round before >>8
    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"  // pack U (low) and V (high)
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // undo phaddw/pack interleave

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // store 16 U
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUVJ128),  // %5
    "m"(kARGBToVJ),  // %6
    "m"(kARGBToUJ),  // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   1085 #endif  // HAS_ARGBTOUVJROW_AVX2
   1086 
   1087 #ifdef HAS_ARGBTOUVJROW_SSSE3
   1088 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1089                         uint8* dst_u, uint8* dst_v, int width) {
   1090   asm volatile (
   1091     "movdqa    %5,%%xmm3                       \n"
   1092     "movdqa    %6,%%xmm4                       \n"
   1093     "movdqa    %7,%%xmm5                       \n"
   1094     "sub       %1,%2                           \n"
   1095     LABELALIGN
   1096   "1:                                          \n"
   1097     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1098     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1099     "pavgb     %%xmm7,%%xmm0                   \n"
   1100     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1101     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1102     "pavgb     %%xmm7,%%xmm1                   \n"
   1103     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1104     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1105     "pavgb     %%xmm7,%%xmm2                   \n"
   1106     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1107     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1108     "pavgb     %%xmm7,%%xmm6                   \n"
   1109 
   1110     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1111     "movdqa    %%xmm0,%%xmm7                   \n"
   1112     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1113     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1114     "pavgb     %%xmm7,%%xmm0                   \n"
   1115     "movdqa    %%xmm2,%%xmm7                   \n"
   1116     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1117     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1118     "pavgb     %%xmm7,%%xmm2                   \n"
   1119     "movdqa    %%xmm0,%%xmm1                   \n"
   1120     "movdqa    %%xmm2,%%xmm6                   \n"
   1121     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1122     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1123     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1124     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1125     "phaddw    %%xmm2,%%xmm0                   \n"
   1126     "phaddw    %%xmm6,%%xmm1                   \n"
   1127     "paddw     %%xmm5,%%xmm0                   \n"
   1128     "paddw     %%xmm5,%%xmm1                   \n"
   1129     "psraw     $0x8,%%xmm0                     \n"
   1130     "psraw     $0x8,%%xmm1                     \n"
   1131     "packsswb  %%xmm1,%%xmm0                   \n"
   1132     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1133     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1134     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1135     "sub       $0x10,%3                        \n"
   1136     "jg        1b                              \n"
   1137   : "+r"(src_argb0),       // %0
   1138     "+r"(dst_u),           // %1
   1139     "+r"(dst_v),           // %2
   1140     "+rm"(width)           // %3
   1141   : "r"((intptr_t)(src_stride_argb)), // %4
   1142     "m"(kARGBToVJ),  // %5
   1143     "m"(kARGBToUJ),  // %6
   1144     "m"(kAddUVJ128)  // %7
   1145   : "memory", "cc", NACL_R14
   1146     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1147   );
   1148 }
   1149 #endif  // HAS_ARGBTOUVJROW_SSSE3
   1150 
   1151 #ifdef HAS_ARGBTOUV444ROW_SSSE3
   1152 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
   1153                           int width) {
   1154   asm volatile (
   1155     "movdqa    %4,%%xmm3                       \n"
   1156     "movdqa    %5,%%xmm4                       \n"
   1157     "movdqa    %6,%%xmm5                       \n"
   1158     "sub       %1,%2                           \n"
   1159     LABELALIGN
   1160   "1:                                          \n"
   1161     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1162     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1163     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1164     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1165     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1166     "pmaddubsw %%xmm4,%%xmm1                   \n"
   1167     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1168     "pmaddubsw %%xmm4,%%xmm6                   \n"
   1169     "phaddw    %%xmm1,%%xmm0                   \n"
   1170     "phaddw    %%xmm6,%%xmm2                   \n"
   1171     "psraw     $0x8,%%xmm0                     \n"
   1172     "psraw     $0x8,%%xmm2                     \n"
   1173     "packsswb  %%xmm2,%%xmm0                   \n"
   1174     "paddb     %%xmm5,%%xmm0                   \n"
   1175     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   1176     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1177     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1178     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1179     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1180     "pmaddubsw %%xmm3,%%xmm0                   \n"
   1181     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1182     "pmaddubsw %%xmm3,%%xmm2                   \n"
   1183     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1184     "phaddw    %%xmm1,%%xmm0                   \n"
   1185     "phaddw    %%xmm6,%%xmm2                   \n"
   1186     "psraw     $0x8,%%xmm0                     \n"
   1187     "psraw     $0x8,%%xmm2                     \n"
   1188     "packsswb  %%xmm2,%%xmm0                   \n"
   1189     "paddb     %%xmm5,%%xmm0                   \n"
   1190     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1191     MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
   1192     "lea       " MEMLEA(0x10,1) ",%1           \n"
   1193     "sub       $0x10,%3                        \n"
   1194     "jg        1b                              \n"
   1195   : "+r"(src_argb),        // %0
   1196     "+r"(dst_u),           // %1
   1197     "+r"(dst_v),           // %2
   1198     "+rm"(width)           // %3
   1199   : "m"(kARGBToV),  // %4
   1200     "m"(kARGBToU),  // %5
   1201     "m"(kAddUV128)  // %6
   1202   : "memory", "cc", NACL_R14
   1203     "xmm0", "xmm1", "xmm2", "xmm6"
   1204   );
   1205 }
   1206 #endif  // HAS_ARGBTOUV444ROW_SSSE3
   1207 
// Convert 16 BGRA pixels (64 bytes) to 16 Y values.
// Same structure as ARGBToYRow_SSSE3 but uses kBGRAToY coefficients
// (declared elsewhere) to match the BGRA byte order.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kBGRAToY coefficients
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 BGRA pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum word pairs per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // >> 7 (coeffs scaled by 128)
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1241 
   1242 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
   1243                        uint8* dst_u, uint8* dst_v, int width) {
   1244   asm volatile (
   1245     "movdqa    %5,%%xmm3                       \n"
   1246     "movdqa    %6,%%xmm4                       \n"
   1247     "movdqa    %7,%%xmm5                       \n"
   1248     "sub       %1,%2                           \n"
   1249     LABELALIGN
   1250   "1:                                          \n"
   1251     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1252     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1253     "pavgb     %%xmm7,%%xmm0                   \n"
   1254     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1255     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1256     "pavgb     %%xmm7,%%xmm1                   \n"
   1257     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1258     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1259     "pavgb     %%xmm7,%%xmm2                   \n"
   1260     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1261     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1262     "pavgb     %%xmm7,%%xmm6                   \n"
   1263 
   1264     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1265     "movdqa    %%xmm0,%%xmm7                   \n"
   1266     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1267     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1268     "pavgb     %%xmm7,%%xmm0                   \n"
   1269     "movdqa    %%xmm2,%%xmm7                   \n"
   1270     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1271     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1272     "pavgb     %%xmm7,%%xmm2                   \n"
   1273     "movdqa    %%xmm0,%%xmm1                   \n"
   1274     "movdqa    %%xmm2,%%xmm6                   \n"
   1275     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1276     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1277     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1278     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1279     "phaddw    %%xmm2,%%xmm0                   \n"
   1280     "phaddw    %%xmm6,%%xmm1                   \n"
   1281     "psraw     $0x8,%%xmm0                     \n"
   1282     "psraw     $0x8,%%xmm1                     \n"
   1283     "packsswb  %%xmm1,%%xmm0                   \n"
   1284     "paddb     %%xmm5,%%xmm0                   \n"
   1285     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1286     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1287     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1288     "sub       $0x10,%3                        \n"
   1289     "jg        1b                              \n"
   1290   : "+r"(src_bgra0),       // %0
   1291     "+r"(dst_u),           // %1
   1292     "+r"(dst_v),           // %2
   1293     "+rm"(width)           // %3
   1294   : "r"((intptr_t)(src_stride_bgra)), // %4
   1295     "m"(kBGRAToV),  // %5
   1296     "m"(kBGRAToU),  // %6
   1297     "m"(kAddUV128)  // %7
   1298   : "memory", "cc", NACL_R14
   1299     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1300   );
   1301 }
   1302 
// Convert ABGR pixels to Y (luma) bytes using the kABGRToY coefficients.
// Processes 16 pixels (64 bytes in, 16 bytes out) per loop iteration.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16 bias vector
    "movdqa    %3,%%xmm4                       \n"  // kABGRToY coefficients
    LABELALIGN
  "1:                                          \n"
    // Load 16 ABGR pixels into four registers (4 pixels each).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel byte sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Horizontal add pairs, scale down by 128, pack to bytes.
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma bias
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1336 
// Convert RGBA pixels to Y (luma) bytes using the kRGBAToY coefficients.
// Processes 16 pixels (64 bytes in, 16 bytes out) per loop iteration.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16 bias vector
    "movdqa    %3,%%xmm4                       \n"  // kRGBAToY coefficients
    LABELALIGN
  "1:                                          \n"
    // Load 16 RGBA pixels into four registers (4 pixels each).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel byte sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Horizontal add pairs, scale down by 128, pack to bytes.
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma bias
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1370 
   1371 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
   1372                        uint8* dst_u, uint8* dst_v, int width) {
   1373   asm volatile (
   1374     "movdqa    %5,%%xmm3                       \n"
   1375     "movdqa    %6,%%xmm4                       \n"
   1376     "movdqa    %7,%%xmm5                       \n"
   1377     "sub       %1,%2                           \n"
   1378     LABELALIGN
   1379   "1:                                          \n"
   1380     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1381     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1382     "pavgb     %%xmm7,%%xmm0                   \n"
   1383     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1384     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1385     "pavgb     %%xmm7,%%xmm1                   \n"
   1386     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1387     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1388     "pavgb     %%xmm7,%%xmm2                   \n"
   1389     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1390     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1391     "pavgb     %%xmm7,%%xmm6                   \n"
   1392 
   1393     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1394     "movdqa    %%xmm0,%%xmm7                   \n"
   1395     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1396     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1397     "pavgb     %%xmm7,%%xmm0                   \n"
   1398     "movdqa    %%xmm2,%%xmm7                   \n"
   1399     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1400     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1401     "pavgb     %%xmm7,%%xmm2                   \n"
   1402     "movdqa    %%xmm0,%%xmm1                   \n"
   1403     "movdqa    %%xmm2,%%xmm6                   \n"
   1404     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1405     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1406     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1407     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1408     "phaddw    %%xmm2,%%xmm0                   \n"
   1409     "phaddw    %%xmm6,%%xmm1                   \n"
   1410     "psraw     $0x8,%%xmm0                     \n"
   1411     "psraw     $0x8,%%xmm1                     \n"
   1412     "packsswb  %%xmm1,%%xmm0                   \n"
   1413     "paddb     %%xmm5,%%xmm0                   \n"
   1414     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1415     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1416     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1417     "sub       $0x10,%3                        \n"
   1418     "jg        1b                              \n"
   1419   : "+r"(src_abgr0),       // %0
   1420     "+r"(dst_u),           // %1
   1421     "+r"(dst_v),           // %2
   1422     "+rm"(width)           // %3
   1423   : "r"((intptr_t)(src_stride_abgr)), // %4
   1424     "m"(kABGRToV),  // %5
   1425     "m"(kABGRToU),  // %6
   1426     "m"(kAddUV128)  // %7
   1427   : "memory", "cc", NACL_R14
   1428     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1429   );
   1430 }
   1431 
   1432 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
   1433                        uint8* dst_u, uint8* dst_v, int width) {
   1434   asm volatile (
   1435     "movdqa    %5,%%xmm3                       \n"
   1436     "movdqa    %6,%%xmm4                       \n"
   1437     "movdqa    %7,%%xmm5                       \n"
   1438     "sub       %1,%2                           \n"
   1439     LABELALIGN
   1440   "1:                                          \n"
   1441     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   1442     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
   1443     "pavgb     %%xmm7,%%xmm0                   \n"
   1444     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   1445     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
   1446     "pavgb     %%xmm7,%%xmm1                   \n"
   1447     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   1448     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
   1449     "pavgb     %%xmm7,%%xmm2                   \n"
   1450     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   1451     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
   1452     "pavgb     %%xmm7,%%xmm6                   \n"
   1453 
   1454     "lea       " MEMLEA(0x40,0) ",%0           \n"
   1455     "movdqa    %%xmm0,%%xmm7                   \n"
   1456     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   1457     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   1458     "pavgb     %%xmm7,%%xmm0                   \n"
   1459     "movdqa    %%xmm2,%%xmm7                   \n"
   1460     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   1461     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   1462     "pavgb     %%xmm7,%%xmm2                   \n"
   1463     "movdqa    %%xmm0,%%xmm1                   \n"
   1464     "movdqa    %%xmm2,%%xmm6                   \n"
   1465     "pmaddubsw %%xmm4,%%xmm0                   \n"
   1466     "pmaddubsw %%xmm4,%%xmm2                   \n"
   1467     "pmaddubsw %%xmm3,%%xmm1                   \n"
   1468     "pmaddubsw %%xmm3,%%xmm6                   \n"
   1469     "phaddw    %%xmm2,%%xmm0                   \n"
   1470     "phaddw    %%xmm6,%%xmm1                   \n"
   1471     "psraw     $0x8,%%xmm0                     \n"
   1472     "psraw     $0x8,%%xmm1                     \n"
   1473     "packsswb  %%xmm1,%%xmm0                   \n"
   1474     "paddb     %%xmm5,%%xmm0                   \n"
   1475     "movlps    %%xmm0," MEMACCESS(1) "         \n"
   1476     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
   1477     "lea       " MEMLEA(0x8,1) ",%1            \n"
   1478     "sub       $0x10,%3                        \n"
   1479     "jg        1b                              \n"
   1480   : "+r"(src_rgba0),       // %0
   1481     "+r"(dst_u),           // %1
   1482     "+r"(dst_v),           // %2
   1483     "+rm"(width)           // %3
   1484   : "r"((intptr_t)(src_stride_rgba)), // %4
   1485     "m"(kRGBAToV),  // %5
   1486     "m"(kRGBAToU),  // %6
   1487     "m"(kAddUV128)  // %7
   1488   : "memory", "cc", NACL_R14
   1489     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   1490   );
   1491 }
   1492 
   1493 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
   1494 
// Read 8 UV from 444 (no chroma subsampling).  Leaves interleaved U/V byte
// pairs in xmm0 and 8 Y bytes, each duplicated to 16 bits, in xmm4.
// Assumes [v_buf] has been rewritten as (v_buf - u_buf) by the caller's
// "sub %[u_buf],%[v_buf]" so one index register addresses both planes.
#define READYUV444                                                             \
    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 UV from 422 (2x1 subsampled), upsample to 8 UV by doubling each
// UV pair with punpcklwd.  Same register outputs as READYUV444.
#define READYUV422                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha bytes, loaded into
// xmm5 from [a_buf].
#define READYUVA422                                                            \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
   1528 
// Read 2 UV from 411 (4x1 subsampled), upsample to 8 UV by repeated
// doubling (punpcklwd then punpckldq).  Loads go through the [temp]
// scratch GPR with movzwl because:
// reading 4 bytes is an msan violation.
//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
// pinsrw fails with drmemory
//  __asm pinsrw     xmm0, [esi], 0        /* U */
//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
#define READYUV411_TEMP                                                        \
    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
    "movd       %[temp],%%xmm0                                  \n"            \
    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
    "movd       %[temp],%%xmm1                                  \n"            \
    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
   1548 
// Read 4 UV from NV12 (interleaved UV plane), upsample to 8 UV by
// doubling each UV word with punpcklwd.
#define READNV12                                                               \
    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 VU from NV21 (interleaved VU plane), upsample to 8 UV.  The
// kShuffleNV21 mask both swaps V/U byte order and duplicates the pairs.
#define READNV21                                                               \
    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.  The packed 16 bytes are
// loaded twice and shuffled into Y (xmm4) and UV (xmm0) lanes.
#define READYUY2                                                               \
    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.  Same scheme as READYUY2
// with the UYVY shuffle masks.
#define READUYVY                                                               \
    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
   1582 
#if defined(__x86_64__)
// On x86_64 the seven 16-byte rows of yuvconstants are preloaded once into
// xmm8-xmm14 so the per-pixel loop avoids memory operands.
#define YUVTORGB_SETUP(yuvconstants)                                           \
    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
// Convert 8 pixels: 8 UV (xmm0) and 8 Y (xmm4).  Leaves the three color
// channels packed to unsigned bytes in the low halves of xmm0/xmm1/xmm2.
#define YUVTORGB(yuvconstants)                                                 \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "movdqa     %%xmm0,%%xmm3                                   \n"            \
    "movdqa     %%xmm11,%%xmm0                                  \n"            \
    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
    "psubw      %%xmm1,%%xmm0                                   \n"            \
    "movdqa     %%xmm12,%%xmm1                                  \n"            \
    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
    "psubw      %%xmm2,%%xmm1                                   \n"            \
    "movdqa     %%xmm13,%%xmm2                                  \n"            \
    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
    "psubw      %%xmm3,%%xmm2                                   \n"            \
    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
    "paddsw     %%xmm4,%%xmm0                                   \n"            \
    "paddsw     %%xmm4,%%xmm1                                   \n"            \
    "paddsw     %%xmm4,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
// Extra registers the x86_64 variant clobbers; spliced into clobber lists.
#define YUVTORGB_REGS \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
// 32-bit: only 8 xmm registers, so the constants stay as memory operands.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV (xmm0) and 8 Y (xmm4).  Same outputs as the
// x86_64 variant above.
#define YUVTORGB(yuvconstants)                                                 \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "movdqa     %%xmm0,%%xmm3                                   \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
    "psubw      %%xmm1,%%xmm0                                   \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
    "psubw      %%xmm2,%%xmm1                                   \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
    "psubw      %%xmm3,%%xmm2                                   \n"            \
    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
    "paddsw     %%xmm4,%%xmm0                                   \n"            \
    "paddsw     %%xmm4,%%xmm1                                   \n"            \
    "paddsw     %%xmm4,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
#define YUVTORGB_REGS
#endif
   1647 
// Store 8 ARGB values (32 bytes).  Interleaves the three channel registers
// (xmm0/xmm1/xmm2) with the alpha register (xmm5) and writes two 16-byte
// unaligned stores, advancing dst_argb by 0x20.
#define STOREARGB                                                              \
    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
    "movdqa     %%xmm0,%%xmm1                                    \n"           \
    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
   1670 
// Convert a row of I444 (non-subsampled planar YUV) to ARGB, 8 pixels per
// loop iteration, using the supplied yuvconstants matrix.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf -= u_buf (offset addressing)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // alpha = 0xff
    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1698 
// Convert a row of I422 planar YUV to packed 24-bit RGB, 8 pixels (24 output
// bytes) per loop iteration.  The ARGB intermediate is compacted to RGB24
// with the two shuffle masks plus palignr.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf -= u_buf (offset addressing)
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    // Interleave channels into ARGB-layout words, then shuffle the alpha
    // bytes out to produce 24 contiguous RGB bytes.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "subl      $0x8,%[width]                   \n"  // subl: width may be a memory operand on i386 PIC
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   1743 
// Convert a row of I422 planar YUV (2x1 subsampled chroma) to ARGB,
// 8 pixels per loop iteration, with alpha forced to 0xff.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf -= u_buf (offset addressing)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // alpha = 0xff
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1771 
   1772 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Convert a row of I422 planar YUV plus a separate alpha plane to ARGB,
// 8 pixels per loop iteration.  READYUVA422 loads the 8 alpha bytes into
// xmm5, which STOREARGB interleaves as the alpha channel.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf -= u_buf (offset addressing)
    LABELALIGN
  "1:                                          \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl      $0x8,%[width]                   \n"  // subl: width may be a memory operand on i386 PIC
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1805 #endif  // HAS_I422ALPHATOARGBROW_SSSE3
   1806 
   1807 #ifdef HAS_I411TOARGBROW_SSSE3
// Convert a row of I411 planar YUV (4x1 subsampled chroma) to ARGB,
// 8 pixels per loop iteration.  Uses a scratch GPR ([temp]) because
// READYUV411_TEMP loads chroma via movzwl (see that macro's rationale).
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  int temp;  // scratch register for the movzwl chroma loads
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf -= u_buf (offset addressing)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // alpha = 0xff
    LABELALIGN
  "1:                                          \n"
    READYUV411_TEMP
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl      $0x8,%[width]                   \n"  // subl: width may be a memory operand on i386 PIC
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [temp]"=&r"(temp),         // %[temp]
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)         // %[width]
#else
    [width]"+rm"(width)        // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1841 #endif
   1842 
// Convert a row of NV12 (Y plane + interleaved U,V plane) to ARGB,
// 8 pixels per loop iteration.
// uv_buf: interleaved U,V bytes, one pair shared by two horizontal pixels.
// width: pixel count; the loop steps by 8.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1867 
// Convert a row of NV21 (Y plane + interleaved V,U plane) to ARGB,
// 8 pixels per loop iteration.
// vu_buf: interleaved V,U bytes; kShuffleNV21 (used by READNV21) swaps the
// byte order back to U,V.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1893 
// Convert a row of packed YUY2 to ARGB, 8 pixels per loop iteration.
// yuy2_buf: packed Y0 U Y1 V source; the two shuffle constants (used inside
// READYUY2) split it into separate Y and UV lanes.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1918 
// Convert a row of packed UYVY to ARGB, 8 pixels per loop iteration.
// uyvy_buf: packed U Y0 V Y1 source; the two shuffle constants (used inside
// READUYVY) split it into separate Y and UV lanes.
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1943 
// Convert a row of I422 YUV (2x1 subsampled chroma) to RGBA (STORERGBA
// byte order rather than STOREARGB), 8 pixels per loop iteration.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Only u_buf advances; v is addressed as u + (v_buf - u_buf).
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   1971 
   1972 #endif  // HAS_I422TOARGBROW_SSSE3
   1973 
// Read 16 UV from 444.  Loads 16 U and 16 V bytes (V addressed relative to
// U via the precomputed v_buf - u_buf offset), interleaves them into U,V
// byte pairs in ymm0, and loads 16 Y into ymm4 widened to words.
#define READYUV444_AVX2                                                        \
    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

// Read 8 UV from 422, upsample to 16 UV.  Each U,V pair is duplicated
// (vpunpcklwd with itself) so two horizontal pixels share one chroma sample.
#define READYUV422_AVX2                                                        \
    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.  Same as
// READYUV422_AVX2 plus a 16-byte alpha load from a_buf into ymm5, so
// callers must NOT preset ymm5 to opaque.
#define READYUVA422_AVX2                                                       \
    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"

// Read 4 UV from 411, upsample to 16 UV.  Each U,V pair is replicated four
// times (word then dword duplication) so four horizontal pixels share one
// chroma sample.
#define READYUV411_AVX2                                                        \
    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

// Read 8 UV from NV12, upsample to 16 UV.  The source plane is already
// interleaved U,V so only word duplication is needed.
#define READNV12_AVX2                                                          \
    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

// Read 8 VU from NV21, upsample to 16 UV.  kShuffleNV21 both swaps the V,U
// byte order to U,V and duplicates each pair.
#define READNV21_AVX2                                                          \
    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.  The same 32 packed
// bytes are loaded twice and shuffled separately into Y (ymm4) and UV (ymm0).
#define READYUY2_AVX2                                                          \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.  Same scheme as
// READYUY2_AVX2, with shuffles matching the UYVY byte order.
#define READUYVY_AVX2                                                          \
    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
   2067 
#if defined(__x86_64__)
// x64: preload all seven 32-byte rows of YuvConstants into ymm8-ymm14 once
// per call, so the per-pixel loop needs no constant loads.
#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
// Convert 16 pixels: 16 UV (ymm0) and 16 Y (ymm4) to three planar channel
// registers ymm0/ymm1/ymm2 (bytes, low 16 lanes of each).
#define YUVTORGB_AVX2(yuvconstants)                                            \
    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
// Extra clobbers for the x64 variant's cached-constant registers.
#define YUVTORGB_REGS_AVX2 \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
#else  // Convert 16 pixels: 16 UV and 16 Y.
// i386: only 8 ymm registers, so constants cannot be cached; each iteration
// reads them from [yuvconstants] directly, using ymm3 as scratch.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB_AVX2(yuvconstants)                                            \
    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
#define YUVTORGB_REGS_AVX2
#endif
   2120 
// Store 16 ARGB values.  Weaves the three channel registers from
// YUVTORGB_AVX2 (ymm0/ymm1/ymm2) with the alpha bytes in ymm5 into
// interleaved 4-byte pixels and writes 64 bytes to dst_argb.
#define STOREARGB_AVX2                                                         \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
   2132 
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// I444 has unsubsampled chroma: one U and one V byte per pixel.
void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // Only u_buf advances; v is addressed as u + (v_buf - u_buf).
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I444TOARGBROW_AVX2
   2165 
#ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// I411 has 4x1 subsampled chroma: one U/V pair per four horizontal pixels.
void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // Only u_buf advances; v is addressed as u + (v_buf - u_buf).
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READYUV411_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I411TOARGBROW_AVX2
   2198 
#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// I422 has 2x1 subsampled chroma: one U/V pair per two horizontal pixels.
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // Only u_buf advances; v is addressed as u + (v_buf - u_buf).
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2
   2231 
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Unlike the non-alpha variants there is no vpcmpeqb: alpha is loaded from
// a_buf into ymm5 by READYUVA422_AVX2 each iteration.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               const uint8* a_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // Only u_buf advances; v is addressed as u + (v_buf - u_buf).
    "sub       %[u_buf],%[v_buf]               \n"
    LABELALIGN
  "1:                                          \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // "subl" (32-bit) because width may be a memory operand on i386/PIC.
    "subl      $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__) && defined(__pic__)
    // NOTE(review): "+m" presumably avoids running out of registers under
    // i386 PIC given the five pointer operands -- confirm before changing.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2
   2269 
#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Uses an inline weave (alpha first) instead of STOREARGB_AVX2 to produce
// the RGBA byte ordering.
void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // Only u_buf advances; v is addressed as u + (v_buf - u_buf).
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TORGBAROW_AVX2
   2312 
   2313 #if defined(HAS_NV12TOARGBROW_AVX2)
   2314 // 16 pixels.
   2315 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2316 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
   2317                                const uint8* uv_buf,
   2318                                uint8* dst_argb,
   2319                                const struct YuvConstants* yuvconstants,
   2320                                int width) {
   2321   asm volatile (
   2322     YUVTORGB_SETUP_AVX2(yuvconstants)
   2323     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   2324     LABELALIGN
   2325   "1:                                          \n"
   2326     READNV12_AVX2
   2327     YUVTORGB_AVX2(yuvconstants)
   2328     STOREARGB_AVX2
   2329     "sub       $0x10,%[width]                  \n"
   2330     "jg        1b                              \n"
   2331     "vzeroupper                                \n"
   2332   : [y_buf]"+r"(y_buf),    // %[y_buf]
   2333     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
   2334     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
   2335     [width]"+rm"(width)    // %[width]
   2336   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   2337     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
   2338     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   2339   );
   2340 }
   2341 #endif  // HAS_NV12TOARGBROW_AVX2
   2342 
   2343 #if defined(HAS_NV21TOARGBROW_AVX2)
   2344 // 16 pixels.
   2345 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
   2346 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
   2347                                const uint8* vu_buf,
   2348                                uint8* dst_argb,
   2349                                const struct YuvConstants* yuvconstants,
   2350                                int width) {
   2351   asm volatile (
   2352     YUVTORGB_SETUP_AVX2(yuvconstants)
   2353     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   2354     LABELALIGN
   2355   "1:                                          \n"
   2356     READNV21_AVX2
   2357     YUVTORGB_AVX2(yuvconstants)
   2358     STOREARGB_AVX2
   2359     "sub       $0x10,%[width]                  \n"
   2360     "jg        1b                              \n"
   2361     "vzeroupper                                \n"
   2362   : [y_buf]"+r"(y_buf),    // %[y_buf]
   2363     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
   2364     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
   2365     [width]"+rm"(width)    // %[width]
   2366   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
   2367     [kShuffleNV21]"m"(kShuffleNV21)
   2368     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
   2369       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   2370   );
   2371 }
   2372 #endif  // HAS_NV21TOARGBROW_AVX2
   2373 
   2374 #if defined(HAS_YUY2TOARGBROW_AVX2)
   2375 // 16 pixels.
   2376 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
   2377 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
   2378                                uint8* dst_argb,
   2379                                const struct YuvConstants* yuvconstants,
   2380                                int width) {
   2381   asm volatile (
   2382     YUVTORGB_SETUP_AVX2(yuvconstants)
   2383     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   2384     LABELALIGN
   2385   "1:                                          \n"
   2386     READYUY2_AVX2
   2387     YUVTORGB_AVX2(yuvconstants)
   2388     STOREARGB_AVX2
   2389     "sub       $0x10,%[width]                  \n"
   2390     "jg        1b                              \n"
   2391     "vzeroupper                                \n"
   2392   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
   2393     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
   2394     [width]"+rm"(width)    // %[width]
   2395   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
   2396     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
   2397     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
   2398     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
   2399       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   2400   );
   2401 }
   2402 #endif  // HAS_YUY2TOARGBROW_AVX2
   2403 
#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// uyvy_buf: packed U Y0 V Y1 source; the two shuffle constants (used by
// READUYVY_AVX2) split it into separate Y and UV lanes.
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff (opaque alpha)
    LABELALIGN
  "1:                                          \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_UYVYTOARGBROW_AVX2
   2433 
   2434 #ifdef HAS_I400TOARGBROW_SSE2
// Converts one row of 8-bit luma (I400) to gray ARGB: each output pixel is
// (A=0xff, R=G=B=clamp((Y - 16) * 1.164)).  Processes 8 pixels per loop.
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    // xmm2 = 16 x uint16 multiplier; xmm3 = 16 x uint16 bias (the 16-offset
    // pre-scaled by 1.164); xmm4 = 0xff000000 per lane (alpha mask).
    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
    "movd      %%eax,%%xmm2                    \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
    "movd      %%eax,%%xmm3                    \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    LABELALIGN
  "1:                                          \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "psubusw   %%xmm3,%%xmm0                   \n"
    "psrlw     $6, %%xmm0                      \n"
    "packuswb  %%xmm0,%%xmm0                   \n"

    // Step 2: Weave into ARGB
    // Duplicate each gray byte into B,G,R positions, then OR in 0xff alpha.
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "por       %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"

    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
   2477 #endif  // HAS_I400TOARGBROW_SSE2
   2478 
   2479 #ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// AVX2 variant of I400ToARGBRow_SSE2: gray ARGB = clamp((Y - 16) * 1.164)
// replicated to B,G,R with alpha forced to 0xff.
// NOTE: the two hex-explaining comments below were previously swapped; they
// now match the constants they annotate.
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
    "vmovd      %%eax,%%xmm2                   \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"
    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
    "vmovd      %%eax,%%xmm3                   \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"
    // ymm4 = 0xff000000 per dword lane: the alpha mask ORed into each pixel.
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpslld     $0x18,%%ymm4,%%ymm4            \n"

    LABELALIGN
  "1:                                          \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
    "lea        " MEMLEA(0x10,0) ",%0          \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    // Step 2: weave the gray bytes into B,G,R and OR in the alpha mask.
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub        $0x10,%2                       \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
   2524 #endif  // HAS_I400TOARGBROW_AVX2
   2525 
   2526 #ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Horizontally mirrors a row of bytes: dst[i] = src[width - 1 - i].
// Reads 16 bytes from the tail of src and byte-reverses them with pshufb.
// 16 pixels per iteration.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    LABELALIGN
  "1:                                          \n"
    // Load the last unprocessed 16 bytes: src + width - 16.
    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
   2552 #endif  // HAS_MIRRORROW_SSSE3
   2553 
   2554 #ifdef HAS_MIRRORROW_AVX2
// AVX2 variant of MirrorRow: mirrors 32 bytes per iteration.  pshufb only
// reverses within each 128-bit lane, so vpermq $0x4e swaps the two lanes to
// complete the 32-byte reversal.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "vbroadcastf128 %3,%%ymm5                  \n"
    LABELALIGN
  "1:                                          \n"
    // Load the last unprocessed 32 bytes: src + width - 32.
    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
   2577 #endif  // HAS_MIRRORROW_AVX2
   2578 
   2579 #ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Low 8 result bytes = U values in reverse order, high 8 = V values reversed.
static uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// Mirrors a row of interleaved UV bytes and splits the planes: walks src
// backwards 16 bytes (8 UV pairs) at a time, writing 8 reversed U bytes to
// dst_u and 8 reversed V bytes to dst_v.
// width is in UV pairs; %4 is the stride used to seat %0 at the row tail.
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    // Start at the last 16 bytes of the row; dst_v is kept as an offset
    // from dst_u so one index register serves both outputs.
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "pshufb    %%xmm1,%%xmm0                   \n"
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $8,%3                           \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
   2610 #endif  // HAS_MIRRORUVROW_SSSE3
   2611 
   2612 #ifdef HAS_ARGBMIRRORROW_SSE2
   2613 
// Horizontally mirrors a row of ARGB pixels: dst[i] = src[width - 1 - i].
// Pixels (4-byte units) are reversed but bytes within a pixel keep order,
// hence pshufd $0x1b (reverse the four dwords).  4 pixels per iteration.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Point %0 at the last 16 bytes (4 pixels) of the source row.
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
    , "xmm0"
  );
}
   2635 #endif  // HAS_ARGBMIRRORROW_SSE2
   2636 
   2637 #ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
// vpermd indices that reverse the eight dwords (pixels) of a ymm register.
static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
// AVX2 variant of ARGBMirrorRow: reverses 8 ARGB pixels per iteration with a
// single cross-lane vpermd load.
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "vmovdqu    %3,%%ymm5                      \n"
    LABELALIGN
  "1:                                          \n"
    // Load the last unprocessed 8 pixels and reverse their dword order.
    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x8,%2                        \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror_AVX2) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
   2662 #endif  // HAS_ARGBMIRRORROW_AVX2
   2663 
   2664 #ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a row of UV pairs into separate U and V planes.
// Reads 64 interleaved bytes (32 UV pairs) per iteration; even bytes go to
// dst_u, odd bytes to dst_v.  width is in bytes of the source row.
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    // ymm5 = 0x00ff repeated: mask that keeps the even (U) bytes.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
    // Keep dst_v as an offset from dst_u so one pointer is advanced.
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
    "lea        " MEMLEA(0x40,0) ",%0            \n"
    // High bytes of each word = V; low (masked) bytes = U.
    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
    // vpackuswb works per 128-bit lane; vpermq restores byte order.
    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
    "lea        " MEMLEA(0x20,1) ",%1            \n"
    "sub        $0x20,%3                         \n"
    "jg         1b                               \n"
    "vzeroupper                                  \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(width)         // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   2699 #endif  // HAS_SPLITUVROW_AVX2
   2700 
   2701 #ifdef HAS_SPLITUVROW_SSE2
// SSE2 variant of SplitUVRow: de-interleaves 32 bytes (16 UV pairs) per
// iteration; even bytes to dst_u, odd bytes to dst_v.
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    // xmm5 = 0x00ff repeated: mask that keeps the even (U) bytes.
    "pcmpeqb    %%xmm5,%%xmm5                    \n"
    "psrlw      $0x8,%%xmm5                      \n"
    // Keep dst_v as an offset from dst_u so one pointer is advanced.
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
    "lea        " MEMLEA(0x20,0) ",%0            \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    // xmm0/xmm1: masked U bytes; xmm2/xmm3: shifted-down V bytes.
    "pand       %%xmm5,%%xmm0                    \n"
    "pand       %%xmm5,%%xmm1                    \n"
    "packuswb   %%xmm1,%%xmm0                    \n"
    "psrlw      $0x8,%%xmm2                      \n"
    "psrlw      $0x8,%%xmm3                      \n"
    "packuswb   %%xmm3,%%xmm2                    \n"
    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1            \n"
    "sub        $0x10,%3                         \n"
    "jg         1b                               \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(width)         // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   2735 #endif  // HAS_SPLITUVROW_SSE2
   2736 
   2737 #ifdef HAS_MERGEUVROW_AVX2
// Interleaves separate U and V planes into a packed UV row: 32 U bytes and
// 32 V bytes in, 64 interleaved bytes out, per iteration.
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    // Keep src_v as an offset from src_u so one pointer is advanced.
    "sub       %0,%1                             \n"
    LABELALIGN
  "1:                                            \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
    "lea       " MEMLEA(0x20,0) ",%0             \n"
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
    // The unpacks interleave per 128-bit lane; storing the four halves in
    // this order restores sequential UV output.
    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
    "lea       " MEMLEA(0x40,2) ",%2             \n"
    "sub       $0x20,%3                          \n"
    "jg        1b                                \n"
    "vzeroupper                                  \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
   2766 #endif  // HAS_MERGEUVROW_AVX2
   2767 
   2768 #ifdef HAS_MERGEUVROW_SSE2
// SSE2 variant of MergeUVRow: interleaves 16 U bytes and 16 V bytes into
// 32 packed UV bytes per iteration.
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    // Keep src_v as an offset from src_u so one pointer is advanced.
    "sub       %0,%1                             \n"
    LABELALIGN
  "1:                                            \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm2                     \n"
    // Low and high byte-interleaves produce the two output halves.
    "punpcklbw %%xmm1,%%xmm0                     \n"
    "punpckhbw %%xmm1,%%xmm2                     \n"
    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
    "lea       " MEMLEA(0x20,2) ",%2             \n"
    "sub       $0x10,%3                          \n"
    "jg        1b                                \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
   2795 #endif  // HAS_MERGEUVROW_SSE2
   2796 
   2797 #ifdef HAS_COPYROW_SSE2
// Copies count bytes from src to dst, 32 bytes per iteration.  Uses the
// aligned movdqa path when both pointers are 16-byte aligned, otherwise the
// unaligned movdqu path.  NOTE(review): each pass moves a full 32 bytes, so
// a count that is not a multiple of 32 writes past count — callers appear
// expected to allow this; confirm against row buffer allocation.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    // Fall through to label 2 (unaligned) if either pointer is misaligned.
    "test       $0xf,%0                        \n"
    "jne        2f                             \n"
    "test       $0xf,%1                        \n"
    "jne        2f                             \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       9f                              \n"
    LABELALIGN
  "2:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        2b                              \n"
  "9:                                          \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   2834 #endif  // HAS_COPYROW_SSE2
   2835 
   2836 #ifdef HAS_COPYROW_AVX
// Copies count bytes from src to dst, 64 bytes per iteration using
// unaligned AVX loads/stores.  NOTE(review): no trailing vzeroupper here,
// unlike the other AVX routines in this file — confirm whether that is
// intentional (SSE/AVX transition penalty on some CPUs).
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x40,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   2857 #endif  // HAS_COPYROW_AVX
   2858 
   2859 #ifdef HAS_COPYROW_ERMS
// Multiple of 1.
// Copies width bytes with "rep movsb" (fast on CPUs with Enhanced REP
// MOVSB).  src/dst/count are pinned to rsi/rdi/rcx as the string
// instruction requires.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep movsb " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
   2872 #endif  // HAS_COPYROW_ERMS
   2873 
   2874 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copies only the alpha channel from src ARGB to dst ARGB, preserving the
// destination's B,G,R bytes.  8 pixels per iteration (read-modify-write on
// dst).
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask); xmm1 = its complement.
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    // Combine src alpha with dst RGB.
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   2908 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
   2909 
   2910 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// AVX2 variant of ARGBCopyAlphaRow: 16 pixels per iteration.  Uses
// vpblendvb with a 0x00ffffff mask so dst RGB bytes are selected and src
// alpha bytes pass through.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: blend mask selecting dst's RGB.
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
   2937 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
   2938 
   2939 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Extracts the alpha byte of each ARGB pixel into a contiguous plane:
// dst_a[i] = src_argb[i*4 + 3].  8 pixels per iteration.
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
 asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
    "lea       " MEMLEA(0x20, 0) ", %0         \n"
    // Shift alpha down to the low byte of each dword, then pack 8 dwords
    // into 8 bytes.
    "psrld     $0x18, %%xmm0                   \n"
    "psrld     $0x18, %%xmm1                   \n"
    "packssdw  %%xmm1, %%xmm0                  \n"
    "packuswb  %%xmm0, %%xmm0                  \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8, 1) ", %1          \n"
    "sub       $0x8, %2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_a),     // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   2964 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
   2965 
   2966 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Copies a row of 8-bit Y values into the alpha channel of dst ARGB,
// preserving dst's B,G,R bytes.  8 pixels per iteration.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask); xmm1 = its complement.
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    // Expand each Y byte so it lands in the top byte of its dword.
    // NOTE(review): punpckhwd reads xmm2 into xmm3 whose prior contents are
    // stale; the stale low words are discarded by the pand with xmm0 below.
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpckhwd %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm2,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   3002 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
   3003 
   3004 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// AVX2 variant of ARGBCopyYToAlphaRow: 16 pixels per iteration.  Zero-
// extends each Y byte to a dword, shifts it into the alpha position, and
// blends with dst so B,G,R are preserved.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: blend mask selecting dst's RGB.
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Move each Y into the top (alpha) byte of its dword.
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
   3033 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
   3034 
   3035 #ifdef HAS_SETROW_X86
// Fills a row with a repeated byte using "rep stosl" (dword stores).
// Only width >> 2 dwords are written, so the final (width & 3) bytes are
// not touched — callers are expected to pass a suitable width.
void SetRow_X86(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile (
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}
   3046 
// Fills width bytes with v8 using "rep stosb" (fast with Enhanced REP
// STOSB).  Handles any width, including 0.
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosb " MEMSTORESTRING(al,0) "        \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v8)          // %2
    : "memory", "cc");
}
   3056 
// Fills a row of ARGB pixels with the 32-bit value v32 using "rep stosl".
// width is in pixels (one dword store per pixel).
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
    : "+D"(dst_argb),  // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}
   3066 #endif  // HAS_SETROW_X86
   3067 
   3068 #ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y plane from a packed YUY2 row (bytes Y0 U Y1 V ...): the
// even bytes are Y.  16 pixels (32 source bytes) per iteration.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    // xmm5 = 0x00ff repeated: mask keeping the even (Y) bytes.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
   3093 
// Extracts U and V planes from two YUY2 rows, averaging vertically
// (this row and the row at stride_yuy2) for 4:2:0 chroma subsampling.
// 16 pixels per iteration produce 8 U and 8 V bytes.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    // xmm5 = 0x00ff repeated: word mask for selecting the U bytes.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // Keep dst_v as an offset from dst_u so one pointer is advanced.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    // Load the same 32 bytes from the next row and average.
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    // Discard Y (even) bytes, leaving packed UVUV...; then split U from V.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   3131 
        // Same chroma extraction as YUY2ToUVRow_SSE2 but from a single row
        // (no vertical averaging) — i.e. 4:2:2 subsampled output.
        // 16 pixels per iteration; writes 8 U and 8 V bytes.
   3132 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   3133                          uint8* dst_u, uint8* dst_v, int width) {
   3134   asm volatile (
   3135     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   3136     "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per 16-bit lane
   3137     "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
   3138     LABELALIGN
   3139   "1:                                          \n"
   3140     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   3141     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   3142     "lea       " MEMLEA(0x20,0) ",%0           \n"
   3143     "psrlw     $0x8,%%xmm0                     \n"  // keep odd (chroma) bytes
   3144     "psrlw     $0x8,%%xmm1                     \n"
   3145     "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = UVUV...
   3146     "movdqa    %%xmm0,%%xmm1                   \n"
   3147     "pand      %%xmm5,%%xmm0                   \n"  // even bytes = U
   3148     "packuswb  %%xmm0,%%xmm0                   \n"
   3149     "psrlw     $0x8,%%xmm1                     \n"  // odd bytes = V
   3150     "packuswb  %%xmm1,%%xmm1                   \n"
   3151     "movq      %%xmm0," MEMACCESS(1) "         \n"
   3152     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
   3153     "lea       " MEMLEA(0x8,1) ",%1            \n"
   3154     "sub       $0x10,%3                        \n"
   3155     "jg        1b                              \n"
   3156   : "+r"(src_yuy2),    // %0
   3157     "+r"(dst_u),       // %1
   3158     "+r"(dst_v),       // %2
   3159     "+r"(width)          // %3
   3160   :
   3161   : "memory", "cc", NACL_R14
   3162     "xmm0", "xmm1", "xmm5"
   3163   );
   3164 }
   3165 
        // Copies luma from UYVY (bytes: U0 Y0 V0 Y1 ...).  Y occupies the odd
        // bytes, so a 16-bit right shift discards chroma; packuswb re-packs
        // to bytes.  16 pixels (32 source bytes, 16 Y bytes) per iteration.
   3166 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
   3167   asm volatile (
   3168     LABELALIGN
   3169   "1:                                          \n"
   3170     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   3171     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   3172     "lea       " MEMLEA(0x20,0) ",%0           \n"
   3173     "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes = Y
   3174     "psrlw     $0x8,%%xmm1                     \n"
   3175     "packuswb  %%xmm1,%%xmm0                   \n"
   3176     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   3177     "lea       " MEMLEA(0x10,1) ",%1           \n"
   3178     "sub       $0x10,%2                        \n"
   3179     "jg        1b                              \n"
   3180   : "+r"(src_uyvy),  // %0
   3181     "+r"(dst_y),     // %1
   3182     "+r"(width)        // %2
   3183   :
   3184   : "memory", "cc"
   3185     , "xmm0", "xmm1"
   3186   );
   3187 }
   3188 
        // Extracts U and V from UYVY (bytes: U0 Y0 V0 Y1 ...).  Averages two
        // rows; chroma sits in the even bytes, so it is masked out (pand)
        // rather than shifted, then deinterleaved into dst_u/dst_v.
        // 16 pixels per iteration; writes 8 U and 8 V bytes.
   3189 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   3190                       uint8* dst_u, uint8* dst_v, int width) {
   3191   asm volatile (
   3192     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   3193     "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per 16-bit lane
   3194     "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
   3195     LABELALIGN
   3196   "1:                                          \n"
   3197     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   3198     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   3199     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
   3200     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
   3201     "lea       " MEMLEA(0x20,0) ",%0           \n"
   3202     "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average with second row
   3203     "pavgb     %%xmm3,%%xmm1                   \n"
   3204     "pand      %%xmm5,%%xmm0                   \n"  // keep even (chroma) bytes
   3205     "pand      %%xmm5,%%xmm1                   \n"
   3206     "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = UVUV...
   3207     "movdqa    %%xmm0,%%xmm1                   \n"
   3208     "pand      %%xmm5,%%xmm0                   \n"  // even bytes = U
   3209     "packuswb  %%xmm0,%%xmm0                   \n"
   3210     "psrlw     $0x8,%%xmm1                     \n"  // odd bytes = V
   3211     "packuswb  %%xmm1,%%xmm1                   \n"
   3212     "movq      %%xmm0," MEMACCESS(1) "         \n"
   3213     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
   3214     "lea       " MEMLEA(0x8,1) ",%1            \n"
   3215     "sub       $0x10,%3                        \n"
   3216     "jg        1b                              \n"
   3217   : "+r"(src_uyvy),    // %0
   3218     "+r"(dst_u),       // %1
   3219     "+r"(dst_v),       // %2
   3220     "+r"(width)          // %3
   3221   : "r"((intptr_t)(stride_uyvy))  // %4
   3222   : "memory", "cc", NACL_R14
   3223     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   3224   );
   3225 }
   3226 
        // Single-row (no vertical averaging) variant of UYVYToUVRow_SSE2:
        // masks out the even (chroma) bytes and deinterleaves into U and V.
        // 16 pixels per iteration; writes 8 U and 8 V bytes.
   3227 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
   3228                          uint8* dst_u, uint8* dst_v, int width) {
   3229   asm volatile (
   3230     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   3231     "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per 16-bit lane
   3232     "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
   3233     LABELALIGN
   3234   "1:                                          \n"
   3235     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   3236     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   3237     "lea       " MEMLEA(0x20,0) ",%0           \n"
   3238     "pand      %%xmm5,%%xmm0                   \n"  // keep even (chroma) bytes
   3239     "pand      %%xmm5,%%xmm1                   \n"
   3240     "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = UVUV...
   3241     "movdqa    %%xmm0,%%xmm1                   \n"
   3242     "pand      %%xmm5,%%xmm0                   \n"  // even bytes = U
   3243     "packuswb  %%xmm0,%%xmm0                   \n"
   3244     "psrlw     $0x8,%%xmm1                     \n"  // odd bytes = V
   3245     "packuswb  %%xmm1,%%xmm1                   \n"
   3246     "movq      %%xmm0," MEMACCESS(1) "         \n"
   3247     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
   3248     "lea       " MEMLEA(0x8,1) ",%1            \n"
   3249     "sub       $0x10,%3                        \n"
   3250     "jg        1b                              \n"
   3251   : "+r"(src_uyvy),    // %0
   3252     "+r"(dst_u),       // %1
   3253     "+r"(dst_v),       // %2
   3254     "+r"(width)          // %3
   3255   :
   3256   : "memory", "cc", NACL_R14
   3257     "xmm0", "xmm1", "xmm5"
   3258   );
   3259 }
   3260 #endif  // HAS_YUY2TOYROW_SSE2
   3261 
   3262 #ifdef HAS_YUY2TOYROW_AVX2
        // AVX2 luma copy from YUY2: Y occupies the even bytes and is isolated
        // with a 0x00ff word mask.  vpackuswb packs within 128-bit lanes, so
        // vpermq $0xd8 restores linear byte order.  32 pixels per iteration.
   3263 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
   3264   asm volatile (
   3265     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
   3266     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per 16-bit lane
   3267     LABELALIGN
   3268   "1:                                          \n"
   3269     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
   3270     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
   3271     "lea       " MEMLEA(0x40,0) ",%0           \n"
   3272     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep even bytes = Y
   3273     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
   3274     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
   3275     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix lane-wise pack order
   3276     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
   3277     "lea      " MEMLEA(0x20,1) ",%1            \n"
   3278     "sub       $0x20,%2                        \n"
   3279     "jg        1b                              \n"
   3280     "vzeroupper                                \n"  // avoid AVX->SSE transition penalty
   3281   : "+r"(src_yuy2),  // %0
   3282     "+r"(dst_y),     // %1
   3283     "+r"(width)        // %2
   3284   :
   3285   : "memory", "cc"
   3286     , "xmm0", "xmm1", "xmm5"
   3287   );
   3288 }
   3289 
        // AVX2 chroma extraction from two vertically-averaged YUY2 rows.
        // Odd bytes (U/V) are kept, deinterleaved into U and V, and each
        // 128-bit result stored with vextractf128.  32 pixels per iteration;
        // writes 16 U and 16 V bytes.
   3290 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
   3291                       uint8* dst_u, uint8* dst_v, int width) {
   3292   asm volatile (
   3293     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
   3294     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per 16-bit lane
   3295     "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
   3296     LABELALIGN
   3297   "1:                                          \n"
   3298     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
   3299     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
   3300     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
   3301     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
   3302     "lea       " MEMLEA(0x40,0) ",%0           \n"
   3303     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep odd (chroma) bytes
   3304     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
   3305     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
   3306     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix lane-wise pack order
   3307     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = U (even bytes)
   3308     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = V (odd bytes)
   3309     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
   3310     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
   3311     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
   3312     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
   3313     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
   3314     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
   3315     "lea      " MEMLEA(0x10,1) ",%1            \n"
   3316     "sub       $0x20,%3                        \n"
   3317     "jg        1b                              \n"
   3318     "vzeroupper                                \n"
   3319   : "+r"(src_yuy2),    // %0
   3320     "+r"(dst_u),       // %1
   3321     "+r"(dst_v),       // %2
   3322     "+r"(width)          // %3
   3323   : "r"((intptr_t)(stride_yuy2))  // %4
   3324   : "memory", "cc", NACL_R14
   3325     "xmm0", "xmm1", "xmm5"
   3326   );
   3327 }
   3328 
        // Single-row (4:2:2) variant of YUY2ToUVRow_AVX2 — identical flow
        // without the vpavgb vertical averaging.  32 pixels per iteration.
   3329 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
   3330                          uint8* dst_u, uint8* dst_v, int width) {
   3331   asm volatile (
   3332     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
   3333     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per 16-bit lane
   3334     "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
   3335     LABELALIGN
   3336   "1:                                          \n"
   3337     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
   3338     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
   3339     "lea       " MEMLEA(0x40,0) ",%0           \n"
   3340     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep odd (chroma) bytes
   3341     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
   3342     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
   3343     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix lane-wise pack order
   3344     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = U, then ymm0 = V below
   3345     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
   3346     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
   3347     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
   3348     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
   3349     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
   3350     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
   3351     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
   3352     "lea      " MEMLEA(0x10,1) ",%1            \n"
   3353     "sub       $0x20,%3                        \n"
   3354     "jg        1b                              \n"
   3355     "vzeroupper                                \n"
   3356   : "+r"(src_yuy2),    // %0
   3357     "+r"(dst_u),       // %1
   3358     "+r"(dst_v),       // %2
   3359     "+r"(width)          // %3
   3360   :
   3361   : "memory", "cc", NACL_R14
   3362     "xmm0", "xmm1", "xmm5"
   3363   );
   3364 }
   3365 
        // AVX2 luma copy from UYVY: Y occupies the odd bytes, extracted with
        // a 16-bit right shift.  32 pixels per iteration.
        // NOTE(review): xmm5 is listed in the clobbers but not used here —
        // harmless extra clobber.
   3366 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
   3367   asm volatile (
   3368     LABELALIGN
   3369   "1:                                          \n"
   3370     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
   3371     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
   3372     "lea       " MEMLEA(0x40,0) ",%0           \n"
   3373     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep odd bytes = Y
   3374     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
   3375     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
   3376     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix lane-wise pack order
   3377     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
   3378     "lea      " MEMLEA(0x20,1) ",%1            \n"
   3379     "sub       $0x20,%2                        \n"
   3380     "jg        1b                              \n"
   3381     "vzeroupper                                \n"
   3382   : "+r"(src_uyvy),  // %0
   3383     "+r"(dst_y),     // %1
   3384     "+r"(width)        // %2
   3385   :
   3386   : "memory", "cc"
   3387     , "xmm0", "xmm1", "xmm5"
   3388   );
   3389 }
        // AVX2 chroma extraction from two vertically-averaged UYVY rows:
        // chroma sits in the even bytes and is masked out (vpand), packed,
        // then split into U and V.  32 pixels per iteration; writes 16 U and
        // 16 V bytes.
   3390 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
   3391                       uint8* dst_u, uint8* dst_v, int width) {
   3392   asm volatile (
   3393     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
   3394     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per 16-bit lane
   3395     "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
   3396 
   3397     LABELALIGN
   3398   "1:                                          \n"
   3399     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
   3400     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
   3401     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
   3402     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
   3403     "lea       " MEMLEA(0x40,0) ",%0           \n"
   3404     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep even (chroma) bytes
   3405     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
   3406     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
   3407     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix lane-wise pack order
   3408     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = U (even bytes)
   3409     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = V (odd bytes)
   3410     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
   3411     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
   3412     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
   3413     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
   3414     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
   3415     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
   3416     "lea      " MEMLEA(0x10,1) ",%1            \n"
   3417     "sub       $0x20,%3                        \n"
   3418     "jg        1b                              \n"
   3419     "vzeroupper                                \n"
   3420   : "+r"(src_uyvy),    // %0
   3421     "+r"(dst_u),       // %1
   3422     "+r"(dst_v),       // %2
   3423     "+r"(width)          // %3
   3424   : "r"((intptr_t)(stride_uyvy))  // %4
   3425   : "memory", "cc", NACL_R14
   3426     "xmm0", "xmm1", "xmm5"
   3427   );
   3428 }
   3429 
        // Single-row (4:2:2) variant of UYVYToUVRow_AVX2 — identical flow
        // without the vpavgb vertical averaging.  32 pixels per iteration.
   3430 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
   3431                          uint8* dst_u, uint8* dst_v, int width) {
   3432   asm volatile (
   3433     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   3434     "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0x00ff per 16-bit lane
   3435     "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
   3436     LABELALIGN
   3437   "1:                                          \n"
   3438     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
   3439     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
   3440     "lea       " MEMLEA(0x40,0) ",%0           \n"
   3441     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep even (chroma) bytes
   3442     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
   3443     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
   3444     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix lane-wise pack order
   3445     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = U, then ymm0 = V below
   3446     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
   3447     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
   3448     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
   3449     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
   3450     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
   3451     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
   3452     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
   3453     "lea      " MEMLEA(0x10,1) ",%1            \n"
   3454     "sub       $0x20,%3                        \n"
   3455     "jg        1b                              \n"
   3456     "vzeroupper                                \n"
   3457   : "+r"(src_uyvy),    // %0
   3458     "+r"(dst_u),       // %1
   3459     "+r"(dst_v),       // %2
   3460     "+r"(width)          // %3
   3461   :
   3462   : "memory", "cc", NACL_R14
   3463     "xmm0", "xmm1", "xmm5"
   3464   );
   3465 }
   3466 #endif  // HAS_YUY2TOYROW_AVX2
   3467 
   3468 #ifdef HAS_ARGBBLENDROW_SSSE3
   3469 // Shuffle table for isolating alpha.
        // pshufb control used by ARGBBlendRow_SSSE3: replicates each pixel's
        // alpha byte (offsets 3, 7, 11, 15) into the low byte of both 16-bit
        // lanes of that pixel; index 0x80 makes pshufb write a zero byte, so
        // each word becomes the pixel's (inverted) alpha as a 16-bit value.
   3470 static uvec8 kShuffleAlpha = {
   3471   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
   3472   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
   3473 };
   3474 
   3475 // Blend 4 pixels at a time, with a 1 pixel remainder loop.
        // Alpha blend src_argb0 over src_argb1:
        //   dst = src0 + src1 * (256 - src0.alpha) / 256
        // with the destination alpha forced to 0xff.  (pxor with the alpha
        // mask gives 255-a; adding xmm7's per-word 1 makes the multiplier
        // 256-a; two pmullw/psrlw passes handle low and high bytes.)
   3476 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
   3477                         uint8* dst_argb, int width) {
   3478   asm volatile (
   3479     "pcmpeqb   %%xmm7,%%xmm7                   \n"
   3480     "psrlw     $0xf,%%xmm7                     \n"  // xmm7 = 0x0001 per word
   3481     "pcmpeqb   %%xmm6,%%xmm6                   \n"
   3482     "psrlw     $0x8,%%xmm6                     \n"  // xmm6 = 0x00ff per word (low-byte mask)
   3483     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   3484     "psllw     $0x8,%%xmm5                     \n"  // xmm5 = 0xff00 per word (high-byte mask)
   3485     "pcmpeqb   %%xmm4,%%xmm4                   \n"
   3486     "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 per pixel (alpha mask)
   3487     "sub       $0x4,%3                         \n"
   3488     "jl        49f                             \n"
   3489
   3490     // 4 pixel loop.
   3491     LABELALIGN
   3492   "40:                                         \n"
   3493     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
   3494     "lea       " MEMLEA(0x10,0) ",%0           \n"
   3495     "movdqa    %%xmm3,%%xmm0                   \n"
   3496     "pxor      %%xmm4,%%xmm3                   \n"  // invert alpha byte: a -> 255-a
   3497     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
   3498     "pshufb    %4,%%xmm3                       \n"  // broadcast 255-a into words (kShuffleAlpha)
   3499     "pand      %%xmm6,%%xmm2                   \n"
   3500     "paddw     %%xmm7,%%xmm3                   \n"  // +1 -> multiplier 256-a
   3501     "pmullw    %%xmm3,%%xmm2                   \n"
   3502     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
   3503     "lea       " MEMLEA(0x10,1) ",%1           \n"
   3504     "psrlw     $0x8,%%xmm1                     \n"
   3505     "por       %%xmm4,%%xmm0                   \n"  // force dst alpha = 0xff
   3506     "pmullw    %%xmm3,%%xmm1                   \n"
   3507     "psrlw     $0x8,%%xmm2                     \n"
   3508     "paddusb   %%xmm2,%%xmm0                   \n"
   3509     "pand      %%xmm5,%%xmm1                   \n"
   3510     "paddusb   %%xmm1,%%xmm0                   \n"
   3511     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
   3512     "lea       " MEMLEA(0x10,2) ",%2           \n"
   3513     "sub       $0x4,%3                         \n"
   3514     "jge       40b                             \n"
   3515
   3516   "49:                                         \n"
   3517     "add       $0x3,%3                         \n"  // handle remaining 1..3 pixels
   3518     "jl        99f                             \n"
   3519
   3520     // 1 pixel loop.
   3521   "91:                                         \n"
   3522     "movd      " MEMACCESS(0) ",%%xmm3         \n"
   3523     "lea       " MEMLEA(0x4,0) ",%0            \n"
   3524     "movdqa    %%xmm3,%%xmm0                   \n"
   3525     "pxor      %%xmm4,%%xmm3                   \n"  // same math as the 4 pixel loop above
   3526     "movd      " MEMACCESS(1) ",%%xmm2         \n"
   3527     "pshufb    %4,%%xmm3                       \n"
   3528     "pand      %%xmm6,%%xmm2                   \n"
   3529     "paddw     %%xmm7,%%xmm3                   \n"
   3530     "pmullw    %%xmm3,%%xmm2                   \n"
   3531     "movd      " MEMACCESS(1) ",%%xmm1         \n"
   3532     "lea       " MEMLEA(0x4,1) ",%1            \n"
   3533     "psrlw     $0x8,%%xmm1                     \n"
   3534     "por       %%xmm4,%%xmm0                   \n"
   3535     "pmullw    %%xmm3,%%xmm1                   \n"
   3536     "psrlw     $0x8,%%xmm2                     \n"
   3537     "paddusb   %%xmm2,%%xmm0                   \n"
   3538     "pand      %%xmm5,%%xmm1                   \n"
   3539     "paddusb   %%xmm1,%%xmm0                   \n"
   3540     "movd      %%xmm0," MEMACCESS(2) "         \n"
   3541     "lea       " MEMLEA(0x4,2) ",%2            \n"
   3542     "sub       $0x1,%3                         \n"
   3543     "jge       91b                             \n"
   3544   "99:                                         \n"
   3545   : "+r"(src_argb0),    // %0
   3546     "+r"(src_argb1),    // %1
   3547     "+r"(dst_argb),     // %2
   3548     "+r"(width)         // %3
   3549   : "m"(kShuffleAlpha)  // %4
   3550   : "memory", "cc"
   3551     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   3552   );
   3553 }
   3554 #endif  // HAS_ARGBBLENDROW_SSSE3
   3555 
   3556 #ifdef HAS_BLENDPLANEROW_SSSE3
   3557 // Blend 8 pixels at a time.
   3558 // unsigned version of math
   3559 // =((A2*C2)+(B2*(255-C2))+255)/256
   3560 // signed version of math
   3561 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
        // Blends two single-channel planes with a per-pixel alpha plane using
        // the signed formulation above so pmaddubsw can do both products at
        // once.  All three input pointers are rebased relative to `alpha` so
        // one advancing index (%2) addresses every array.
   3562 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
   3563                          const uint8* alpha, uint8* dst, int width) {
   3564   asm volatile (
   3565     "pcmpeqb    %%xmm5,%%xmm5                  \n"
   3566     "psllw      $0x8,%%xmm5                    \n"  // xmm5 = 0xff00 per word
   3567     "mov        $0x80808080,%%eax              \n"
   3568     "movd       %%eax,%%xmm6                   \n"
   3569     "pshufd     $0x0,%%xmm6,%%xmm6             \n"  // xmm6 = 0x80 per byte (signed bias)
   3570     "mov        $0x807f807f,%%eax              \n"
   3571     "movd       %%eax,%%xmm7                   \n"
   3572     "pshufd     $0x0,%%xmm7,%%xmm7             \n"  // xmm7 = 32768+127 per word (rounding)
   3573     "sub        %2,%0                          \n"
   3574     "sub        %2,%1                          \n"
   3575     "sub        %2,%3                          \n"  // rebase src0/src1/dst off the alpha index
   3576
   3577     // 8 pixel loop.
   3578     LABELALIGN
   3579   "1:                                          \n"
   3580     "movq       (%2),%%xmm0                    \n"  // 8 alpha bytes
   3581     "punpcklbw  %%xmm0,%%xmm0                  \n"  // duplicate: (a,a) per word
   3582     "pxor       %%xmm5,%%xmm0                  \n"  // invert high byte: (a,255-a)
   3583     "movq       (%0,%2,1),%%xmm1               \n"
   3584     "movq       (%1,%2,1),%%xmm2               \n"
   3585     "punpcklbw  %%xmm2,%%xmm1                  \n"  // interleave (s0,s1) per word
   3586     "psubb      %%xmm6,%%xmm1                  \n"  // bias to signed range
   3587     "pmaddubsw  %%xmm1,%%xmm0                  \n"  // a*(s0-128) + (255-a)*(s1-128)
   3588     "paddw      %%xmm7,%%xmm0                  \n"
   3589     "psrlw      $0x8,%%xmm0                    \n"
   3590     "packuswb   %%xmm0,%%xmm0                  \n"
   3591     "movq       %%xmm0,(%3,%2,1)               \n"
   3592     "lea        0x8(%2),%2                     \n"
   3593     "sub        $0x8,%4                        \n"
   3594     "jg        1b                              \n"
   3595   : "+r"(src0),       // %0
   3596     "+r"(src1),       // %1
   3597     "+r"(alpha),      // %2
   3598     "+r"(dst),        // %3
   3599     "+rm"(width)      // %4
   3600   :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
   3601   );
   3602 }
   3603 #endif  // HAS_BLENDPLANEROW_SSSE3
   3604 
   3605 #ifdef HAS_BLENDPLANEROW_AVX2
   3606 // Blend 32 pixels at a time.
   3607 // unsigned version of math
   3608 // =((A2*C2)+(B2*(255-C2))+255)/256
   3609 // signed version of math
   3610 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
        // AVX2 version of BlendPlaneRow_SSSE3: same signed pmaddubsw math,
        // processing 32 pixels per iteration (low half in ymm0, high half in
        // ymm3).  Pointers are rebased off the alpha index as in the SSSE3
        // version.
   3611 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
   3612                         const uint8* alpha, uint8* dst, int width) {
   3613   asm volatile (
   3614     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   3615     "vpsllw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0xff00 per word
   3616     "mov        $0x80808080,%%eax              \n"
   3617     "vmovd      %%eax,%%xmm6                   \n"
   3618     "vbroadcastss %%xmm6,%%ymm6                \n"  // ymm6 = 0x80 per byte (signed bias)
   3619     "mov        $0x807f807f,%%eax              \n"
   3620     "vmovd      %%eax,%%xmm7                   \n"
   3621     "vbroadcastss %%xmm7,%%ymm7                \n"  // ymm7 = 32768+127 per word (rounding)
   3622     "sub        %2,%0                          \n"
   3623     "sub        %2,%1                          \n"
   3624     "sub        %2,%3                          \n"  // rebase src0/src1/dst off the alpha index
   3625
   3626     // 32 pixel loop.
   3627     LABELALIGN
   3628   "1:                                          \n"
   3629     "vmovdqu    (%2),%%ymm0                    \n"  // 32 alpha bytes
   3630     "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"  // duplicate alphas, high/low halves
   3631     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
   3632     "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"  // words become (a,255-a)
   3633     "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
   3634     "vmovdqu    (%0,%2,1),%%ymm1               \n"
   3635     "vmovdqu    (%1,%2,1),%%ymm2               \n"
   3636     "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"  // interleave (s0,s1) pairs
   3637     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
   3638     "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"  // bias to signed range
   3639     "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
   3640     "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"  // a*(s0-128) + (255-a)*(s1-128)
   3641     "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
   3642     "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
   3643     "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
   3644     "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
   3645     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
   3646     "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"  // repack; unpack/pack lane order cancels
   3647     "vmovdqu    %%ymm0,(%3,%2,1)               \n"
   3648     "lea        0x20(%2),%2                    \n"
   3649     "sub        $0x20,%4                       \n"
   3650     "jg        1b                              \n"
   3651     "vzeroupper                                \n"
   3652   : "+r"(src0),       // %0
   3653     "+r"(src1),       // %1
   3654     "+r"(alpha),      // %2
   3655     "+r"(dst),        // %3
   3656     "+rm"(width)      // %4
   3657   :: "memory", "cc", "eax",
   3658      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   3659   );
   3660 }
   3661 #endif  // HAS_BLENDPLANEROW_AVX2
   3662 
   3663 #ifdef HAS_ARGBATTENUATEROW_SSSE3
   3664 // Shuffle table duplicating alpha
        // pshufb controls for ARGBAttenuateRow_SSSE3: kShuffleAlpha0 handles
        // pixels 0-1 (alpha bytes at offsets 3 and 7), kShuffleAlpha1 pixels
        // 2-3 (offsets 11 and 15).  Each alpha byte is replicated across the
        // six B/G/R byte positions; index 128 makes pshufb write zero, which
        // leaves the alpha word zero so the alpha channel is not attenuated.
   3665 static uvec8 kShuffleAlpha0 = {
   3666   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
   3667 };
   3668 static uvec8 kShuffleAlpha1 = {
   3669   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   3670   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
   3671 };
   3672 // Attenuate 4 pixels at a time.
        // Premultiplies B, G and R by the pixel's alpha (pmulhuw on
        // byte-duplicated operands approximates c * a / 255); the alpha
        // channel is masked out (xmm3) and OR'ed back in unchanged.
   3673 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   3674   asm volatile (
   3675     "pcmpeqb   %%xmm3,%%xmm3                   \n"
   3676     "pslld     $0x18,%%xmm3                    \n"  // xmm3 = 0xff000000 alpha mask
   3677     "movdqa    %3,%%xmm4                       \n"  // alpha shuffle for pixels 0-1
   3678     "movdqa    %4,%%xmm5                       \n"  // alpha shuffle for pixels 2-3
   3679
   3680     // 4 pixel loop.
   3681     LABELALIGN
   3682   "1:                                          \n"
   3683     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   3684     "pshufb    %%xmm4,%%xmm0                   \n"  // xmm0 = broadcast alpha, pixels 0-1
   3685     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
   3686     "punpcklbw %%xmm1,%%xmm1                   \n"  // duplicate color bytes into words
   3687     "pmulhuw   %%xmm1,%%xmm0                   \n"  // high 16 bits of (c*257)*(a per word)
   3688     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
   3689     "pshufb    %%xmm5,%%xmm1                   \n"  // same for pixels 2-3
   3690     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
   3691     "punpckhbw %%xmm2,%%xmm2                   \n"
   3692     "pmulhuw   %%xmm2,%%xmm1                   \n"
   3693     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
   3694     "lea       " MEMLEA(0x10,0) ",%0           \n"
   3695     "pand      %%xmm3,%%xmm2                   \n"  // keep original alpha bytes
   3696     "psrlw     $0x8,%%xmm0                     \n"
   3697     "psrlw     $0x8,%%xmm1                     \n"
   3698     "packuswb  %%xmm1,%%xmm0                   \n"
   3699     "por       %%xmm2,%%xmm0                   \n"  // merge alpha back in
   3700     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   3701     "lea       " MEMLEA(0x10,1) ",%1           \n"
   3702     "sub       $0x4,%2                         \n"
   3703     "jg        1b                              \n"
   3704   : "+r"(src_argb),    // %0
   3705     "+r"(dst_argb),    // %1
   3706     "+r"(width)        // %2
   3707   : "m"(kShuffleAlpha0),  // %3
   3708     "m"(kShuffleAlpha1)  // %4
   3709   : "memory", "cc"
   3710     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   3711   );
   3712 }
   3713 #endif  // HAS_ARGBATTENUATEROW_SSSE3
   3714 
   3715 #ifdef HAS_ARGBATTENUATEROW_AVX2
   3716 // Shuffle table duplicating alpha.
        // vpshufb control for ARGBAttenuateRow_AVX2.  It operates on pixels
        // already expanded to words by vpunpck{l,h}bw, where bytes 6-7 and
        // 14-15 hold each pixel's duplicated alpha: those are copied over the
        // B/G/R words, and index 128 zeroes the alpha word so the alpha
        // channel is not attenuated.
   3717 static const uvec8 kShuffleAlpha_AVX2 = {
   3718   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
   3719 };
   3720 // Attenuate 8 pixels at a time.
        // AVX2 version: unpack bytes to duplicated words, vpshufb broadcasts
        // each pixel's alpha across its B/G/R words, vpmulhuw multiplies, and
        // the original alpha bytes (masked by ymm5) are OR'ed back in.
   3721 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
   3722   asm volatile (
   3723     "vbroadcastf128 %3,%%ymm4                  \n"  // alpha shuffle in both lanes
   3724     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
   3725     "vpslld     $0x18,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff000000 alpha mask
   3726     "sub        %0,%1                          \n"  // %1 = dst - src; store via (%0,%1)
   3727
   3728     // 8 pixel loop.
   3729     LABELALIGN
   3730   "1:                                          \n"
   3731     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
   3732     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"  // duplicate bytes into words
   3733     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
   3734     "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"  // broadcast alpha per pixel
   3735     "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
   3736     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // color * alpha (high 16 bits)
   3737     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
   3738     "vpand      %%ymm5,%%ymm6,%%ymm6           \n"  // keep original alpha bytes
   3739     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
   3740     "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
   3741     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // unpack/pack lane order cancels
   3742     "vpor       %%ymm6,%%ymm0,%%ymm0           \n"  // merge alpha back in
   3743     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
   3744     "lea       " MEMLEA(0x20,0) ",%0           \n"
   3745     "sub        $0x8,%2                        \n"
   3746     "jg        1b                              \n"
   3747     "vzeroupper                                \n"
   3748   : "+r"(src_argb),    // %0
   3749     "+r"(dst_argb),    // %1
   3750     "+r"(width)        // %2
   3751   : "m"(kShuffleAlpha_AVX2)  // %3
   3752   : "memory", "cc"
   3753     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   3754   );
   3755 }
   3756 #endif  // HAS_ARGBATTENUATEROW_AVX2
   3757 
   3758 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
   3759 // Unattenuate 4 pixels at a time.
        // Reverses premultiplied alpha: for each pixel the alpha byte indexes
        // fixed_invtbl8 (a table of 4-byte fixed-point reciprocals, declared
        // elsewhere) and B, G, R are scaled by that reciprocal via pmulhuw.
        // %3 (`alpha`) is a scratch register for the table index.
        // NOTE(review): xmm4/xmm5 appear in the clobber list but are unused —
        // harmless extra clobbers.
   3760 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   3761                              int width) {
   3762   uintptr_t alpha;
   3763   asm volatile (
   3764     // 4 pixel loop.
   3765     LABELALIGN
   3766   "1:                                          \n"
   3767     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   3768     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"  // alpha of pixel 0
   3769     "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate bytes into words
   3770     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
   3771     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"  // alpha of pixel 1
   3772     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
   3773     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // broadcast reciprocal over B/G/R words
   3774     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
   3775     "movlhps   %%xmm3,%%xmm2                   \n"
   3776     "pmulhuw   %%xmm2,%%xmm0                   \n"  // scale pixels 0-1
   3777     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
   3778     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"  // alpha of pixel 2
   3779     "punpckhbw %%xmm1,%%xmm1                   \n"
   3780     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
   3781     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"  // alpha of pixel 3
   3782     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
   3783     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
   3784     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
   3785     "movlhps   %%xmm3,%%xmm2                   \n"
   3786     "pmulhuw   %%xmm2,%%xmm1                   \n"  // scale pixels 2-3
   3787     "lea       " MEMLEA(0x10,0) ",%0           \n"
   3788     "packuswb  %%xmm1,%%xmm0                   \n"
   3789     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   3790     "lea       " MEMLEA(0x10,1) ",%1           \n"
   3791     "sub       $0x4,%2                         \n"
   3792     "jg        1b                              \n"
   3793   : "+r"(src_argb),     // %0
   3794     "+r"(dst_argb),     // %1
   3795     "+r"(width),        // %2
   3796     "=&r"(alpha)        // %3
   3797   : "r"(fixed_invtbl8)  // %4
   3798   : "memory", "cc", NACL_R14
   3799     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   3800   );
   3801 }
   3802 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
   3803 
   3804 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Used by ARGBUnattenuateRow_AVX2 via vpshufb: broadcasts each pixel's 16-bit
// reciprocal-alpha word across its B,G,R word lanes while lanes 6-7 / 14-15
// select the word that multiplies the alpha channel itself.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
   3809 // Unattenuate 8 pixels at a time.
   3810 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
   3811                              int width) {
   3812   uintptr_t alpha;
   3813   asm volatile (
   3814     "sub        %0,%1                          \n"
   3815     "vbroadcastf128 %5,%%ymm5                  \n"
   3816 
   3817     // 8 pixel loop.
   3818     LABELALIGN
   3819   "1:                                          \n"
   3820     // replace VPGATHER
   3821     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
   3822     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
   3823     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
   3824     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
   3825     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
   3826     "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
   3827     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
   3828     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
   3829     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
   3830     "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
   3831     "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
   3832     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
   3833     "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
   3834     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
   3835     "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
   3836     "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
   3837     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
   3838     "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
   3839     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
   3840     "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
   3841     "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
   3842     "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
   3843     "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
   3844     // end of VPGATHER
   3845 
   3846     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
   3847     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
   3848     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
   3849     "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
   3850     "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
   3851     "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
   3852     "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
   3853     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
   3854     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
   3855     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
   3856     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
   3857     "lea       " MEMLEA(0x20,0) ",%0           \n"
   3858     "sub        $0x8,%2                        \n"
   3859     "jg        1b                              \n"
   3860     "vzeroupper                                \n"
   3861   : "+r"(src_argb),      // %0
   3862     "+r"(dst_argb),      // %1
   3863     "+r"(width),         // %2
   3864     "=&r"(alpha)         // %3
   3865   : "r"(fixed_invtbl8),  // %4
   3866     "m"(kUnattenShuffleAlpha_AVX2)  // %5
   3867   : "memory", "cc", NACL_R14
   3868     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   3869   );
   3870 }
   3871 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
   3872 
   3873 #ifdef HAS_ARGBGRAYROW_SSSE3
   3874 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Computes luma Y = (B*15 + G*75 + R*38 + 64) >> 7 per pixel using the
// full-range JPeg coefficients (kARGBToYJ) with kAddYJ64 as the rounding
// term, then writes ARGB output with B=G=R=Y and the source alpha preserved.
// Processes 8 pixels (32 bytes) per loop iteration.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64 (round)
    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // add rounding bias
    "psrlw     $0x7,%%xmm0                     \n"  // >>7: 8 gray values
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Reload source to extract the alpha bytes (top byte of each dword).
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave gray and alpha back into 8 ARGB pixels.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   3918 #endif  // HAS_ARGBGRAYROW_SSSE3
   3919 
   3920 #ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Each group of 4 bytes is in memory order B,G,R,A (alpha weight 0) so the
// tables can feed pmaddubsw directly against packed ARGB pixels.
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
   3936 
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place: reads and writes dst_argb.  Each output channel is a weighted sum
// of the source B,G,R (see the kARGBToSepia* tables above) shifted right 7;
// the source alpha byte is preserved unchanged.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"  // xmm2 = kARGBToSepiaB
    "movdqa    %3,%%xmm3                       \n"  // xmm3 = kARGBToSepiaG
    "movdqa    %4,%%xmm4                       \n"  // xmm4 = kARGBToSepiaR
    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // New blue channel for all 8 pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New green channel.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // interleave B,G
    // New red channel.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Extract original alpha (top byte of each dword) and interleave R,A.
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Interleave BG with RA into final ARGB pixels and store in place.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   3993 #endif  // HAS_ARGBSEPIAROW_SSSE3
   3994 
   3995 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb is 16 signed bytes: four rows of 4 coefficients (B,G,R,A order)
// broadcast via pshufd; each output channel = dot(src BGRA, row) >> 6 with
// signed saturation (phaddsw/psraw $6 -- note sepia uses >>7).  Unlike sepia,
// the alpha channel is also computed through the matrix rather than copied.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"  // row 0 (new B)
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"  // row 1 (new G)
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"  // row 2 (new R)
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"  // row 3 (new A)

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"  // 8 new B values
    "phaddsw   %%xmm1,%%xmm6                   \n"  // 8 new G values
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"  // interleave B,G
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"  // 8 new R values
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"  // 8 new A values
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"  // interleave R,A
    // Interleave BG with RA into final ARGB pixels.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4057 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
   4058 
   4059 #ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// In-place posterize: each color channel becomes
//   ((v * scale) >> 16) * interval_size + interval_offset
// (pmulhuw supplies the high 16 bits of v*scale).  The alpha byte of each
// pixel is preserved via the 0xFF000000 mask built in xmm6.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    // Broadcast the three scalar parameters across all 8 word lanes.
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte widening
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"  // alpha mask 0xFF000000

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"  // keep original alpha
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x4,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4107 #endif  // HAS_ARGBQUANTIZEROW_SSE2
   4108 
   4109 #ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Multiplies each channel (including alpha) of every pixel by the matching
// channel of the packed ARGB 'value': bytes are duplicated (punpcklbw self),
// multiplied with pmulhuw, then shifted right 8 -- effectively
// (channel * value_channel) / 256 with the duplication providing rounding.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    // Broadcast 'value' (duplicated bytes) across the whole register.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
   4143 #endif  // HAS_ARGBSHADEROW_SSE2
   4144 
   4145 #ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per channel: src0 is widened by duplicating each byte (~v*257) while src1
// is zero-extended; pmulhuw then yields approximately (a*b)/255 per channel.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                  \n"  // zero for byte widening

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    // movdqu used as a register-to-register copy (movdqa would also work).
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // src0 bytes duplicated
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"  // src1 zero-extended
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   4181 #endif  // HAS_ARGBMULTIPLYROW_SSE2
   4182 
   4183 #ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 version of ARGBMultiplyRow_SSE2 (same duplicate/zero-extend +
// vpmulhuw scheme, approximating (a*b)/255 per channel).
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // zero for byte widening

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"  // src0 bytes duplicated
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // src1 zero-extended
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
// NOTE(review): xmm clobbers only declared when built with AVX2 enabled;
// presumably older assemblers reject ymm clobber names -- confirm intent.
#if defined(__AVX2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
   4220 #endif  // HAS_ARGBMULTIPLYROW_AVX2
   4221 
   4222 #ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturating add (paddusb): channels clamp at 255.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   4248 #endif  // HAS_ARGBADDROW_SSE2
   4249 
   4250 #ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time (AVX2).
// Per-byte saturating add (vpaddusb); loop consumes 0x20 bytes = 8 ARGB
// pixels per pass (the old "4 pixels" comment was stale).
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
   4276 #endif  // HAS_ARGBADDROW_AVX2
   4277 
   4278 #ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per-byte saturating subtract (psubusb): channels clamp at 0.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
   4304 #endif  // HAS_ARGBSUBTRACTROW_SSE2
   4305 
   4306 #ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// AVX2 per-byte saturating subtract (vpsubusb): channels clamp at 0.
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop (0x20 bytes per pass; old "4 pixel" comment was stale).
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
   4332 #endif  // HAS_ARGBSUBTRACTROW_AVX2
   4333 
   4334 #ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// For 8 output pixels: computes |(r0[i]-r0[i+2]) + 2*(r1[i]-r1[i+2]) +
// (r2[i]-r2[i+2])| per pixel (absolute value via negate + pmaxsw), clamped
// to bytes with packuswb.  NOTE(review): reads 2 bytes beyond the last
// output pixel in each row -- caller presumably guarantees that margin.
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    // Convert row/dst pointers to offsets from src_y0 so a single pointer
    // increment advances all four.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte widening

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // row0 left - right
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // row1 left - right
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // row2 left - right
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"  // middle row added twice
    "paddw     %%xmm1,%%xmm0                   \n"  // ... for weight 2
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs = max(x, -x)
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   4386 #endif  // HAS_SOBELXROW_SSE2
   4387 
   4388 #ifdef HAS_SOBELYROW_SSE2
   4389 // SobelY as a matrix is
   4390 // -1 -2 -1
   4391 //  0  0  0
   4392 //  1  2  1
   4393 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
   4394                     uint8* dst_sobely, int width) {
   4395   asm volatile (
   4396     "sub       %0,%1                           \n"
   4397     "sub       %0,%2                           \n"
   4398     "pxor      %%xmm5,%%xmm5                   \n"
   4399 
   4400     // 8 pixel loop.
   4401     LABELALIGN
   4402   "1:                                          \n"
   4403     "movq      " MEMACCESS(0) ",%%xmm0         \n"
   4404     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
   4405     "punpcklbw %%xmm5,%%xmm0                   \n"
   4406     "punpcklbw %%xmm5,%%xmm1                   \n"
   4407     "psubw     %%xmm1,%%xmm0                   \n"
   4408     "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
   4409     MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
   4410     "punpcklbw %%xmm5,%%xmm1                   \n"
   4411     "punpcklbw %%xmm5,%%xmm2                   \n"
   4412     "psubw     %%xmm2,%%xmm1                   \n"
   4413     "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
   4414     MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
   4415     "punpcklbw %%xmm5,%%xmm2                   \n"
   4416     "punpcklbw %%xmm5,%%xmm3                   \n"
   4417     "psubw     %%xmm3,%%xmm2                   \n"
   4418     "paddw     %%xmm2,%%xmm0                   \n"
   4419     "paddw     %%xmm1,%%xmm0                   \n"
   4420     "paddw     %%xmm1,%%xmm0                   \n"
   4421     "pxor      %%xmm1,%%xmm1                   \n"
   4422     "psubw     %%xmm0,%%xmm1                   \n"
   4423     "pmaxsw    %%xmm1,%%xmm0                   \n"
   4424     "packuswb  %%xmm0,%%xmm0                   \n"
   4425     MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
   4426     "lea       " MEMLEA(0x8,0) ",%0            \n"
   4427     "sub       $0x8,%3                         \n"
   4428     "jg        1b                              \n"
   4429   : "+r"(src_y0),      // %0
   4430     "+r"(src_y1),      // %1
   4431     "+r"(dst_sobely),  // %2
   4432     "+r"(width)        // %3
   4433   :
   4434   : "memory", "cc", NACL_R14
   4435     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   4436   );
   4437 }
   4438 #endif  // HAS_SOBELYROW_SSE2
   4439 
   4440 #ifdef HAS_SOBELROW_SSE2
   4441 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
   4442 // A = 255
   4443 // R = Sobel
   4444 // G = Sobel
   4445 // B = Sobel
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    // Make src_sobely an offset relative to src_sobelx so one pointer
    // increment (%0) advances both sources.
    "sub       %0,%1                           \n"
    // xmm5 = 0xff000000 per dword: alpha channel mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // Saturated sobelx + sobely.
    // Replicate each sobel byte into B, G and R via two rounds of
    // unpacking, then OR in alpha, turning 16 gray bytes into 64 ARGB bytes.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
   4489 #endif  // HAS_SOBELROW_SSE2
   4490 
   4491 #ifdef HAS_SOBELTOPLANEROW_SSE2
   4492 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
   4493 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
   4494                           uint8* dst_y, int width) {
   4495   asm volatile (
   4496     "sub       %0,%1                           \n"
   4497     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   4498     "pslld     $0x18,%%xmm5                    \n"
   4499 
   4500     // 8 pixel loop.
   4501     LABELALIGN
   4502   "1:                                          \n"
   4503     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   4504     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
   4505     "lea       " MEMLEA(0x10,0) ",%0           \n"
   4506     "paddusb   %%xmm1,%%xmm0                   \n"
   4507     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
   4508     "lea       " MEMLEA(0x10,2) ",%2           \n"
   4509     "sub       $0x10,%3                        \n"
   4510     "jg        1b                              \n"
   4511   : "+r"(src_sobelx),  // %0
   4512     "+r"(src_sobely),  // %1
   4513     "+r"(dst_y),       // %2
   4514     "+r"(width)        // %3
   4515   :
   4516   : "memory", "cc", NACL_R14
   4517     "xmm0", "xmm1"
   4518   );
   4519 }
   4520 #endif  // HAS_SOBELTOPLANEROW_SSE2
   4521 
   4522 #ifdef HAS_SOBELXYROW_SSE2
   4523 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
   4524 // A = 255
   4525 // R = Sobel X
   4526 // G = Sobel
   4527 // B = Sobel Y
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    // Make src_sobely an offset relative to src_sobelx so one pointer
    // increment (%0) advances both sources.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff bytes (alpha).

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // xmm0 = sobelx.
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"  // xmm2 = saturated x + y.
    // Interleave (sobelx, alpha) and (sobely, sobel) byte pairs, then
    // interleave the pairs into B=sobely G=sobel R=sobelx A=255 pixels.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "punpckhbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4570 #endif  // HAS_SOBELXYROW_SSE2
   4571 
   4572 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
   4573 // Creates a table of cumulative sums where each value is a sum of all values
   4574 // above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // xmm0 = running per-channel row sum (4 x int32); xmm1 = zero.
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    // Take the unrolled 4 pixel loop only when cumsum is 16-byte aligned.
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

  // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    // Expand 16 bytes (4 ARGB pixels) to 4 vectors of int32, accumulate
    // into the row sum, and add the row above (previous_cumsum).
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // Undo bias; remainder loop.
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   4650 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
   4651 
   4652 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Computes box averages from a summed-area table: each output byte is
// (topleft - topright - botleft + botright) / area, per channel.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    // xmm4 = approximate reciprocal of area, broadcast to all 4 lanes.
    "movd      %5,%%xmm5                       \n"
    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
    "rcpss     %%xmm5,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    // Areas > 128 take the float path below; the 16-bit fixed-point fast
    // path would lose precision for large sums.
    "cmpl      $0x80,%5                        \n"
    "ja        40f                             \n"

    // Build xmm5 = 16-bit multiplier for pmulhuw so that
    // (sum * xmm5) >> 16 ~= sum / area.
    // NOTE(review): computed as (area + 65535) * rcp(area) ~= 65536 / area;
    // confirm the intended rounding.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrld     $0x10,%%xmm6                    \n"
    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
    "addps     %%xmm6,%%xmm5                   \n"
    "mulps     %%xmm4,%%xmm5                   \n"
    "cvtps2dq  %%xmm5,%%xmm5                   \n"
    "packssdw  %%xmm5,%%xmm5                   \n"

  // 4 pixel small loop (fixed point).
    LABELALIGN
  "4:                                         \n"
    // xmm0..3 = tl - tr - bl + br for 4 pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm5,%%xmm0                   \n"  // Divide by area (fixed pt).
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       4b                              \n"
    "jmp       49f                             \n"

  // 4 pixel loop (float path for large areas).
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // Multiply by ~1/area.
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // Undo bias; remainder loop.
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"((intptr_t)(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
   4781 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
   4782 
   4783 #ifdef HAS_ARGBAFFINEROW_SSE2
   4784 // Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = start (x, y).
    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = (du, dv) step.
    // Pack (4, stride) into one dword so pmaddwd on (x, y) shorts yields the
    // byte offset x * 4 + y * stride in a single instruction.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    // Set up 4 concurrent (x, y) coordinates and a 4-pixel (du, dv) step.
    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"

  // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1                   \n"
    "addps     %%xmm4,%%xmm2                   \n"
    "movq      %%xmm1," MEMACCESS(2) "         \n"
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%4                         \n"  // Undo bias; remainder loop.
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"
    "movd      %%xmm0,%k1                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x04,2) ",%2           \n"
    "sub       $0x1,%4                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),    // %4
    "=&r"(temp)      // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
   4867 #endif  // HAS_ARGBAFFINEROW_SSE2
   4868 
   4869 #ifdef HAS_INTERPOLATEROW_SSSE3
   4870 // Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    // Make dst_ptr an offset relative to src_ptr so one pointer increment
    // (%1) advances both.
    "sub       %1,%0                           \n"
    // Special-case fraction 0 (pure copy) and 128 (50/50 average).
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"

    // xmm5 = byte pairs (256 - fraction, fraction) replicated, used as the
    // pmaddubsw coefficient operand.  xmm4 = 0x80 bias bytes: subtracting it
    // makes the pixel bytes signed for pmaddubsw, and adding it back as
    // 0x8080 words restores the unsigned result.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x100,%3                       \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x80808080,%%eax               \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklbw  %%xmm2,%%xmm0                  \n"
    "punpckhbw  %%xmm2,%%xmm1                  \n"
    "psubb      %%xmm4,%%xmm0                  \n"
    "psubb      %%xmm4,%%xmm1                  \n"
    "movdqa     %%xmm5,%%xmm2                  \n"
    "movdqa     %%xmm5,%%xmm3                  \n"
    "pmaddubsw  %%xmm0,%%xmm2                  \n"
    "pmaddubsw  %%xmm1,%%xmm3                  \n"
    "paddw      %%xmm4,%%xmm2                  \n"
    "paddw      %%xmm4,%%xmm3                  \n"
    "psrlw      $0x8,%%xmm2                    \n"
    "psrlw      $0x8,%%xmm3                    \n"
    "packuswb   %%xmm3,%%xmm2                  \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+rm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
   4948 #endif  // HAS_INTERPOLATEROW_SSSE3
   4949 
   4950 #ifdef HAS_INTERPOLATEROW_AVX2
   4951 // Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    // Fraction 0 copies via rep movsb, which needs absolute rsi/rdi/rcx --
    // hence the "+D"/"+S"/"+cm" constraints below -- so test it before
    // rebasing dst as an offset from src.
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "sub       %1,%0                           \n"
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"

    // ymm5 = byte pairs (256 - fraction, fraction) for vpmaddubsw;
    // ymm4 = 0x80 bias bytes (see SSSE3 version for the signed-byte trick).
    "vmovd      %3,%%xmm0                      \n"
    "neg        %3                             \n"
    "add        $0x100,%3                      \n"
    "vmovd      %3,%%xmm5                      \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
    "vbroadcastss %%xmm5,%%ymm5                \n"
    "mov        $0x80808080,%%eax              \n"
    "vmovd      %%eax,%%xmm4                   \n"
    "vbroadcastss %%xmm4,%%ymm4                \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "rep movsb " MEMMOVESTRING(1,0) "          \n"
    "jmp       999f                            \n"

  "99:                                         \n"
    "vzeroupper                                \n"  // Avoid AVX-SSE penalty.
  "999:                                        \n"
  : "+D"(dst_ptr),    // %0
    "+S"(src_ptr),    // %1
    "+cm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
   5024 #endif  // HAS_INTERPOLATEROW_AVX2
   5025 
   5026 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
   5027 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // xmm5 = pshufb control.

    // 8 pixel (32 byte) loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
   5052 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
   5053 
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 variant: the 16-byte 'shuffler' mask is broadcast to both 128-bit
// lanes, so each vpshufb reorders 4 pixels per lane.  Processes 16 pixels
// (64 bytes) per loop iteration; assumes width is a positive multiple of
// 16 -- TODO(review): confirm caller rounds width accordingly.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  asm volatile (
    // Duplicate the 16-byte control mask into both halves of ymm5.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
    LABELALIGN
  "1:                                          \n"
    // Load 2x32 bytes (16 pixels), shuffle per-lane, store 64 bytes.
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    // Avoid AVX->SSE transition penalties in subsequent SSE code.
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2
   5082 
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback (no pshufb available): reads the first 4 bytes of the
// shuffler and dispatches to one of four SIMD fast paths when the byte
// order matches a known pattern (compared as a little-endian dword, e.g.
// 0x3000102 == bytes {2,1,0,3}); any other mask falls through to a
// generic 1-pixel-at-a-time scalar loop.  Fast paths handle 4 pixels per
// iteration; the scalar loop handles any width >= 1.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  // Byte-sized scratch register; constrained to edx ("=&d") so %b2 (dl)
  // is valid on both x86 and x64.
  uintptr_t pixel_temp;
  asm volatile (
    // xmm5 = 0, used to widen bytes to words for the pshuflw/pshufhw paths.
    "pxor      %%xmm5,%%xmm5                   \n"
    // Dispatch on the 4-byte shuffle pattern.
    "mov       " MEMACCESS(4) ",%k2            \n"
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    // Generic path: for each of the 4 output bytes, read its source index
    // from the shuffler, then copy src_argb[index] to the destination.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Fast paths below: widen bytes to 16-bit words (punpcklbw/punpckhbw
    // against zero), reorder the 4 words of each pixel with pshuflw and
    // pshufhw using an immediate that encodes the byte order named by the
    // label, then repack with packuswb.  4 pixels per iteration.
    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "=&d"(pixel_temp),  // %2
    "+r"(width)         // %3
  : "r"(shuffler)       // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2
   5202 
#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleaves planar I422 (full-res Y, half-res U and V) into packed
// YUY2 (byte order Y0 U0 Y1 V0 per macropixel).  Processes 16 Y values
// (8 U/V pairs, 32 output bytes) per loop iteration; assumes width is a
// positive multiple of 16 -- TODO(review): confirm.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    // Turn src_v into (src_v - src_u) so one index register (%1) addresses
    // both planes: V is loaded via (%1,%2,1).
    "sub       %1,%2                             \n"
    LABELALIGN
  "1:                                            \n"
    // Interleave 8 U with 8 V bytes -> xmm2 = U0 V0 U1 V1 ...
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"
    // Interleave 16 Y bytes with the UV stream: Y first -> YUY2 order.
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm1                     \n"
    "punpcklbw %%xmm2,%%xmm0                     \n"
    "punpckhbw %%xmm2,%%xmm1                     \n"
    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2
   5237 
#ifdef HAS_I422TOUYVYROW_SSE2
// Interleaves planar I422 into packed UYVY (byte order U0 Y0 V0 Y1 per
// macropixel) -- same structure as I422ToYUY2Row_SSE2 but with the UV
// stream unpacked first so chroma leads each pair.  Processes 16 Y values
// (32 output bytes) per iteration; assumes width is a positive multiple
// of 16 -- TODO(review): confirm.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    // src_v becomes an offset relative to src_u (single index register).
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    // xmm2 = interleaved U0 V0 U1 V1 ...
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"
    // Unpack UV with Y as the high byte of each pair -> UYVY order.
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "movdqa    %%xmm2,%%xmm1                     \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "punpcklbw %%xmm0,%%xmm1                     \n"
    "punpckhbw %%xmm0,%%xmm2                     \n"
    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2
   5272 
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a per-channel cubic polynomial to each ARGB byte:
//   out = saturate(C0 + C1*x + C2*x^2 + C3*x^3)
// where 'poly' points to 4 vectors of 4 floats: C0 at +0x00, C1 at +0x10,
// C2 at +0x20, C3 at +0x30 (one coefficient per channel).  Processes
// 2 pixels (8 channels) per iteration; assumes width is a positive
// multiple of 2 -- TODO(review): confirm.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    // xmm3 = 0, used to widen bytes -> words -> dwords.
    "pxor      %%xmm3,%%xmm3                   \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Load 2 pixels and convert the 8 bytes to 8 floats (xmm0/xmm4).
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep x in xmm1/xmm5; compute C0 + C1*x in xmm0/xmm4.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // xmm2/xmm6 = x^2, xmm1/xmm5 = x^3.
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    // Accumulate C2*x^2 and C3*x^3.
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    // Truncate to ints and pack back to 8 saturated bytes.
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x2,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
   5328 
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 variant of ARGBPolynomialRow: same per-channel cubic
// out = C0 + C1*x + C2*x^2 + C3*x^3, with each 4-float coefficient
// vector broadcast to both lanes so 2 pixels (8 channels) are evaluated
// per iteration.  NOTE(review): the vfmadd* instructions require FMA3 in
// addition to AVX2 -- presumably HAS_ARGBPOLYNOMIALROW_AVX2 is only
// defined when both are available; verify against the feature detection.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    // ymm4..ymm7 = C0..C3, each 16-byte vector duplicated per lane.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    // Truncate to ints, then narrow dwords->words->bytes with saturation.
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "sub         $0x2,%2                       \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
   5368 
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Tranform ARGB pixels with color table.
// In-place remap of all 4 channels of each pixel.  The lookup address is
// (table_argb + value*4 + channel), i.e. the table is laid out as four
// interleaved 256-entry per-channel tables.  Scalar, 1 pixel per loop.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  // Byte scratch; "=&d" forces edx so %b1 (dl) is encodable everywhere.
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Advance the pointer first, then address the current pixel's 4 bytes
    // with negative offsets (-0x4..-0x1): read byte, look up, write back.
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
   5400 
#ifdef HAS_RGBCOLORTABLEROW_X86
// Tranform RGB pixels with color table.
// Same interleaved-table lookup as ARGBColorTableRow_X86, but only the
// first three channels (offsets -0x4..-0x2) are remapped; the fourth
// (alpha) byte of each pixel is left untouched.  Scalar, 1 pixel per loop.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  // Byte scratch; "=&d" forces edx so %b1 (dl) is encodable everywhere.
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
   5428 
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Tranform RGB pixels with luma table.
// For each pixel: compute a weighted luma from its bytes (pmaddubsw with
// the 4 packed weights in 'lumacoeff', then phaddw), quantize it by
// keeping only the high byte of the 16-bit sum (pand with 0xFF00 words),
// and use that value as a byte offset into 'luma' to select a
// luma-dependent lookup table row.  B, G and R are then remapped through
// that row; the alpha byte is copied unchanged.  4 pixels per iteration;
// assumes width is a positive multiple of 4 -- TODO(review): confirm.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp;  // byte scratch in edx ("=&d"), %b0 = dl
  uintptr_t table_temp;  // current table-row pointer in eax ("=&a")
  asm volatile (
    // xmm3 = lumacoeff broadcast to all 4 dwords.
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    // xmm4 = 0xFF00 in every 16-bit word (luma quantization mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0x8,%%xmm4                     \n"
    // xmm5 = 0 for widening words to dwords.
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // xmm0 = per-pixel table offsets (quantized luma) as 4 dwords.
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "phaddw    %%xmm0,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    // Rotate dwords so the next pixel's offset moves into lane 0.
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 0: remap B, G, R through the selected row; copy alpha as-is.
    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 1.
    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 2.
    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    // Pixel 3.
    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
   5528 
   5529 #endif  // defined(__x86_64__) || defined(__i386__)
   5530 
   5531 #ifdef __cplusplus
   5532 }  // extern "C"
   5533 }  // namespace libyuv
   5534 #endif
   5535