/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filter: adds 2 before the >>2.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                         6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                         6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0,         0};
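
// The kScale* tables above encode reciprocals in 0.16 fixed point so that
// pmulhuw (which keeps the high 16 bits of a 16x16 multiply) acts as an
// integer divide. A minimal scalar sketch of the trick, assuming sums small
// enough that sum * scale fits in 32 bits (true here: at most nine 8-bit
// pixels are summed):
//
//   static uint8 FixedPointDiv9(uint16 sum) {
//     // (sum * (65536 / 9)) >> 16 ~= sum / 9, matching pmulhuw against
//     // kScaleAc33; the same pattern with 65536 / 6, / 3 or / 2 divides
//     // by those factors.
//     return (uint8)((sum * (65536 / 9)) >> 16);
//   }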

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
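
// What the kernel above computes, as a scalar C sketch (illustrative only,
// not part of the build): psrlw $0x8 plus packuswb keep the high byte of
// each 16-bit pair, i.e. the odd-indexed source pixel.
//
//   static void ScaleRowDown2_Sketch(const uint8* src, uint8* dst, int w) {
//     int i;
//     for (i = 0; i < w; ++i) {
//       dst[i] = src[2 * i + 1];  // point-sample the odd pixel
//     }
//   }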

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
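
// Scalar sketch of the linear (horizontal-only) kernel above: pmaddubsw
// with byte weights {1,1} forms src[2i] + src[2i+1], and pavgw against
// zero adds 1 and shifts right, giving a round-half-up average.
//
//   static void ScaleRowDown2Linear_Sketch(const uint8* src, uint8* dst,
//                                          int w) {
//     int i;
//     for (i = 0; i < w; ++i) {
//       dst[i] = (uint8)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
//     }
//   }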

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "psrlw      $0x1,%%xmm0                    \n"
    "psrlw      $0x1,%%xmm1                    \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
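
// Scalar sketch of the 2x2 box kernel above: two rows of horizontal pair
// sums are added, then halved twice with rounding on the second halving,
// which matches (a + b + c + d + 2) >> 2 up to the split rounding steps.
//
//   static void ScaleRowDown2Box_Sketch(const uint8* src, ptrdiff_t stride,
//                                       uint8* dst, int w) {
//     const uint8* s = src;
//     const uint8* t = src + stride;
//     int i;
//     for (i = 0; i < w; ++i) {
//       dst[i] = (uint8)((s[2 * i] + s[2 * i + 1] +
//                         t[2 * i] + t[2 * i + 1] + 2) >> 2);
//     }
//   }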

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
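
// Scalar sketch of the kernel above: the mask built in xmm5 is 0x00ff0000
// per dword, so only byte 2 of every 4 survives the packs, i.e. a point
// sample of every fourth pixel.
//
//   static void ScaleRowDown4_Sketch(const uint8* src, uint8* dst, int w) {
//     int i;
//     for (i = 0; i < w; ++i) {
//       dst[i] = src[4 * i + 2];
//     }
//   }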

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                  \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "psllw      $0x3,%%xmm5                    \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "phaddw     %%xmm1,%%xmm0                  \n"
    "paddw      %%xmm5,%%xmm0                  \n"
    "psrlw      $0x4,%%xmm0                    \n"
    "packuswb   %%xmm0,%%xmm0                  \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "=&r"(stridex3)    // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
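
// Scalar sketch of the 4x4 box kernel above: four rows of pair sums are
// accumulated, phaddw folds adjacent pairs into sums of 16 pixels, and
// the +8 (xmm5) followed by >>4 gives a rounded divide by 16.
//
//   static void ScaleRowDown4Box_Sketch(const uint8* src, ptrdiff_t stride,
//                                       uint8* dst, int w) {
//     int i, r, c;
//     for (i = 0; i < w; ++i) {
//       int sum = 8;  // rounding term
//       for (r = 0; r < 4; ++r) {
//         for (c = 0; c < 4; ++c) {
//           sum += src[r * stride + 4 * i + c];
//         }
//       }
//       dst[i] = (uint8)(sum >> 4);
//     }
//   }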

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(src_stride * 3))   // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa    %0,%%xmm3                       \n"
      "movdqa    %1,%%xmm4                       \n"
      "movdqa    %2,%%xmm5                       \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
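
// Scalar sketch of the point-sampled 3/4 kernel above: the kShuf* tables
// drop index 2 of every group of 4 bytes, so 32 source pixels become 24.
//
//   static void ScaleRowDown34_Sketch(const uint8* src, uint8* dst, int w) {
//     int i;
//     for (i = 0; i < w; i += 3) {
//       dst[i + 0] = src[0];
//       dst[i + 1] = src[1];
//       dst[i + 2] = src[3];
//       src += 4;
//     }
//   }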

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"  // kShuf01
      "movdqa    %1,%%xmm3                       \n"  // kShuf11
      "movdqa    %2,%%xmm4                       \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
      );
  asm volatile(
      "movdqa    %0,%%xmm5                       \n"  // kMadd01
      "movdqa    %1,%%xmm0                       \n"  // kMadd11
      "movdqa    %2,%%xmm1                       \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"  // kShuf01
      "movdqa    %1,%%xmm3                       \n"  // kShuf11
      "movdqa    %2,%%xmm4                       \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
      );
  asm volatile(
      "movdqa    %0,%%xmm5                       \n"  // kMadd01
      "movdqa    %1,%%xmm0                       \n"  // kMadd11
      "movdqa    %2,%%xmm1                       \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
      );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
    : "+r"(src_ptr),   // %0
      "+r"(dst_ptr),   // %1
      "+r"(dst_width)  // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "m"(kMadd21)     // %4
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
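
// Scalar sketch of the horizontal 3/4 filter shared by the two Box
// kernels above (kMadd01/kMadd11/kMadd21 weights, kRound34, then >>2).
// The _1_Box variant first blends the two source rows 1:1 with a single
// pavgb; the _0_Box variant blends them 3:1 with two pavgb steps.
//
//   static void ScaleRowDown34_Filter_Sketch(const uint8* s, uint8* dst,
//                                            int w) {
//     int i;
//     for (i = 0; i < w; i += 3) {
//       dst[i + 0] = (uint8)((s[0] * 3 + s[1] * 1 + 2) >> 2);
//       dst[i + 1] = (uint8)((s[1] * 2 + s[2] * 2 + 2) >> 2);
//       dst[i + 2] = (uint8)((s[2] * 1 + s[3] * 3 + 2) >> 2);
//       s += 4;
//     }
//   }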

void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
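
// Scalar sketch of the point-sampled 3/8 kernel above: kShuf38a/kShuf38b
// gather bytes 0, 3 and 6 of every 8, so 32 source pixels become 12.
//
//   static void ScaleRowDown38_Sketch(const uint8* src, uint8* dst, int w) {
//     int i;
//     for (i = 0; i < w; i += 3) {
//       dst[i + 0] = src[0];
//       dst[i + 1] = src[3];
//       dst[i + 2] = src[6];
//       src += 8;
//     }
//   }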

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"
      "movdqa    %1,%%xmm3                       \n"
      "movdqa    %2,%%xmm4                       \n"
      "movdqa    %3,%%xmm5                       \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"
      "movdqa    %1,%%xmm3                       \n"
      "movdqa    %2,%%xmm4                       \n"
      "pxor      %%xmm5,%%xmm5                   \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
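
// Scalar sketch of the row accumulator above: each 8-bit source pixel is
// widened and added to the 16-bit accumulator row with unsigned
// saturation (paddusw).
//
//   static void ScaleAddRow_Sketch(const uint8* src, uint16* dst, int w) {
//     int i;
//     for (i = 0; i < w; ++i) {
//       uint32 sum = dst[i] + src[i];
//       dst[i] = (uint16)(sum > 65535 ? 65535 : sum);  // saturate
//     }
//   }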

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                           const uint8* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd      %6,%%xmm2                       \n"
    "movd      %7,%%xmm3                       \n"
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $15,%%xmm7                      \n"  // 0x00010001

    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127 ) + 1
    "paddusb   %%xmm7,%%xmm1                   \n"
    "pmaddubsw %%xmm0,%%xmm1                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "subl      $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm2                   \n"
    "paddusb   %%xmm7,%%xmm2                   \n"
    "pmaddubsw %%xmm0,%%xmm2                   \n"
    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movd      %%xmm2,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)     // %5
#endif
  : "rm"(x),            // %6
    "rm"(dx),           // %7
#if defined(__x86_64__)
    "x"(kFsub80),       // %8
    "x"(kFadd40)        // %9
#else
    "m"(kFsub80),       // %8
    "m"(kFadd40)        // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
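
// Scalar sketch of the bilinear column filter above: x and dx are 16.16
// fixed point, the kernel keeps the top 7 bits of the fraction, and the
// signed/unsigned dance (kFsub80/kFadd40) nets out to a rounded 7-bit
// blend of each pixel pair.
//
//   static void ScaleFilterCols_Sketch(uint8* dst, const uint8* src,
//                                      int w, int x, int dx) {
//     int i;
//     for (i = 0; i < w; ++i) {
//       int xi = x >> 16;          // integer source position
//       int f = (x >> 9) & 0x7f;   // top 7 bits of the fraction
//       dst[i] = (uint8)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
//       x += dx;
//     }
//   }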

// Reads 16 pixels, duplicates them and writes 32 pixels.
void ScaleColsUp2_SSE2(uint8* dst_ptr,
                       const uint8* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
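
// Scalar sketch of the 2x upsample above: punpcklbw/punpckhbw of the
// source register with itself duplicate each byte.
//
//   static void ScaleColsUp2_Sketch(uint8* dst, const uint8* src, int w) {
//     int i;
//     for (i = 0; i < w / 2; ++i) {
//       dst[2 * i] = dst[2 * i + 1] = src[i];
//     }
//   }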

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2"
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb,
                               int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
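
// Scalar sketch of the ARGB /2 family above, treating each ARGB pixel as
// four bytes: Down2 point-samples the odd pixel (shufps $0xdd picks dwords
// 1 and 3), Linear averages horizontal pairs per channel, and Box (shown
// here) averages a 2x2 block per channel, matching the cascaded pavgb
// rounding up to the split rounding steps.
//
//   static void ScaleARGBRowDown2Box_Sketch(const uint8* src,
//                                           ptrdiff_t stride, uint8* dst,
//                                           int w) {
//     int i, c;
//     for (i = 0; i < w; ++i) {
//       for (c = 0; c < 4; ++c) {  // B, G, R, A
//         const uint8* s = src + 8 * i + c;
//         dst[4 * i + c] =
//             (uint8)((s[0] + s[4] + s[stride] + s[stride + 4] + 2) >> 2);
//       }
//     }
//   }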

// Reads 4 pixels at a time.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
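
// Scalar sketch of the even-pixel subsample above: one ARGB pixel
// (4 bytes) is copied from every src_stepx-th source position.
//
//   static void ScaleARGBRowDownEven_Sketch(const uint8* src, int stepx,
//                                           uint8* dst, int w) {
//     int i, c;
//     for (i = 0; i < w; ++i) {
//       for (c = 0; c < 4; ++c) {
//         dst[4 * i + c] = src[4 * i * stepx + c];
//       }
//     }
//   }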

// Blends four 2x2 boxes to produce 4 pixels at a time.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
   1150 
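// A scalar sketch of the 2x2 box average above: each destination pixel
// averages two adjacent source columns and two rows src_stride apart,
// stepping src_stepx pixels between blocks. The rounding here is a single
// (a + b + c + d + 2) >> 2, which can differ by one from the chained pavgb
// averages in the SSE2 loop. Names are illustrative, not part of this file.
static void ScaleARGBRowDownEvenBoxRef_C(const uint8* src_argb,
                                         ptrdiff_t src_stride,
                                         int src_stepx,
                                         uint8* dst_argb,
                                         int dst_width) {
  int i, c;
  for (i = 0; i < dst_width; ++i) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A channels
      dst_argb[c] = (uint8)((src_argb[c] + src_argb[c + 4] +
                             src_argb[c + src_stride] +
                             src_argb[c + src_stride + 4] + 2) >> 2);
    }
    src_argb += src_stepx * 4;  // advance src_stepx ARGB pixels
    dst_argb += 4;
  }
}
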
void ScaleARGBCols_SSE2(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "=&a"(x0),         // %0
    "=&d"(x1),         // %1
    "+r"(dst_argb),    // %2
    "+r"(src_argb),    // %3
    "+r"(dst_width)    // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}

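// A scalar sketch of the point sampling above: x and dx are 16.16 fixed
// point coordinates, so each destination pixel copies source pixel x >> 16.
// The name is illustrative; libyuv's portable fallback is ScaleARGBCols_C
// in scale_common.cc.
static void ScaleARGBColsRef_C(uint8* dst_argb,
                               const uint8* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the 16.16 coordinate
    x += dx;
  }
}
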
// Reads 4 ARGB pixels, duplicates each and writes 8 pixels.
// Alignment: 16 byte aligned src_argb and dst_argb are preferred; the movdqu
// load and stores below also handle unaligned pointers.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                           const uint8* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"
    "punpckhdq %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"

  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}

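// A scalar sketch of the 2x upsample above: each source ARGB pixel is
// written twice; x and dx are unused, as in the SSE2 version. Assumes an
// even dst_width (the SIMD loop handles multiples of 8). The name is
// illustrative only.
static void ScaleARGBColsUp2Ref_C(uint8* dst_argb,
                                  const uint8* src_argb,
                                  int dst_width,
                                  int x,
                                  int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  (void)x;
  (void)dx;
  for (j = 0; j < dst_width; j += 2) {
    dst[j] = dst[j + 1] = src[j / 2];  // duplicate each source pixel
  }
}
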
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear column filtering: each destination pixel blends two adjacent
// source pixels using a 7 bit fraction from the 16.16 x accumulator,
// 2 destination pixels per loop. SSSE3 version.
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                               const uint8* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movdqa    %0,%%xmm4                       \n"
      "movdqa    %1,%%xmm5                       \n"
      :
      : "m"(kShuffleColARGB),   // %0
        "m"(kShuffleFractions)  // %1
      );

  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "sub       $0x2,%2                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
    "pshufb    %%xmm5,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x2,%2                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "add       $0x1,%2                         \n"
    "jl        99f                             \n"
    "psrlw     $0x9,%%xmm2                     \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(0) "         \n"

    LABELALIGN
  "99:                                         \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "=&r"(x0),         // %3
    "=&r"(x1)          // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

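// A scalar sketch of the blend above: the fraction is the top 7 bits of the
// 16.16 coordinate (psrlw $0x9), and each channel computes
// (a * (127 - f) + b * f) >> 7, mirroring the pxor/pmaddubsw/psrlw sequence.
// Names are illustrative; libyuv's portable path is ScaleARGBFilterCols_C in
// scale_common.cc.
static uint32 BlendARGBRef(uint32 a, uint32 b, int f) {
  uint32 r = 0;
  int s;
  for (s = 0; s < 32; s += 8) {  // blend the B, G, R and A channels
    uint32 ca = (a >> s) & 0xff;
    uint32 cb = (b >> s) & 0xff;
    r |= (((ca * (uint32)(127 - f) + cb * (uint32)f) >> 7) & 0xff) << s;
  }
  return r;
}

static void ScaleARGBFilterColsRef_C(uint8* dst_argb,
                                     const uint8* src_argb,
                                     int dst_width,
                                     int x,
                                     int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source index
    int xf = (x >> 9) & 0x7f;  // 7 bit fraction, as in psrlw $0x9 above
    dst[j] = BlendARGBRef(src[xi], src[xi + 1], xf);
    x += dx;
  }
}
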
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld      $0x10,%%eax,%%edx               \n"
      "shl       $0x10,%%eax                     \n"
      "idiv      %1                              \n"
      "mov       %0, %%eax                       \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}

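// What the assembly above computes, in portable C: widen num to 64 bits,
// shift into 16.16 fixed point, then divide. libyuv's C fallback, FixedDiv_C
// in scale_common.cc, takes this form; the name here is illustrative.
static int FixedDivRef_C(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}
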
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld      $0x10,%%eax,%%edx               \n"
      "shl       $0x10,%%eax                     \n"
      "sub       $0x10001,%%eax                  \n"
      "sbb       $0x0,%%edx                      \n"
      "sub       $0x1,%1                         \n"
      "idiv      %1                              \n"
      "mov       %0, %%eax                       \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}

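// Portable C equivalent of the assembly above: the sub/sbb pair performs a
// 64 bit subtract of 0x10001 from num << 16, i.e. ((num - 1) << 16) - 1,
// before dividing by div - 1. libyuv's C fallback FixedDiv1_C matches this;
// the name here is illustrative.
static int FixedDiv1Ref_C(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}
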
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif