/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32-bit Visual C x86 and clang-cl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters: round to nearest after >> 2.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                         6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                         6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0,         0};

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8  // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
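
// As a reading aid, a C sketch of the loop above (the compiled C fallbacks
// live in scale_common.cc; this block is illustrative only and not built):
#if 0
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // psrlw 8 keeps the odd pixel of a pair
  }
}
#endif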

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
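
// In C terms, assuming pavgw's round-to-nearest ((a + b + 1) >> 1), the
// loop above is (illustrative only, not compiled):
#if 0
static void ScaleRowDown2Linear_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                         int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1;
  }
}
#endif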

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}
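
// The shift-then-pavgw sequence rounds the 2x2 sum to nearest, i.e.
// (a + b + c + d + 2) >> 2. A C sketch (illustrative only, not compiled):
#if 0
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride, uint8* dst_ptr,
                                      int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (s[x * 2] + s[x * 2 + 1] + t[x * 2] + t[x * 2 + 1] + 2) >> 2;
  }
}
#endif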

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8* dst_ptr,
                                                int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]  // src_ptr
    mov         esi, [esp + 4 + 8]  // src_stride
    mov         edx, [esp + 4 + 12]  // dst_ptr
    mov         ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add
    vpaddw      ymm1, ymm1, ymm3
    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw      ymm1, ymm1, 1
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}
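
// The 0x00ff0000 mask plus the two packs select byte 2 of every dword, so
// in C the loop is simply (illustrative only, not compiled):
#if 0
static void ScaleRowDown4_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];  // keep pixel 2 of each group of 4
  }
}
#endif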

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_ptr
    mov        esi, [esp + 8 + 8]  // src_stride
    mov        edx, [esp + 8 + 12]  // dst_ptr
    mov        ecx, [esp + 8 + 16]  // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3  // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]  // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // + 8 for round
    psrlw      xmm0, 4  // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}
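
// Net effect in C: a 4x4 box average with round-to-nearest (illustrative
// only, not compiled):
#if 0
static void ScaleRowDown4Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride, uint8* dst_ptr,
                                      int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 8;  // + 8 so the >> 4 below rounds to nearest
    for (j = 0; j < 4; ++j) {
      for (i = 0; i < 4; ++i) {
        sum += src_ptr[j * src_stride + x * 4 + i];
      }
    }
    dst_ptr[x] = (uint8)(sum >> 4);
  }
}
#endif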

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]  // src_ptr
    mov         esi, [esp + 8 + 8]  // src_stride
    mov         edx, [esp + 8 + 12]  // dst_ptr
    mov         ecx, [esp + 8 + 16]  // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw      ymm4, ymm4, 15
    vpsllw      ymm5, ymm4, 3  // constant 0x0008
    vpackuswb   ymm4, ymm4, ymm4

  wloop:
    vmovdqu     ymm0, [eax]  // average rows
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 2
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + edi]
    vmovdqu     ymm3, [eax + edi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 3
    vpaddw      ymm1, ymm1, ymm3
    vphaddw     ymm0, ymm0, ymm1  // mutates
    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw      ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8-byte values; each one is shuffled out of a 16-byte
// register of source pixels.

__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}
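
// The three shuffles implement this per-group index mapping: keep pixels
// 0, 1 and 3 of every 4 (illustrative only, not compiled):
#if 0
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}
#endif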

// Blends 32x2 rectangle to 24x1.
// Produces three 8-byte values; each one is shuffled and weighted out of
// 16-byte registers of source pixels.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
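
// Both 3/4 box kernels apply the horizontal weights (3,1), (2,2), (1,3)
// with +2 rounding and >> 2; they differ only in how the two source rows
// are pre-blended (1:1 in _1_Box via one pavgb; roughly 3:1 toward row 0
// in _0_Box above via two). A C sketch of the horizontal pass over a
// pre-blended row (illustrative only, not compiled):
#if 0
static void ScaleRowDown34_Filter_C_Sketch(const uint8* s, uint8* d,
                                           int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    d[x + 0] = (3 * s[0] + 1 * s[1] + 2) >> 2;
    d[x + 1] = (2 * s[1] + 2 * s[2] + 2) >> 2;
    d[x + 2] = (1 * s[2] + 3 * s[3] + 2) >> 2;
    s += 4;
  }
}
#endif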

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}
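
// kShuf38a/b pick source bytes 0,3,6,8,11,14 of every 16, i.e. 3 of every
// 8 pixels (illustrative only, not compiled):
#if 0
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}
#endif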

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6  // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}
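
// Each output is a 3x3 (or 3x2 at the group edge) box sum scaled by a
// 16-bit reciprocal via pmulhuw. A C sketch (illustrative only, not
// compiled):
#if 0
static void ScaleRowDown38_3_Box_C_Sketch(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  static const int kStart[3] = {0, 3, 6};  // source column of each output
  static const int kWidth[3] = {3, 3, 2};  // box widths: 3 + 3 + 2 = 8
  static const int kScale[3] = {65536 / 9, 65536 / 9, 65536 / 6};
  int x, i, r, c;
  for (x = 0; x < dst_width; x += 3) {
    for (i = 0; i < 3; ++i) {
      int sum = 0;
      for (r = 0; r < 3; ++r) {
        for (c = 0; c < kWidth[i]; ++c) {
          sum += src_ptr[r * src_stride + kStart[i] + c];
        }
      }
      dst_ptr[x + i] = (uint8)((sum * kScale[i]) >> 16);  // pmulhuw
    }
    src_ptr += 8;
  }
}
#endif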

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]  // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1  // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    mov        edx, [esp + 8]  // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

    // sum rows
  xloop:
    movdqu     xmm3, [eax]  // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]  // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2  // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0  // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}
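
// In C the accumulation is just dst[i] += src[i], except that paddusw
// saturates at 65535 rather than wrapping (illustrative only, not
// compiled):
#if 0
static void ScaleAddRow_C_Sketch(const uint8* src_ptr, uint16* dst_ptr,
                                 int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    unsigned int sum = dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16)(sum > 65535 ? 65535 : sum);  // paddusw saturation
  }
}
#endif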

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    mov         edx, [esp + 8]  // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu     ymm3, [eax]  // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0  // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                                             const uint8* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]  // dst_ptr
    mov        esi, [esp + 12 + 8]  // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7  // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9  // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5  // 0011
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9  // 7 bit fractions.
    pshufb     xmm2, xmm5  // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0  // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2  // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

  xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
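
// Per pixel, the fixed-point math above reduces to a 7-bit blend; the 0x80
// bias (kFsub80) keeps pmaddubsw from saturating and the 0x4040 bias
// (kFadd40) undoes it and supplies the +64 rounding term. A C sketch
// (illustrative only, not compiled):
#if 0
static void ScaleFilterCols_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer part selects the source pair
    int f = (x >> 9) & 0x7f;  // top 7 bits of the fraction
    dst_ptr[j] =
        (uint8)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
#endif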

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
                                         const uint8* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_ptr
    mov        eax, [esp + 8]  // src_ptr
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8* dst_argb,
                                              int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov        ebx, [esp + 8 + 12]  // src_stepx
    mov        edx, [esp + 8 + 16]  // dst_argb
    mov        ecx, [esp + 8 + 20]  // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        esi, [esp + 12 + 8]  // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]  // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
                                          const uint8* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0  // x3 x2 x1 x0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1  // get x0 integer.
    pextrw     edx, xmm2, 3  // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
  xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw     eax, xmm2, 5  // get x2 integer.
    pextrw     edx, xmm2, 7  // get x3 integer.
    paddd      xmm2, xmm3  // x += dx
    punpckldq  xmm0, xmm1  // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixel
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixel
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4  // 4 pixels
    jge        xloop4

  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 pixel remainder
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw     eax, xmm2, 5  // get x2 integer.
    punpckldq  xmm0, xmm1  // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel remainder
    movd       xmm0, [esi + eax * 4]  // 1 source pixel
    movd       dword ptr [edi], xmm0

  xloop99:

    pop        esi
    pop        edi
    ret
  }
}
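
// Unfiltered column scaling in C terms: step x through the source in 16.16
// fixed point and point-sample (illustrative only, not compiled):
#if 0
static void ScaleARGBCols_C_Sketch(uint32* dst_argb, const uint32* src_argb,
                                   int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst_argb[j] = src_argb[x >> 16];  // integer part selects the pixel
    x += dx;
  }
}
#endif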

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                                                 const uint8* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, xmmword ptr kShuffleColARGB
    movdqa     xmm5, xmmword ptr kShuffleFractions
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9  // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5  // 0000000011111111
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9  // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5  // 00000000
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

  xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                                             const uint8* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_argb
    mov        eax, [esp + 8]  // src_argb
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    mov        ecx, [esp + 8]  // div
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
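
// C equivalents using a 64-bit intermediate in place of the edx:eax pair
// (a sketch, illustrative only, not compiled; the C fallbacks in
// scale_common.cc take the same form):
#if 0
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}

// FixedDiv1 subtracts 0x00010001 (1 + 1/65536 in 16.16) and divides by
// div - 1, so an N-pixel range maps onto N - 1 source intervals.
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}
#endif
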
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif