/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant, added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

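// pmulhuw returns the high 16 bits of a 16x16 multiply, i.e. (a * b) >> 16,
// so multiplying a box sum by 65536 / 9 approximates division by 9.
// Worked example (illustrative): nine pixels of value 10 sum to 90, and
// (90 * 7281) >> 16 = 9, within 1 lsb of the true average.
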
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

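// Scalar sketch of the loop above (illustrative; the portable version in
// libyuv is ScaleRowDown2_C):
//   for (x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 2 + 1];  // psrlw 8 keeps the odd bytes.
//   }
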
// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    pcmpeqb    xmm4, xmm4            // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5            // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4      // horizontal add
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5      // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

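// Per output pixel the pmaddubsw/pavgw pair above computes (illustrative):
//   sum = src_ptr[2 * x] + src_ptr[2 * x + 1];  // pmaddubsw, weights 1,1
//   dst_ptr[x] = (sum + 1) >> 1;                // pavgw against 0 rounds
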
// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width

    pcmpeqb    xmm4, xmm4            // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5            // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4      // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5      // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]    // src_ptr
    mov         esi, [esp + 4 + 8]    // src_stride
    mov         edx, [esp + 4 + 12]   // dst_ptr
    mov         ecx, [esp + 4 + 16]   // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // vertical add
    vpaddw      ymm1, ymm1, ymm3
    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw      ymm1, ymm1, 1
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

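// The 2x2 box filters rely on the identity
//   (sum + 2) / 4 == ((sum >> 1) + 1) >> 1
// which holds for all non-negative integers. Worked example: sum = 7 gives
// (7 + 2) >> 2 = 2 and ((7 >> 1) + 1) >> 1 = (3 + 1) >> 1 = 2.
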
// Point samples 32 pixels to 8 pixels.
__declspec(naked)
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked)
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm4, xmm4            // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3               // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]           // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4      // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2      // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5      // + 8 for round
    psrlw      xmm0, 4         // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

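// Each output byte above is the rounded mean of a 4x4 block: the sixteen
// byte sum is at most 16 * 255 = 4080, so it fits in 16 bits, and
// (sum + 8) >> 4 rounds to nearest, e.g. (4080 + 8) >> 4 = 255.
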
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]    // src_ptr
    mov         esi, [esp + 8 + 8]    // src_stride
    mov         edx, [esp + 8 + 12]   // dst_ptr
    mov         ecx, [esp + 8 + 16]   // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm4, ymm4, ymm4      // constant 0x0101
    vpsrlw      ymm4, ymm4, 15
    vpsllw      ymm5, ymm4, 3         // constant 0x0008
    vpackuswb   ymm4, ymm4, ymm4

  wloop:
    vmovdqu     ymm0, [eax]           // average rows
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // add row 2
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + edi]
    vmovdqu     ymm3, [eax + edi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2      // add row 3
    vpaddw      ymm1, ymm1, ymm3
    vphaddw     ymm0, ymm0, ymm1      // mutates
    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
    vpsrlw      ymm0, ymm0, 4         // /16 for average of 4 * 4
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop         edi
    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

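// Scalar equivalent of one full-width output pixel above (illustrative):
//   sum = 0;
//   for (dy = 0; dy < 3; ++dy)
//     for (dx = 0; dx < 3; ++dx)
//       sum += src_ptr[dy * src_stride + dx];
//   dst = (sum * (65536 / 9)) >> 16;  // pmulhuw by kScaleAc33
// pmulhuw truncates, so e.g. nine pixels of 10 (sum 90) give
// (90 * 7281) >> 16 = 9, within 1 of the true average.
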
// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]           // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]   // src_ptr
    mov        edx, [esp + 8]   // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

  // sum rows
  xloop:
    movdqu     xmm3, [eax]       // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]       // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0       // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}

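// Scalar sketch of the accumulation (illustrative; paddusw saturates where
// the plain C addition would wrap):
//   for (x = 0; x < src_width; ++x) {
//     dst_ptr[x] += src_ptr[x];  // widen uint8 into a uint16 running sum
//   }
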
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov         eax, [esp + 4]   // src_ptr
    mov         edx, [esp + 8]   // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

  // sum rows
  xloop:
    vmovdqu     ymm3, [eax]       // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0       // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

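// Why the bias works (illustrative): with weights (128 - f) and f, the
// signed multiply-add computes
//   (p0 - 128) * (128 - f) + (p1 - 128) * f
//     = p0 * (128 - f) + p1 * f - 128 * 128
// so adding kFadd40 (0x4040 = 128 * 128 + 64) removes the bias and adds
// the rounding half before the final >> 7.
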
// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7           // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9              // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5           // 0011
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    paddusb    xmm1, xmm7           // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0           // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1           // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9              // 7 bit fractions.
    pshufb     xmm2, xmm5           // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    paddusb    xmm2, xmm7           // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0           // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2           // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

  xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

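// Scalar sketch of one filtered pixel above (illustrative; x and dx are
// 16.16 fixed point):
//   xi = x >> 16;                // integer source position
//   f = (x >> 9) & 0x7f;         // top 7 bits of the fraction
//   dst = (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7;
//   x += dx;
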
// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // odd pixels
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

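// Per ARGB channel the 2x2 box above computes, within 1 lsb (illustrative):
//   dst = avg(avg(row0[2 * x], row1[2 * x]),
//             avg(row0[2 * x + 1], row1[2 * x + 1]))
// where avg(a, b) = (a + b + 1) >> 1; shufps 0x88 gathers the even pixels
// and 0xdd the odd ones after the row average.
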
// Reads 4 pixels at a time.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
  xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                 // 4 pixels
    jge        xloop4

  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 pixel remainder.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel remainder.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
  xloop99:

    pop        esi
    pop        edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

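// kShuffleColARGB interleaves the two source pixels channel by channel
// (b0 b1 g0 g1 r0 r1 a0 a1 per pixel pair), so one pmaddubsw against the
// duplicated, inverted fraction bytes blends every channel at once.
// Per channel (illustrative): out = (c0 * (0x7f ^ f) + c1 * f) >> 7.
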
__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, xmmword ptr kShuffleColARGB
    movdqa     xmm5, xmmword ptr kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

  xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

  xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
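
// Portable equivalents of the two divides (illustrative sketches):
//   FixedDiv_X86(num, div)  == (int)((((int64)num) << 16) / div)
//   FixedDiv1_X86(num, div) == (int)(((((int64)num) << 16) - 0x00010001) /
//                                    (div - 1))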
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif