/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
    defined(_MSC_VER) && !defined(__clang__)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters (added before the >> 2).
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

     62 
     63 static uvec8 kShuf38a =
     64   { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
     65 
     66 static uvec8 kShuf38b =
     67   { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
     68 
     69 // Arrange words 0,3,6 into 0,1,2
     70 static uvec8 kShufAc =
     71   { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
     72 
     73 // Arrange words 0,3,6 into 3,4,5
     74 static uvec8 kShufAc3 =
     75   { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
     76 
     77 // Scaling values for boxes of 3x3 and 2x3
     78 static uvec16 kScaleAc33 =
     79   { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
     80 
     81 // Arrange first value for pixels 0,1,2,3,4,5
     82 static uvec8 kShufAb0 =
     83   { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
     84 
     85 // Arrange second value for pixels 0,1,2,3,4,5
     86 static uvec8 kShufAb1 =
     87   { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
     88 
     89 // Arrange third value for pixels 0,1,2,3,4,5
     90 static uvec8 kShufAb2 =
     91   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
     92 
     93 // Scaling values for boxes of 3x2 and 2x2
     94 static uvec16 kScaleAb2 =
     95   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
     96 
     97 // Reads 32 pixels, throws half away and writes 16 pixels.
     98 __declspec(naked)
     99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    100                         uint8* dst_ptr, int dst_width) {
    101   __asm {
    102     mov        eax, [esp + 4]        // src_ptr
    103                                      // src_stride ignored
    104     mov        edx, [esp + 12]       // dst_ptr
    105     mov        ecx, [esp + 16]       // dst_width
    106 
    107   wloop:
    108     movdqu     xmm0, [eax]
    109     movdqu     xmm1, [eax + 16]
    110     lea        eax,  [eax + 32]
    111     psrlw      xmm0, 8               // isolate odd pixels.
    112     psrlw      xmm1, 8
    113     packuswb   xmm0, xmm1
    114     movdqu     [edx], xmm0
    115     lea        edx, [edx + 16]
    116     sub        ecx, 16
    117     jg         wloop
    118 
    119     ret
    120   }
    121 }
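
// A scalar sketch of the loop above (illustrative only, not compiled here;
// libyuv's portable equivalents live in scale_common.cc):
//
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 2 + 1];  // psrlw 8 keeps the odd pixel
//   }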

// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
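
// What the masked/shifted column averaging computes, in scalar form (a
// sketch; pavgw rounds, so each pair is averaged as (a + b + 1) >> 1):
//
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1;
//   }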

// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}
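
// The 2x2 box filter averages rows first, then columns, rounding at each
// pavg step. A scalar sketch (note the nested rounded averages can differ
// by 1 from the exact (a + b + c + d + 2) >> 2):
//
//   const uint8* s = src_ptr;
//   const uint8* t = src_ptr + src_stride;
//   for (int x = 0; x < dst_width; ++x) {
//     int e = (s[x * 2] + t[x * 2] + 1) >> 1;          // pavgb: rows, even
//     int o = (s[x * 2 + 1] + t[x * 2 + 1] + 1) >> 1;  // pavgb: rows, odd
//     dst_ptr[x] = (e + o + 1) >> 1;                   // pavgw: columns
//   }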

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]

    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]    // src_ptr
    mov         esi, [esp + 4 + 8]    // src_stride
    mov         edx, [esp + 4 + 12]   // dst_ptr
    mov         ecx, [esp + 4 + 16]   // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]           // average rows
    vmovdqu     ymm1, [eax + 32]
    vpavgb      ymm0, ymm0, [eax + esi]
    vpavgb      ymm1, ymm1, [eax + esi + 32]
    lea         eax,  [eax + 64]

    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}
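
// The dword mask 0x00ff0000 keeps byte 2 of every 4 source pixels, so the
// two pack steps reduce to this scalar sketch:
//
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 4 + 2];
//   }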

// Blends 32x4 rectangle to 8x1.
__declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqu     xmm0, [eax]           // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    movdqu     xmm4, [eax + edi]
    movdqu     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}
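
// A sketch of the 4x4 box reduction above: the four rows collapse through a
// tree of rounded pavgb averages, then the columns are halved twice the same
// way, approximating the true 16-pixel mean to within the pavg rounding:
//
//   int avg2(int a, int b) { return (a + b + 1) >> 1; }  // pavg
//   // vertical: v[c] = avg2(avg2(row0[c], row1[c]), avg2(row2[c], row3[c]))
//   // horizontal: dst = avg2(avg2(v[0], v[1]), avg2(v[2], v[3]))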

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]    // src_ptr
    mov         esi, [esp + 8 + 8]    // src_stride
    mov         edx, [esp + 8 + 12]   // dst_ptr
    mov         ecx, [esp + 8 + 16]   // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
    vpsrlw      ymm7, ymm7, 8

  wloop:
    vmovdqu     ymm0, [eax]           // average rows
    vmovdqu     ymm1, [eax + 32]
    vpavgb      ymm0, ymm0, [eax + esi]
    vpavgb      ymm1, ymm1, [eax + esi + 32]
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpavgb      ymm2, ymm2, [eax + edi]
    vpavgb      ymm3, ymm3, [eax + edi + 32]
    lea         eax, [eax + 64]
    vpavgb      ymm0, ymm0, ymm2
    vpavgb      ymm1, ymm1, ymm3

    vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
    vpand       ymm3, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 8
    vpsrlw      ymm1, ymm1, 8
    vpavgw      ymm0, ymm0, ymm2
    vpavgw      ymm1, ymm1, ymm3
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
    vpsrlw      ymm0, ymm0, 8
    vpavgw      ymm0, ymm0, ymm2
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Each iteration reads 32 source bytes into three overlapping 16-byte
// windows, then shuffles each window to produce an 8-byte group of output.

__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}
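
// The three shuffles implement 3/4 point sampling: of every 4 source pixels,
// pixels 0, 1 and 3 are kept. Scalar sketch:
//
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[x + 0] = src_ptr[0];
//     dst_ptr[x + 1] = src_ptr[1];
//     dst_ptr[x + 2] = src_ptr[3];
//     src_ptr += 4;
//   }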

// Blends 32x2 rectangle to 24x1.
// Produces three 8-byte output groups per iteration; each group is built
// from a 16-byte window of the two averaged source rows, then shuffled to
// do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
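
// After the two rows are averaged, each group of 4 pixels p0..p3 is filtered
// down to 3 outputs using the (3,1), (2,2), (1,3) taps in kMadd01/kMadd11/
// kMadd21 plus the kRound34 rounding term. Scalar sketch of one group:
//
//   dst[0] = (p0 * 3 + p1     + 2) >> 2;
//   dst[1] = (p1 * 2 + p2 * 2 + 2) >> 2;
//   dst[2] = (p2     + p3 * 3 + 2) >> 2;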

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
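
// The only difference from the _1_ variant above is the double pavgb, which
// weights the two source rows roughly 3:1 instead of 1:1 (the vertical phase
// needed on the other side of the 3/4 scale). Scalar sketch per byte:
//
//   int t = (row1[x] + row0[x] + 1) >> 1;  // pavgb xmm1, xmm0
//   int v = (row0[x] + t + 1) >> 1;        // pavgb xmm0, xmm1
//   // v ~= (3 * row0[x] + row1[x] + 2) >> 2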

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

  xloop:
    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}
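
// 3/8 point sampling keeps source pixels 0, 3 and 6 of every 8. Scalar
// sketch:
//
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[x + 0] = src_ptr[0];
//     dst_ptr[x + 1] = src_ptr[3];
//     dst_ptr[x + 2] = src_ptr[6];
//     src_ptr += 8;
//   }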

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}
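
// The kScaleAc33 multipliers replace division: pmulhuw computes
// (sum * (65536 / n)) >> 16, i.e. an n-way average with no divide. For a
// full 3x3 box (a sketch; the third output of each group covers only 2
// columns x 3 rows, hence the 65536 / 6 entries):
//
//   unsigned sum = r0[0] + r0[1] + r0[2] +   // 3 columns of row 0
//                  r1[0] + r1[1] + r1[2] +   // row 1
//                  r2[0] + r2[1] + r2[2];    // row 2
//   dst = (sum * (65536 / 9)) >> 16;         // ~= sum / 9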

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

  xloop:
    movdqu     xmm0, [eax]           // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]   // src_ptr
    mov        edx, [esp + 8]   // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

  // sum rows
  xloop:
    movdqu     xmm3, [eax]       // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]       // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0       // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}
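
// Scalar equivalent of the accumulate loop (a sketch; paddusw saturates, so
// the 16-bit sums clamp at 65535 rather than wrapping as plain += would):
//
//   for (int x = 0; x < src_width; ++x) {
//     dst_ptr[x] += src_ptr[x];  // uint16 accumulator += uint8 source
//   }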

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov         eax, [esp + 4]   // src_ptr
    mov         edx, [esp + 8]   // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

  // sum rows
  xloop:
    vmovdqu     ymm3, [eax]       // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0       // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9              // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5           // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9              // 7 bit fractions.
    pshufb     xmm2, xmm5           // 0011
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // 16 bit
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
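
// Per output pixel the 16.16 fixed-point x splits into an integer index and
// a fraction reduced to 7 bits; the two neighbours are blended with weights
// (127 - f, f) and the 8.7 result shifted down. A sketch of one iteration:
//
//   int xi = x >> 16;          // pextrw of word 1
//   int f  = (x >> 9) & 0x7f;  // psrlw 9, then the 0x7f invert mask
//   dst_ptr[i] = (src_ptr[xi] * (127 - f) + src_ptr[xi + 1] * f) >> 7;
//   x += dx;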

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // odd pixels
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}
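
// With 4-byte ARGB pixels, shufps 0xdd selects dwords 1 and 3 of each
// source register. Treating src/dst as uint32 pixels, a scalar sketch:
//
//   for (int x = 0; x < dst_width; ++x) {
//     dst[x] = src[x * 2 + 1];  // keep the odd pixel of each pair
//   }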

// Blends 8x1 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixel
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixel
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                 // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 pixel remainder.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel remainder.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixel
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
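
// Nearest-neighbour column fetch in scalar form (a sketch, with src/dst as
// uint32 ARGB pixels and x in 16.16 fixed point):
//
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src[x >> 16];  // integer part of x
//     x += dx;
//   }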

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

 xloop99:

    pop        edi
    pop        esi
    ret
  }
}
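
// As in ScaleFilterCols_SSSE3, the fraction is cut to 7 bits and the two
// source pixels are blended channel by channel. A sketch for one channel c
// of one output pixel:
//
//   int xi = x >> 16;
//   int f  = (x >> 9) & 0x7f;
//   dst[c] = (src[xi * 4 + c] * (127 - f) +
//             src[xi * 4 + 4 + c] * f) >> 7;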

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}
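
// C equivalent of the shift-then-divide above (a sketch):
//
//   int FixedDiv_C(int num, int div) {
//     return (int)(((int64)(num) << 16) / div);
//   }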

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // div
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
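
// C equivalent (a sketch): subtracting 0x00010001 before dividing by div - 1
// makes this compute (num - 1) / (div - 1) in 16.16 fixed point.
//
//   int FixedDiv1_C(int num, int div) {
//     return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
//   }
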
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif