Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "third_party/libyuv/include/libyuv/scale.h"
     12 
     13 #include <assert.h>
     14 #include <string.h>
     15 
     16 #include "third_party/libyuv/include/libyuv/cpu_id.h"
     17 #include "third_party/libyuv/source/row.h"
     18 
     19 #ifdef __cplusplus
     20 namespace libyuv {
     21 extern "C" {
     22 #endif
     23 
     24 /*
     25  * Note: Defining YUV_DISABLE_ASM allows to use c version.
     26  */
     27 //#define YUV_DISABLE_ASM
     28 
     29 #if defined(_MSC_VER)
     30 #define ALIGN16(var) __declspec(align(16)) var
     31 #else
     32 #define ALIGN16(var) var __attribute__((aligned(16)))
     33 #endif
     34 
     35 // Note: A Neon reference manual
     36 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
     37 // Note: Some SSE2 reference manuals
     38 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
     39 
     40 // Set the following flag to true to revert to only
     41 // using the reference implementation ScalePlaneBox(), and
     42 // NOT the optimized versions. Useful for debugging and
     43 // when comparing the quality of the resulting YUV planes
     44 // as produced by the optimized and non-optimized versions.
     45 
     46 static int use_reference_impl_ = 0;
     47 
     48 void SetUseReferenceImpl(int use) {
     49   use_reference_impl_ = use;
     50 }
     51 
     52 // ScaleRowDown2Int also used by planar functions
     53 
     54 /**
     55  * NEON downscalers with interpolation.
     56  *
     57  * Provided by Fritz Koenig
     58  *
     59  */
     60 
     61 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
     62 #define HAS_SCALEROWDOWN2_NEON
     63 void ScaleRowDown2_NEON(const uint8* src_ptr, int  src_stride,
     64                         uint8* dst, int dst_width) {
     65   asm volatile (
     66     "1:                                        \n"
     67     "vld2.u8    {q0,q1}, [%0]!                 \n"  // load even pixels into q0, odd into q1
     68     "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
     69     "subs       %2, %2, #16                    \n"  // 16 processed per loop
     70     "bhi        1b                             \n"
     71     : "+r"(src_ptr),          // %0
     72       "+r"(dst),              // %1
     73       "+r"(dst_width)         // %2
     74     :
     75     : "q0", "q1"              // Clobber List
     76   );
     77 }
     78 
     79 void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
     80                            uint8* dst, int dst_width) {
     81   asm volatile (
     82     "add        %1, %0                         \n"  // change the stride to row 2 pointer
     83     "1:                                        \n"
     84     "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post increment
     85     "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post increment
     86     "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
     87     "vpaddl.u8  q1, q1                         \n"
     88     "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent, add row 1 to row 2
     89     "vpadal.u8  q1, q3                         \n"
     90     "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
     91     "vrshrn.u16 d1, q1, #2                     \n"
     92     "vst1.u8    {q0}, [%2]!                    \n"
     93     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     94     "bhi        1b                             \n"
     95     : "+r"(src_ptr),          // %0
     96       "+r"(src_stride),       // %1
     97       "+r"(dst),              // %2
     98       "+r"(dst_width)         // %3
     99     :
    100     : "q0", "q1", "q2", "q3"     // Clobber List
    101    );
    102 }
    103 
    104 #define HAS_SCALEROWDOWN4_NEON
    105 static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
    106                                uint8* dst_ptr, int dst_width) {
    107   asm volatile (
    108     "1:                                        \n"
    109     "vld2.u8    {d0, d1}, [%0]!                \n"
    110     "vtrn.u8    d1, d0                         \n"
    111     "vshrn.u16  d0, q0, #8                     \n"
    112     "vst1.u32   {d0[1]}, [%1]!                 \n"
    113 
    114     "subs       %2, #4                         \n"
    115     "bhi        1b                             \n"
    116     : "+r"(src_ptr),          // %0
    117       "+r"(dst_ptr),          // %1
    118       "+r"(dst_width)         // %2
    119     :
    120     : "q0", "q1", "memory", "cc"
    121   );
    122 }
    123 
    124 static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
    125                                   uint8* dst_ptr, int dst_width) {
    126   asm volatile (
    127     "add        r4, %0, %3                     \n"
    128     "add        r5, r4, %3                     \n"
    129     "add        %3, r5, %3                     \n"
    130     "1:                                        \n"
    131     "vld1.u8    {q0}, [%0]!                    \n"   // load up 16x4 block of input data
    132     "vld1.u8    {q1}, [r4]!                    \n"
    133     "vld1.u8    {q2}, [r5]!                    \n"
    134     "vld1.u8    {q3}, [%3]!                    \n"
    135 
    136     "vpaddl.u8  q0, q0                         \n"
    137     "vpadal.u8  q0, q1                         \n"
    138     "vpadal.u8  q0, q2                         \n"
    139     "vpadal.u8  q0, q3                         \n"
    140 
    141     "vpaddl.u16 q0, q0                         \n"
    142 
    143     "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
    144 
    145     "vmovn.u16  d0, q0                         \n"
    146     "vst1.u32   {d0[0]}, [%1]!                 \n"
    147 
    148     "subs       %2, #4                         \n"
    149     "bhi        1b                             \n"
    150 
    151     : "+r"(src_ptr),          // %0
    152       "+r"(dst_ptr),          // %1
    153       "+r"(dst_width)         // %2
    154     : "r"(src_stride)         // %3
    155     : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
    156   );
    157 }
    158 
    159 #define HAS_SCALEROWDOWN34_NEON
    160 // Down scale from 4 to 3 pixels.  Use the neon multilane read/write
    161 //  to load up the every 4th pixel into a 4 different registers.
    162 // Point samples 32 pixels to 24 pixels.
    163 static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
    164                                 uint8* dst_ptr, int dst_width) {
    165   asm volatile (
    166     "1:                                        \n"
    167     "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    168     "vmov         d2, d3                       \n" // order needs to be d0, d1, d2
    169     "vst3.u8      {d0, d1, d2}, [%1]!          \n"
    170     "subs         %2, #24                      \n"
    171     "bhi          1b                           \n"
    172     : "+r"(src_ptr),          // %0
    173       "+r"(dst_ptr),          // %1
    174       "+r"(dst_width)         // %2
    175     :
    176     : "d0", "d1", "d2", "d3", "memory", "cc"
    177   );
    178 }
    179 
    180 static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
    181                                       uint8* dst_ptr, int dst_width) {
    182   asm volatile (
    183     "vmov.u8      d24, #3                      \n"
    184     "add          %3, %0                       \n"
    185     "1:                                        \n"
    186     "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    187     "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    188 
    189     // filter src line 0 with src line 1
    190     // expand chars to shorts to allow for room
    191     // when adding lines together
    192     "vmovl.u8     q8, d4                       \n"
    193     "vmovl.u8     q9, d5                       \n"
    194     "vmovl.u8     q10, d6                      \n"
    195     "vmovl.u8     q11, d7                      \n"
    196 
    197     // 3 * line_0 + line_1
    198     "vmlal.u8     q8, d0, d24                  \n"
    199     "vmlal.u8     q9, d1, d24                  \n"
    200     "vmlal.u8     q10, d2, d24                 \n"
    201     "vmlal.u8     q11, d3, d24                 \n"
    202 
    203     // (3 * line_0 + line_1) >> 2
    204     "vqrshrn.u16  d0, q8, #2                   \n"
    205     "vqrshrn.u16  d1, q9, #2                   \n"
    206     "vqrshrn.u16  d2, q10, #2                  \n"
    207     "vqrshrn.u16  d3, q11, #2                  \n"
    208 
    209     // a0 = (src[0] * 3 + s[1] * 1) >> 2
    210     "vmovl.u8     q8, d1                       \n"
    211     "vmlal.u8     q8, d0, d24                  \n"
    212     "vqrshrn.u16  d0, q8, #2                   \n"
    213 
    214     // a1 = (src[1] * 1 + s[2] * 1) >> 1
    215     "vrhadd.u8    d1, d1, d2                   \n"
    216 
    217     // a2 = (src[2] * 1 + s[3] * 3) >> 2
    218     "vmovl.u8     q8, d2                       \n"
    219     "vmlal.u8     q8, d3, d24                  \n"
    220     "vqrshrn.u16  d2, q8, #2                   \n"
    221 
    222     "vst3.u8      {d0, d1, d2}, [%1]!          \n"
    223 
    224     "subs         %2, #24                      \n"
    225     "bhi          1b                           \n"
    226     : "+r"(src_ptr),          // %0
    227       "+r"(dst_ptr),          // %1
    228       "+r"(dst_width),        // %2
    229       "+r"(src_stride)        // %3
    230     :
    231     : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
    232   );
    233 }
    234 
    235 static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
    236                                       uint8* dst_ptr, int dst_width) {
    237   asm volatile (
    238     "vmov.u8      d24, #3                      \n"
    239     "add          %3, %0                       \n"
    240     "1:                                        \n"
    241     "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    242     "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    243 
    244     // average src line 0 with src line 1
    245     "vrhadd.u8    q0, q0, q2                   \n"
    246     "vrhadd.u8    q1, q1, q3                   \n"
    247 
    248     // a0 = (src[0] * 3 + s[1] * 1) >> 2
    249     "vmovl.u8     q3, d1                       \n"
    250     "vmlal.u8     q3, d0, d24                  \n"
    251     "vqrshrn.u16  d0, q3, #2                   \n"
    252 
    253     // a1 = (src[1] * 1 + s[2] * 1) >> 1
    254     "vrhadd.u8    d1, d1, d2                   \n"
    255 
    256     // a2 = (src[2] * 1 + s[3] * 3) >> 2
    257     "vmovl.u8     q3, d2                       \n"
    258     "vmlal.u8     q3, d3, d24                  \n"
    259     "vqrshrn.u16  d2, q3, #2                   \n"
    260 
    261     "vst3.u8      {d0, d1, d2}, [%1]!          \n"
    262 
    263     "subs         %2, #24                      \n"
    264     "bhi          1b                           \n"
    265     : "+r"(src_ptr),          // %0
    266       "+r"(dst_ptr),          // %1
    267       "+r"(dst_width),        // %2
    268       "+r"(src_stride)        // %3
    269     :
    270     : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
    271   );
    272 }
    273 
    274 #define HAS_SCALEROWDOWN38_NEON
    275 const uint8 shuf38[16] __attribute__ ((aligned(16))) =
    276   { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
    277 const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
    278   { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
    279 const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
    280   { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    281     65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
    282 const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
    283   { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    284     65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
    285 
    286 // 32 -> 12
    287 static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
    288                                 uint8* dst_ptr, int dst_width) {
    289   asm volatile (
    290     "vld1.u8      {q3}, [%3]                   \n"
    291     "1:                                        \n"
    292     "vld1.u8      {d0, d1, d2, d3}, [%0]!      \n"
    293     "vtbl.u8      d4, {d0, d1, d2, d3}, d6     \n"
    294     "vtbl.u8      d5, {d0, d1, d2, d3}, d7     \n"
    295     "vst1.u8      {d4}, [%1]!                  \n"
    296     "vst1.u32     {d5[0]}, [%1]!               \n"
    297     "subs         %2, #12                      \n"
    298     "bhi          1b                           \n"
    299     : "+r"(src_ptr),          // %0
    300       "+r"(dst_ptr),          // %1
    301       "+r"(dst_width)         // %2
    302     : "r"(shuf38)             // %3
    303     : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
    304   );
    305 }
    306 
    307 // 32x3 -> 12x1
    308 static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
    309                                       uint8* dst_ptr, int dst_width) {
    310   asm volatile (
    311     "vld1.u16     {q13}, [%4]                  \n"
    312     "vld1.u8      {q14}, [%5]                  \n"
    313     "vld1.u8      {q15}, [%6]                  \n"
    314     "add          r4, %0, %3, lsl #1           \n"
    315     "add          %3, %0                       \n"
    316     "1:                                        \n"
    317 
    318     // d0 = 00 40 01 41 02 42 03 43
    319     // d1 = 10 50 11 51 12 52 13 53
    320     // d2 = 20 60 21 61 22 62 23 63
    321     // d3 = 30 70 31 71 32 72 33 73
    322     "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
    323     "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
    324     "vld4.u8      {d16, d17, d18, d19}, [r4]!  \n"
    325 
    326     // Shuffle the input data around to get align the data
    327     //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
    328     // d0 = 00 10 01 11 02 12 03 13
    329     // d1 = 40 50 41 51 42 52 43 53
    330     "vtrn.u8      d0, d1                       \n"
    331     "vtrn.u8      d4, d5                       \n"
    332     "vtrn.u8      d16, d17                     \n"
    333 
    334     // d2 = 20 30 21 31 22 32 23 33
    335     // d3 = 60 70 61 71 62 72 63 73
    336     "vtrn.u8      d2, d3                       \n"
    337     "vtrn.u8      d6, d7                       \n"
    338     "vtrn.u8      d18, d19                     \n"
    339 
    340     // d0 = 00+10 01+11 02+12 03+13
    341     // d2 = 40+50 41+51 42+52 43+53
    342     "vpaddl.u8    q0, q0                       \n"
    343     "vpaddl.u8    q2, q2                       \n"
    344     "vpaddl.u8    q8, q8                       \n"
    345 
    346     // d3 = 60+70 61+71 62+72 63+73
    347     "vpaddl.u8    d3, d3                       \n"
    348     "vpaddl.u8    d7, d7                       \n"
    349     "vpaddl.u8    d19, d19                     \n"
    350 
    351     // combine source lines
    352     "vadd.u16     q0, q2                       \n"
    353     "vadd.u16     q0, q8                       \n"
    354     "vadd.u16     d4, d3, d7                   \n"
    355     "vadd.u16     d4, d19                      \n"
    356 
    357     // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    358     //             + s[6 + st * 1] + s[7 + st * 1]
    359     //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    360     "vqrdmulh.s16 q2, q13                      \n"
    361     "vmovn.u16    d4, q2                       \n"
    362 
    363     // Shuffle 2,3 reg around so that 2 can be added to the
    364     //  0,1 reg and 3 can be added to the 4,5 reg.  This
    365     //  requires expanding from u8 to u16 as the 0,1 and 4,5
    366     //  registers are already expanded.  Then do transposes
    367     //  to get aligned.
    368     // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    369     "vmovl.u8     q1, d2                       \n"
    370     "vmovl.u8     q3, d6                       \n"
    371     "vmovl.u8     q9, d18                      \n"
    372 
    373     // combine source lines
    374     "vadd.u16     q1, q3                       \n"
    375     "vadd.u16     q1, q9                       \n"
    376 
    377     // d4 = xx 20 xx 30 xx 22 xx 32
    378     // d5 = xx 21 xx 31 xx 23 xx 33
    379     "vtrn.u32     d2, d3                       \n"
    380 
    381     // d4 = xx 20 xx 21 xx 22 xx 23
    382     // d5 = xx 30 xx 31 xx 32 xx 33
    383     "vtrn.u16     d2, d3                       \n"
    384 
    385     // 0+1+2, 3+4+5
    386     "vadd.u16     q0, q1                       \n"
    387 
    388     // Need to divide, but can't downshift as the the value
    389     //  isn't a power of 2.  So multiply by 65536 / n
    390     //  and take the upper 16 bits.
    391     "vqrdmulh.s16 q0, q15                      \n"
    392 
    393     // Align for table lookup, vtbl requires registers to
    394     //  be adjacent
    395     "vmov.u8      d2, d4                       \n"
    396 
    397     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    398     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
    399 
    400     "vst1.u8      {d3}, [%1]!                  \n"
    401     "vst1.u32     {d4[0]}, [%1]!               \n"
    402     "subs         %2, #12                      \n"
    403     "bhi          1b                           \n"
    404     : "+r"(src_ptr),          // %0
    405       "+r"(dst_ptr),          // %1
    406       "+r"(dst_width),        // %2
    407       "+r"(src_stride)        // %3
    408     : "r"(mult38_div6),       // %4
    409       "r"(shuf38_2),          // %5
    410       "r"(mult38_div9)        // %6
    411     : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
    412       "q13", "q14", "q15", "memory", "cc"
    413   );
    414 }
    415 
    416 // 32x2 -> 12x1
    417 static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
    418                                       uint8* dst_ptr, int dst_width) {
    419   asm volatile (
    420     "vld1.u16     {q13}, [%4]                  \n"
    421     "vld1.u8      {q14}, [%5]                  \n"
    422     "add          %3, %0                       \n"
    423     "1:                                        \n"
    424 
    425     // d0 = 00 40 01 41 02 42 03 43
    426     // d1 = 10 50 11 51 12 52 13 53
    427     // d2 = 20 60 21 61 22 62 23 63
    428     // d3 = 30 70 31 71 32 72 33 73
    429     "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
    430     "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
    431 
    432     // Shuffle the input data around to get align the data
    433     //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
    434     // d0 = 00 10 01 11 02 12 03 13
    435     // d1 = 40 50 41 51 42 52 43 53
    436     "vtrn.u8      d0, d1                       \n"
    437     "vtrn.u8      d4, d5                       \n"
    438 
    439     // d2 = 20 30 21 31 22 32 23 33
    440     // d3 = 60 70 61 71 62 72 63 73
    441     "vtrn.u8      d2, d3                       \n"
    442     "vtrn.u8      d6, d7                       \n"
    443 
    444     // d0 = 00+10 01+11 02+12 03+13
    445     // d2 = 40+50 41+51 42+52 43+53
    446     "vpaddl.u8    q0, q0                       \n"
    447     "vpaddl.u8    q2, q2                       \n"
    448 
    449     // d3 = 60+70 61+71 62+72 63+73
    450     "vpaddl.u8    d3, d3                       \n"
    451     "vpaddl.u8    d7, d7                       \n"
    452 
    453     // combine source lines
    454     "vadd.u16     q0, q2                       \n"
    455     "vadd.u16     d4, d3, d7                   \n"
    456 
    457     // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    458     "vqrshrn.u16  d4, q2, #2                   \n"
    459 
    460     // Shuffle 2,3 reg around so that 2 can be added to the
    461     //  0,1 reg and 3 can be added to the 4,5 reg.  This
    462     //  requires expanding from u8 to u16 as the 0,1 and 4,5
    463     //  registers are already expanded.  Then do transposes
    464     //  to get aligned.
    465     // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    466     "vmovl.u8     q1, d2                       \n"
    467     "vmovl.u8     q3, d6                       \n"
    468 
    469     // combine source lines
    470     "vadd.u16     q1, q3                       \n"
    471 
    472     // d4 = xx 20 xx 30 xx 22 xx 32
    473     // d5 = xx 21 xx 31 xx 23 xx 33
    474     "vtrn.u32     d2, d3                       \n"
    475 
    476     // d4 = xx 20 xx 21 xx 22 xx 23
    477     // d5 = xx 30 xx 31 xx 32 xx 33
    478     "vtrn.u16     d2, d3                       \n"
    479 
    480     // 0+1+2, 3+4+5
    481     "vadd.u16     q0, q1                       \n"
    482 
    483     // Need to divide, but can't downshift as the the value
    484     //  isn't a power of 2.  So multiply by 65536 / n
    485     //  and take the upper 16 bits.
    486     "vqrdmulh.s16 q0, q13                      \n"
    487 
    488     // Align for table lookup, vtbl requires registers to
    489     //  be adjacent
    490     "vmov.u8      d2, d4                       \n"
    491 
    492     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    493     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
    494 
    495     "vst1.u8      {d3}, [%1]!                  \n"
    496     "vst1.u32     {d4[0]}, [%1]!               \n"
    497     "subs         %2, #12                      \n"
    498     "bhi          1b                           \n"
    499     : "+r"(src_ptr),          // %0
    500       "+r"(dst_ptr),          // %1
    501       "+r"(dst_width),        // %2
    502       "+r"(src_stride)        // %3
    503     : "r"(mult38_div6),       // %4
    504       "r"(shuf38_2)           // %5
    505     : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
    506   );
    507 }
    508 
    509 /**
    510  * SSE2 downscalers with interpolation.
    511  *
    512  * Provided by Frank Barchard (fbarchard (at) google.com)
    513  *
    514  */
    515 
    516 // Constants for SSE2 code
    517 #elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \
    518     !defined(YUV_DISABLE_ASM)
    519 #if defined(_MSC_VER)
    520 #define TALIGN16(t, var) __declspec(align(16)) t _ ## var
    521 #elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
    522 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
    523 #else
    524 #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
    525 #endif
    526 
    527 #if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
    528     defined(__i386__)
    529 #define DECLARE_FUNCTION(name)                                                 \
    530     ".text                                     \n"                             \
    531     ".globl _" #name "                         \n"                             \
    532 "_" #name ":                                   \n"
    533 #else
    534 #define DECLARE_FUNCTION(name)                                                 \
    535     ".text                                     \n"                             \
    536     ".global " #name "                         \n"                             \
    537 #name ":                                       \n"
    538 #endif
    539 
    540 
    541 // Offsets for source bytes 0 to 9
    542 //extern "C"
    543 TALIGN16(const uint8, shuf0[16]) =
    544   { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
    545 
    546 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
    547 //extern "C"
    548 TALIGN16(const uint8, shuf1[16]) =
    549   { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
    550 
    551 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
    552 //extern "C"
    553 TALIGN16(const uint8, shuf2[16]) =
    554   { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
    555 
    556 // Offsets for source bytes 0 to 10
    557 //extern "C"
    558 TALIGN16(const uint8, shuf01[16]) =
    559   { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
    560 
    561 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
    562 //extern "C"
    563 TALIGN16(const uint8, shuf11[16]) =
    564   { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
    565 
    566 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
    567 //extern "C"
    568 TALIGN16(const uint8, shuf21[16]) =
    569   { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
    570 
    571 // Coefficients for source bytes 0 to 10
    572 //extern "C"
    573 TALIGN16(const uint8, madd01[16]) =
    574   { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
    575 
    576 // Coefficients for source bytes 10 to 21
    577 //extern "C"
    578 TALIGN16(const uint8, madd11[16]) =
    579   { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
    580 
    581 // Coefficients for source bytes 21 to 31
    582 //extern "C"
    583 TALIGN16(const uint8, madd21[16]) =
    584   { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
    585 
    586 // Coefficients for source bytes 21 to 31
    587 //extern "C"
    588 TALIGN16(const int16, round34[8]) =
    589   { 2, 2, 2, 2, 2, 2, 2, 2 };
    590 
    591 //extern "C"
    592 TALIGN16(const uint8, shuf38a[16]) =
    593   { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    594 
    595 //extern "C"
    596 TALIGN16(const uint8, shuf38b[16]) =
    597   { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
    598 
    599 // Arrange words 0,3,6 into 0,1,2
    600 //extern "C"
    601 TALIGN16(const uint8, shufac0[16]) =
    602   { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    603 
    604 // Arrange words 0,3,6 into 3,4,5
    605 //extern "C"
    606 TALIGN16(const uint8, shufac3[16]) =
    607   { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
    608 
    609 // Scaling values for boxes of 3x3 and 2x3
    610 //extern "C"
    611 TALIGN16(const uint16, scaleac3[8]) =
    612   { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
    613 
    614 // Arrange first value for pixels 0,1,2,3,4,5
    615 //extern "C"
    616 TALIGN16(const uint8, shufab0[16]) =
    617   { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
    618 
    619 // Arrange second value for pixels 0,1,2,3,4,5
    620 //extern "C"
    621 TALIGN16(const uint8, shufab1[16]) =
    622   { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
    623 
    624 // Arrange third value for pixels 0,1,2,3,4,5
    625 //extern "C"
    626 TALIGN16(const uint8, shufab2[16]) =
    627   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
    628 
    629 // Scaling values for boxes of 3x2 and 2x2
    630 //extern "C"
    631 TALIGN16(const uint16, scaleab2[8]) =
    632   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
    633 #endif
    634 
    635 #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER)
    636 
    637 #define HAS_SCALEROWDOWN2_SSE2
    638 // Reads 32 pixels, throws half away and writes 16 pixels.
    639 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
    640 __declspec(naked)
    641 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
    642                                uint8* dst_ptr, int dst_width) {
    643   __asm {
    644     mov        eax, [esp + 4]        // src_ptr
    645                                      // src_stride ignored
    646     mov        edx, [esp + 12]       // dst_ptr
    647     mov        ecx, [esp + 16]       // dst_width
    648     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    649     psrlw      xmm5, 8
    650 
    651   wloop:
    652     movdqa     xmm0, [eax]
    653     movdqa     xmm1, [eax + 16]
    654     lea        eax,  [eax + 32]
    655     pand       xmm0, xmm5
    656     pand       xmm1, xmm5
    657     packuswb   xmm0, xmm1
    658     movdqa     [edx], xmm0
    659     lea        edx, [edx + 16]
    660     sub        ecx, 16
    661     ja         wloop
    662 
    663     ret
    664   }
    665 }
    666 // Blends 32x2 rectangle to 16x1.
    667 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
    668 __declspec(naked)
    669 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
    670                            uint8* dst_ptr, int dst_width) {
    671   __asm {
    672     push       esi
    673     mov        eax, [esp + 4 + 4]    // src_ptr
    674     mov        esi, [esp + 4 + 8]    // src_stride
    675     mov        edx, [esp + 4 + 12]   // dst_ptr
    676     mov        ecx, [esp + 4 + 16]   // dst_width
    677     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    678     psrlw      xmm5, 8
    679 
    680   wloop:
    681     movdqa     xmm0, [eax]
    682     movdqa     xmm1, [eax + 16]
    683     movdqa     xmm2, [eax + esi]
    684     movdqa     xmm3, [eax + esi + 16]
    685     lea        eax,  [eax + 32]
    686     pavgb      xmm0, xmm2            // average rows
    687     pavgb      xmm1, xmm3
    688 
    689     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    690     psrlw      xmm0, 8
    691     movdqa     xmm3, xmm1
    692     psrlw      xmm1, 8
    693     pand       xmm2, xmm5
    694     pand       xmm3, xmm5
    695     pavgw      xmm0, xmm2
    696     pavgw      xmm1, xmm3
    697     packuswb   xmm0, xmm1
    698 
    699     movdqa     [edx], xmm0
    700     lea        edx, [edx + 16]
    701     sub        ecx, 16
    702     ja         wloop
    703 
    704     pop        esi
    705     ret
    706   }
    707 }
    708 
    709 #define HAS_SCALEROWDOWN4_SSE2
    710 // Point samples 32 pixels to 8 pixels.
    711 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    712 __declspec(naked)
    713 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
    714                                uint8* dst_ptr, int dst_width) {
    715   __asm {
    716     pushad
    717     mov        esi, [esp + 32 + 4]   // src_ptr
    718                                      // src_stride ignored
    719     mov        edi, [esp + 32 + 12]  // dst_ptr
    720     mov        ecx, [esp + 32 + 16]  // dst_width
    721     pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
    722     psrld      xmm5, 24
    723 
    724   wloop:
    725     movdqa     xmm0, [esi]
    726     movdqa     xmm1, [esi + 16]
    727     lea        esi,  [esi + 32]
    728     pand       xmm0, xmm5
    729     pand       xmm1, xmm5
    730     packuswb   xmm0, xmm1
    731     packuswb   xmm0, xmm0
    732     movq       qword ptr [edi], xmm0
    733     lea        edi, [edi + 8]
    734     sub        ecx, 8
    735     ja         wloop
    736 
    737     popad
    738     ret
    739   }
    740 }
    741 
    742 // Blends 32x4 rectangle to 8x1.
    743 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    744 __declspec(naked)
    745 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
    746                                   uint8* dst_ptr, int dst_width) {
    747   __asm {
    748     pushad
    749     mov        esi, [esp + 32 + 4]   // src_ptr
    750     mov        ebx, [esp + 32 + 8]   // src_stride
    751     mov        edi, [esp + 32 + 12]  // dst_ptr
    752     mov        ecx, [esp + 32 + 16]  // dst_width
    753     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    754     psrlw      xmm7, 8
    755     lea        edx, [ebx + ebx * 2]  // src_stride * 3
    756 
    757   wloop:
    758     movdqa     xmm0, [esi]
    759     movdqa     xmm1, [esi + 16]
    760     movdqa     xmm2, [esi + ebx]
    761     movdqa     xmm3, [esi + ebx + 16]
    762     pavgb      xmm0, xmm2            // average rows
    763     pavgb      xmm1, xmm3
    764     movdqa     xmm2, [esi + ebx * 2]
    765     movdqa     xmm3, [esi + ebx * 2 + 16]
    766     movdqa     xmm4, [esi + edx]
    767     movdqa     xmm5, [esi + edx + 16]
    768     lea        esi, [esi + 32]
    769     pavgb      xmm2, xmm4
    770     pavgb      xmm3, xmm5
    771     pavgb      xmm0, xmm2
    772     pavgb      xmm1, xmm3
    773 
    774     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    775     psrlw      xmm0, 8
    776     movdqa     xmm3, xmm1
    777     psrlw      xmm1, 8
    778     pand       xmm2, xmm7
    779     pand       xmm3, xmm7
    780     pavgw      xmm0, xmm2
    781     pavgw      xmm1, xmm3
    782     packuswb   xmm0, xmm1
    783 
    784     movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    785     psrlw      xmm0, 8
    786     pand       xmm2, xmm7
    787     pavgw      xmm0, xmm2
    788     packuswb   xmm0, xmm0
    789 
    790     movq       qword ptr [edi], xmm0
    791     lea        edi, [edi + 8]
    792     sub        ecx, 8
    793     ja         wloop
    794 
    795     popad
    796     ret
    797   }
    798 }
    799 
    800 #define HAS_SCALEROWDOWN8_SSE2
    801 // Point samples 32 pixels to 4 pixels.
    802 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
    803 __declspec(naked)
    804 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
    805                                uint8* dst_ptr, int dst_width) {
    806   __asm {
    807     pushad
    808     mov        esi, [esp + 32 + 4]   // src_ptr
    809                                      // src_stride ignored
    810     mov        edi, [esp + 32 + 12]  // dst_ptr
    811     mov        ecx, [esp + 32 + 16]  // dst_width
    812     pcmpeqb    xmm5, xmm5            // generate mask isolating 1 src 8 bytes
    813     psrlq      xmm5, 56
    814 
    815   wloop:
    816     movdqa     xmm0, [esi]
    817     movdqa     xmm1, [esi + 16]
    818     lea        esi,  [esi + 32]
    819     pand       xmm0, xmm5
    820     pand       xmm1, xmm5
    821     packuswb   xmm0, xmm1  // 32->16
    822     packuswb   xmm0, xmm0  // 16->8
    823     packuswb   xmm0, xmm0  // 8->4
    824     movd       dword ptr [edi], xmm0
    825     lea        edi, [edi + 4]
    826     sub        ecx, 4
    827     ja         wloop
    828 
    829     popad
    830     ret
    831   }
    832 }
    833 
    834 // Blends 32x8 rectangle to 4x1.
    835 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
    836 __declspec(naked)
    837 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
    838                                   uint8* dst_ptr, int dst_width) {
    839   __asm {
    840     pushad
    841     mov        esi, [esp + 32 + 4]   // src_ptr
    842     mov        ebx, [esp + 32 + 8]   // src_stride
    843     mov        edi, [esp + 32 + 12]  // dst_ptr
    844     mov        ecx, [esp + 32 + 16]  // dst_width
    845     lea        edx, [ebx + ebx * 2]  // src_stride * 3
    846     pxor       xmm7, xmm7
    847 
    848   wloop:
    849     movdqa     xmm0, [esi]           // average 8 rows to 1
    850     movdqa     xmm1, [esi + 16]
    851     movdqa     xmm2, [esi + ebx]
    852     movdqa     xmm3, [esi + ebx + 16]
    853     pavgb      xmm0, xmm2
    854     pavgb      xmm1, xmm3
    855     movdqa     xmm2, [esi + ebx * 2]
    856     movdqa     xmm3, [esi + ebx * 2 + 16]
    857     movdqa     xmm4, [esi + edx]
    858     movdqa     xmm5, [esi + edx + 16]
    859     lea        ebp, [esi + ebx * 4]
    860     lea        esi, [esi + 32]
    861     pavgb      xmm2, xmm4
    862     pavgb      xmm3, xmm5
    863     pavgb      xmm0, xmm2
    864     pavgb      xmm1, xmm3
    865 
    866     movdqa     xmm2, [ebp]
    867     movdqa     xmm3, [ebp + 16]
    868     movdqa     xmm4, [ebp + ebx]
    869     movdqa     xmm5, [ebp + ebx + 16]
    870     pavgb      xmm2, xmm4
    871     pavgb      xmm3, xmm5
    872     movdqa     xmm4, [ebp + ebx * 2]
    873     movdqa     xmm5, [ebp + ebx * 2 + 16]
    874     movdqa     xmm6, [ebp + edx]
    875     pavgb      xmm4, xmm6
    876     movdqa     xmm6, [ebp + edx + 16]
    877     pavgb      xmm5, xmm6
    878     pavgb      xmm2, xmm4
    879     pavgb      xmm3, xmm5
    880     pavgb      xmm0, xmm2
    881     pavgb      xmm1, xmm3
    882 
    883     psadbw     xmm0, xmm7            // average 32 pixels to 4
    884     psadbw     xmm1, xmm7
    885     pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    886     pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    887     por        xmm0, xmm1            //      -> 3201
    888     psrlw      xmm0, 3
    889     packuswb   xmm0, xmm0
    890     packuswb   xmm0, xmm0
    891     movd       dword ptr [edi], xmm0
    892 
    893     lea        edi, [edi + 4]
    894     sub        ecx, 4
    895     ja         wloop
    896 
    897     popad
    898     ret
    899   }
    900 }
    901 
    902 #define HAS_SCALEROWDOWN34_SSSE3
    903 // Point samples 32 pixels to 24 pixels.
    904 // Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
    905 // Then shuffled to do the scaling.
    906 
    907 // Note that movdqa+palign may be better than movdqu.
    908 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    909 __declspec(naked)
    910 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
    911                                  uint8* dst_ptr, int dst_width) {
    912   __asm {
    913     pushad
    914     mov        esi, [esp + 32 + 4]   // src_ptr
    915                                      // src_stride ignored
    916     mov        edi, [esp + 32 + 12]  // dst_ptr
    917     mov        ecx, [esp + 32 + 16]  // dst_width
    918     movdqa     xmm3, _shuf0
    919     movdqa     xmm4, _shuf1
    920     movdqa     xmm5, _shuf2
    921 
    922   wloop:
    923     movdqa     xmm0, [esi]
    924     movdqa     xmm1, [esi + 16]
    925     lea        esi,  [esi + 32]
    926     movdqa     xmm2, xmm1
    927     palignr    xmm1, xmm0, 8
    928     pshufb     xmm0, xmm3
    929     pshufb     xmm1, xmm4
    930     pshufb     xmm2, xmm5
    931     movq       qword ptr [edi], xmm0
    932     movq       qword ptr [edi + 8], xmm1
    933     movq       qword ptr [edi + 16], xmm2
    934     lea        edi, [edi + 24]
    935     sub        ecx, 24
    936     ja         wloop
    937 
    938     popad
    939     ret
    940   }
    941 }
    942 
    943 // Blends 32x2 rectangle to 24x1
    944 // Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
    945 // Then shuffled to do the scaling.
    946 
    947 // Register usage:
    948 // xmm0 src_row 0
    949 // xmm1 src_row 1
    950 // xmm2 shuf 0
    951 // xmm3 shuf 1
    952 // xmm4 shuf 2
    953 // xmm5 madd 0
    954 // xmm6 madd 1
    955 // xmm7 round34
    956 
    957 // Note that movdqa+palign may be better than movdqu.
    958 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    959 __declspec(naked)
    960 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
    961                                        uint8* dst_ptr, int dst_width) {
    962   __asm {
    963     pushad
    964     mov        esi, [esp + 32 + 4]   // src_ptr
    965     mov        ebx, [esp + 32 + 8]   // src_stride
    966     mov        edi, [esp + 32 + 12]  // dst_ptr
    967     mov        ecx, [esp + 32 + 16]  // dst_width
    968     movdqa     xmm2, _shuf01
    969     movdqa     xmm3, _shuf11
    970     movdqa     xmm4, _shuf21
    971     movdqa     xmm5, _madd01
    972     movdqa     xmm6, _madd11
    973     movdqa     xmm7, _round34
    974 
    975   wloop:
    976     movdqa     xmm0, [esi]           // pixels 0..7
    977     movdqa     xmm1, [esi+ebx]
    978     pavgb      xmm0, xmm1
    979     pshufb     xmm0, xmm2
    980     pmaddubsw  xmm0, xmm5
    981     paddsw     xmm0, xmm7
    982     psrlw      xmm0, 2
    983     packuswb   xmm0, xmm0
    984     movq       qword ptr [edi], xmm0
    985     movdqu     xmm0, [esi+8]         // pixels 8..15
    986     movdqu     xmm1, [esi+ebx+8]
    987     pavgb      xmm0, xmm1
    988     pshufb     xmm0, xmm3
    989     pmaddubsw  xmm0, xmm6
    990     paddsw     xmm0, xmm7
    991     psrlw      xmm0, 2
    992     packuswb   xmm0, xmm0
    993     movq       qword ptr [edi+8], xmm0
    994     movdqa     xmm0, [esi+16]        // pixels 16..23
    995     movdqa     xmm1, [esi+ebx+16]
    996     lea        esi, [esi+32]
    997     pavgb      xmm0, xmm1
    998     pshufb     xmm0, xmm4
    999     movdqa     xmm1, _madd21
   1000     pmaddubsw  xmm0, xmm1
   1001     paddsw     xmm0, xmm7
   1002     psrlw      xmm0, 2
   1003     packuswb   xmm0, xmm0
   1004     movq       qword ptr [edi+16], xmm0
   1005     lea        edi, [edi+24]
   1006     sub        ecx, 24
   1007     ja         wloop
   1008 
   1009     popad
   1010     ret
   1011   }
   1012 }
   1013 
   1014 // Note that movdqa+palign may be better than movdqu.
   1015 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
   1016 __declspec(naked)
   1017 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1018                                        uint8* dst_ptr, int dst_width) {
   1019   __asm {
   1020     pushad
   1021     mov        esi, [esp + 32 + 4]   // src_ptr
   1022     mov        ebx, [esp + 32 + 8]   // src_stride
   1023     mov        edi, [esp + 32 + 12]  // dst_ptr
   1024     mov        ecx, [esp + 32 + 16]  // dst_width
   1025     movdqa     xmm2, _shuf01
   1026     movdqa     xmm3, _shuf11
   1027     movdqa     xmm4, _shuf21
   1028     movdqa     xmm5, _madd01
   1029     movdqa     xmm6, _madd11
   1030     movdqa     xmm7, _round34
   1031 
   1032   wloop:
   1033     movdqa     xmm0, [esi]           // pixels 0..7
   1034     movdqa     xmm1, [esi+ebx]
   1035     pavgb      xmm1, xmm0
   1036     pavgb      xmm0, xmm1
   1037     pshufb     xmm0, xmm2
   1038     pmaddubsw  xmm0, xmm5
   1039     paddsw     xmm0, xmm7
   1040     psrlw      xmm0, 2
   1041     packuswb   xmm0, xmm0
   1042     movq       qword ptr [edi], xmm0
   1043     movdqu     xmm0, [esi+8]         // pixels 8..15
   1044     movdqu     xmm1, [esi+ebx+8]
   1045     pavgb      xmm1, xmm0
   1046     pavgb      xmm0, xmm1
   1047     pshufb     xmm0, xmm3
   1048     pmaddubsw  xmm0, xmm6
   1049     paddsw     xmm0, xmm7
   1050     psrlw      xmm0, 2
   1051     packuswb   xmm0, xmm0
   1052     movq       qword ptr [edi+8], xmm0
   1053     movdqa     xmm0, [esi+16]        // pixels 16..23
   1054     movdqa     xmm1, [esi+ebx+16]
   1055     lea        esi, [esi+32]
   1056     pavgb      xmm1, xmm0
   1057     pavgb      xmm0, xmm1
   1058     pshufb     xmm0, xmm4
   1059     movdqa     xmm1, _madd21
   1060     pmaddubsw  xmm0, xmm1
   1061     paddsw     xmm0, xmm7
   1062     psrlw      xmm0, 2
   1063     packuswb   xmm0, xmm0
   1064     movq       qword ptr [edi+16], xmm0
   1065     lea        edi, [edi+24]
   1066     sub        ecx, 24
   1067     ja         wloop
   1068 
   1069     popad
   1070     ret
   1071   }
   1072 }
   1073 
   1074 #define HAS_SCALEROWDOWN38_SSSE3
   1075 // 3/8 point sampler
   1076 
   1077 // Scale 32 pixels to 12
   1078 __declspec(naked)
   1079 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
   1080                                  uint8* dst_ptr, int dst_width) {
   1081   __asm {
   1082     pushad
   1083     mov        esi, [esp + 32 + 4]   // src_ptr
   1084     mov        edx, [esp + 32 + 8]   // src_stride
   1085     mov        edi, [esp + 32 + 12]  // dst_ptr
   1086     mov        ecx, [esp + 32 + 16]  // dst_width
   1087     movdqa     xmm4, _shuf38a
   1088     movdqa     xmm5, _shuf38b
   1089 
   1090   xloop:
   1091     movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
   1092     movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
   1093     lea        esi, [esi + 32]
   1094     pshufb     xmm0, xmm4
   1095     pshufb     xmm1, xmm5
   1096     paddusb    xmm0, xmm1
   1097 
   1098     movq       qword ptr [edi], xmm0 // write 12 pixels
   1099     movhlps    xmm1, xmm0
   1100     movd       [edi + 8], xmm1
   1101     lea        edi, [edi + 12]
   1102     sub        ecx, 12
   1103     ja         xloop
   1104 
   1105     popad
   1106     ret
   1107   }
   1108 }
   1109 
   1110 // Scale 16x3 pixels to 6x1 with interpolation
   1111 __declspec(naked)
   1112 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1113                                        uint8* dst_ptr, int dst_width) {
   1114   __asm {
   1115     pushad
   1116     mov        esi, [esp + 32 + 4]   // src_ptr
   1117     mov        edx, [esp + 32 + 8]   // src_stride
   1118     mov        edi, [esp + 32 + 12]  // dst_ptr
   1119     mov        ecx, [esp + 32 + 16]  // dst_width
   1120     movdqa     xmm4, _shufac0
   1121     movdqa     xmm5, _shufac3
   1122     movdqa     xmm6, _scaleac3
   1123     pxor       xmm7, xmm7
   1124 
   1125   xloop:
   1126     movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
   1127     movdqa     xmm2, [esi + edx]
   1128     movhlps    xmm1, xmm0
   1129     movhlps    xmm3, xmm2
   1130     punpcklbw  xmm0, xmm7
   1131     punpcklbw  xmm1, xmm7
   1132     punpcklbw  xmm2, xmm7
   1133     punpcklbw  xmm3, xmm7
   1134     paddusw    xmm0, xmm2
   1135     paddusw    xmm1, xmm3
   1136     movdqa     xmm2, [esi + edx * 2]
   1137     lea        esi, [esi + 16]
   1138     movhlps    xmm3, xmm2
   1139     punpcklbw  xmm2, xmm7
   1140     punpcklbw  xmm3, xmm7
   1141     paddusw    xmm0, xmm2
   1142     paddusw    xmm1, xmm3
   1143 
   1144     movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
   1145     psrldq     xmm0, 2
   1146     paddusw    xmm2, xmm0
   1147     psrldq     xmm0, 2
   1148     paddusw    xmm2, xmm0
   1149     pshufb     xmm2, xmm4
   1150 
   1151     movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
   1152     psrldq     xmm1, 2
   1153     paddusw    xmm3, xmm1
   1154     psrldq     xmm1, 2
   1155     paddusw    xmm3, xmm1
   1156     pshufb     xmm3, xmm5
   1157     paddusw    xmm2, xmm3
   1158 
   1159     pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
   1160     packuswb   xmm2, xmm2
   1161 
   1162     movd       [edi], xmm2           // write 6 pixels
   1163     pextrw     eax, xmm2, 2
   1164     mov        [edi + 4], ax
   1165     lea        edi, [edi + 6]
   1166     sub        ecx, 6
   1167     ja         xloop
   1168 
   1169     popad
   1170     ret
   1171   }
   1172 }
   1173 
   1174 // Scale 16x2 pixels to 6x1 with interpolation
   1175 __declspec(naked)
   1176 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1177                                        uint8* dst_ptr, int dst_width) {
   1178   __asm {
   1179     pushad
   1180     mov        esi, [esp + 32 + 4]   // src_ptr
   1181     mov        edx, [esp + 32 + 8]   // src_stride
   1182     mov        edi, [esp + 32 + 12]  // dst_ptr
   1183     mov        ecx, [esp + 32 + 16]  // dst_width
   1184     movdqa     xmm4, _shufab0
   1185     movdqa     xmm5, _shufab1
   1186     movdqa     xmm6, _shufab2
   1187     movdqa     xmm7, _scaleab2
   1188 
   1189   xloop:
   1190     movdqa     xmm2, [esi]           // average 2 rows into xmm2
   1191     pavgb      xmm2, [esi + edx]
   1192     lea        esi, [esi + 16]
   1193 
   1194     movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
   1195     pshufb     xmm0, xmm4
   1196     movdqa     xmm1, xmm2
   1197     pshufb     xmm1, xmm5
   1198     paddusw    xmm0, xmm1
   1199     pshufb     xmm2, xmm6
   1200     paddusw    xmm0, xmm2
   1201 
   1202     pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
   1203     packuswb   xmm0, xmm0
   1204 
   1205     movd       [edi], xmm0           // write 6 pixels
   1206     pextrw     eax, xmm0, 2
   1207     mov        [edi + 4], ax
   1208     lea        edi, [edi + 6]
   1209     sub        ecx, 6
   1210     ja         xloop
   1211 
   1212     popad
   1213     ret
   1214   }
   1215 }
   1216 
   1217 #define HAS_SCALEADDROWS_SSE2
   1218 
   1219 // Reads 8xN bytes and produces 16 shorts at a time.
   1220 __declspec(naked)
   1221 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   1222                               uint16* dst_ptr, int src_width,
   1223                               int src_height) {
   1224   __asm {
   1225     pushad
   1226     mov        esi, [esp + 32 + 4]   // src_ptr
   1227     mov        edx, [esp + 32 + 8]   // src_stride
   1228     mov        edi, [esp + 32 + 12]  // dst_ptr
   1229     mov        ecx, [esp + 32 + 16]  // dst_width
   1230     mov        ebx, [esp + 32 + 20]  // height
   1231     pxor       xmm5, xmm5
   1232     dec        ebx
   1233 
   1234   xloop:
   1235     // first row
   1236     movdqa     xmm2, [esi]
   1237     lea        eax, [esi + edx]
   1238     movhlps    xmm3, xmm2
   1239     mov        ebp, ebx
   1240     punpcklbw  xmm2, xmm5
   1241     punpcklbw  xmm3, xmm5
   1242 
   1243     // sum remaining rows
   1244   yloop:
   1245     movdqa     xmm0, [eax]       // read 16 pixels
   1246     lea        eax, [eax + edx]  // advance to next row
   1247     movhlps    xmm1, xmm0
   1248     punpcklbw  xmm0, xmm5
   1249     punpcklbw  xmm1, xmm5
   1250     paddusw    xmm2, xmm0        // sum 16 words
   1251     paddusw    xmm3, xmm1
   1252     sub        ebp, 1
   1253     ja         yloop
   1254 
   1255     movdqa     [edi], xmm2
   1256     movdqa     [edi + 16], xmm3
   1257     lea        edi, [edi + 32]
   1258     lea        esi, [esi + 16]
   1259 
   1260     sub        ecx, 16
   1261     ja         xloop
   1262 
   1263     popad
   1264     ret
   1265   }
   1266 }
   1267 
   1268 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
   1269 #define HAS_SCALEFILTERROWS_SSE2
   1270 __declspec(naked)
   1271 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
   1272                                  int src_stride, int dst_width,
   1273                                  int source_y_fraction) {
   1274   __asm {
   1275     push       esi
   1276     push       edi
   1277     mov        edi, [esp + 8 + 4]   // dst_ptr
   1278     mov        esi, [esp + 8 + 8]   // src_ptr
   1279     mov        edx, [esp + 8 + 12]  // src_stride
   1280     mov        ecx, [esp + 8 + 16]  // dst_width
   1281     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   1282     cmp        eax, 0
   1283     je         xloop1
   1284     cmp        eax, 128
   1285     je         xloop2
   1286 
   1287     movd       xmm6, eax            // xmm6 = y fraction
   1288     punpcklwd  xmm6, xmm6
   1289     pshufd     xmm6, xmm6, 0
   1290     neg        eax                  // xmm5 = 256 - y fraction
   1291     add        eax, 256
   1292     movd       xmm5, eax
   1293     punpcklwd  xmm5, xmm5
   1294     pshufd     xmm5, xmm5, 0
   1295     pxor       xmm7, xmm7
   1296 
   1297   xloop:
   1298     movdqa     xmm0, [esi]
   1299     movdqa     xmm2, [esi + edx]
   1300     lea        esi, [esi + 16]
   1301     movdqa     xmm1, xmm0
   1302     movdqa     xmm3, xmm2
   1303     punpcklbw  xmm0, xmm7
   1304     punpcklbw  xmm2, xmm7
   1305     punpckhbw  xmm1, xmm7
   1306     punpckhbw  xmm3, xmm7
   1307     pmullw     xmm0, xmm5           // scale row 0
   1308     pmullw     xmm1, xmm5
   1309     pmullw     xmm2, xmm6           // scale row 1
   1310     pmullw     xmm3, xmm6
   1311     paddusw    xmm0, xmm2           // sum rows
   1312     paddusw    xmm1, xmm3
   1313     psrlw      xmm0, 8
   1314     psrlw      xmm1, 8
   1315     packuswb   xmm0, xmm1
   1316     movdqa     [edi], xmm0
   1317     lea        edi, [edi + 16]
   1318     sub        ecx, 16
   1319     ja         xloop
   1320 
   1321     mov        al, [edi - 1]
   1322     mov        [edi], al
   1323     pop        edi
   1324     pop        esi
   1325     ret
   1326 
   1327   xloop1:
   1328     movdqa     xmm0, [esi]
   1329     lea        esi, [esi + 16]
   1330     movdqa     [edi], xmm0
   1331     lea        edi, [edi + 16]
   1332     sub        ecx, 16
   1333     ja         xloop1
   1334 
   1335     mov        al, [edi - 1]
   1336     mov        [edi], al
   1337     pop        edi
   1338     pop        esi
   1339     ret
   1340 
   1341   xloop2:
   1342     movdqa     xmm0, [esi]
   1343     movdqa     xmm2, [esi + edx]
   1344     lea        esi, [esi + 16]
   1345     pavgb      xmm0, xmm2
   1346     movdqa     [edi], xmm0
   1347     lea        edi, [edi + 16]
   1348     sub        ecx, 16
   1349     ja         xloop2
   1350 
   1351     mov        al, [edi - 1]
   1352     mov        [edi], al
   1353     pop        edi
   1354     pop        esi
   1355     ret
   1356   }
   1357 }
   1358 
   1359 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
   1360 #define HAS_SCALEFILTERROWS_SSSE3
   1361 __declspec(naked)
   1362 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   1363                                   int src_stride, int dst_width,
   1364                                   int source_y_fraction) {
   1365   __asm {
   1366     push       esi
   1367     push       edi
   1368     mov        edi, [esp + 8 + 4]   // dst_ptr
   1369     mov        esi, [esp + 8 + 8]   // src_ptr
   1370     mov        edx, [esp + 8 + 12]  // src_stride
   1371     mov        ecx, [esp + 8 + 16]  // dst_width
   1372     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   1373     shr        eax, 1
   1374     cmp        eax, 0
   1375     je         xloop1
   1376     cmp        eax, 64
   1377     je         xloop2
   1378 
   1379     mov        ah,al
   1380     neg        al
   1381     add        al, 128
   1382     movd       xmm5, eax
   1383     punpcklwd  xmm5, xmm5
   1384     pshufd     xmm5, xmm5, 0
   1385 
   1386   xloop:
   1387     movdqa     xmm0, [esi]
   1388     movdqa     xmm2, [esi + edx]
   1389     lea        esi, [esi + 16]
   1390     movdqa     xmm1, xmm0
   1391     punpcklbw  xmm0, xmm2
   1392     punpckhbw  xmm1, xmm2
   1393     pmaddubsw  xmm0, xmm5
   1394     pmaddubsw  xmm1, xmm5
   1395     psrlw      xmm0, 7
   1396     psrlw      xmm1, 7
   1397     packuswb   xmm0, xmm1
   1398     movdqa     [edi], xmm0
   1399     lea        edi, [edi + 16]
   1400     sub        ecx, 16
   1401     ja         xloop
   1402 
   1403     mov        al, [edi - 1]
   1404     mov        [edi], al
   1405     pop        edi
   1406     pop        esi
   1407     ret
   1408 
   1409   xloop1:
   1410     movdqa     xmm0, [esi]
   1411     lea        esi, [esi + 16]
   1412     movdqa     [edi], xmm0
   1413     lea        edi, [edi + 16]
   1414     sub        ecx, 16
   1415     ja         xloop1
   1416 
   1417     mov        al, [edi - 1]
   1418     mov        [edi], al
   1419     pop        edi
   1420     pop        esi
   1421     ret
   1422 
   1423   xloop2:
   1424     movdqa     xmm0, [esi]
   1425     movdqa     xmm2, [esi + edx]
   1426     lea        esi, [esi + 16]
   1427     pavgb      xmm0, xmm2
   1428     movdqa     [edi], xmm0
   1429     lea        edi, [edi + 16]
   1430     sub        ecx, 16
   1431     ja         xloop2
   1432 
   1433     mov        al, [edi - 1]
   1434     mov        [edi], al
   1435     pop        edi
   1436     pop        esi
   1437     ret
   1438 
   1439   }
   1440 }
   1441 
   1442 // Note that movdqa+palign may be better than movdqu.
   1443 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
   1444 __declspec(naked)
   1445 static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   1446                                     int dst_width) {
   1447   __asm {
   1448     mov        edx, [esp + 4]    // dst_ptr
   1449     mov        eax, [esp + 8]    // src_ptr
   1450     mov        ecx, [esp + 12]   // dst_width
   1451     movdqa     xmm1, _round34
   1452     movdqa     xmm2, _shuf01
   1453     movdqa     xmm3, _shuf11
   1454     movdqa     xmm4, _shuf21
   1455     movdqa     xmm5, _madd01
   1456     movdqa     xmm6, _madd11
   1457     movdqa     xmm7, _madd21
   1458 
   1459   wloop:
   1460     movdqa     xmm0, [eax]           // pixels 0..7
   1461     pshufb     xmm0, xmm2
   1462     pmaddubsw  xmm0, xmm5
   1463     paddsw     xmm0, xmm1
   1464     psrlw      xmm0, 2
   1465     packuswb   xmm0, xmm0
   1466     movq       qword ptr [edx], xmm0
   1467     movdqu     xmm0, [eax+8]         // pixels 8..15
   1468     pshufb     xmm0, xmm3
   1469     pmaddubsw  xmm0, xmm6
   1470     paddsw     xmm0, xmm1
   1471     psrlw      xmm0, 2
   1472     packuswb   xmm0, xmm0
   1473     movq       qword ptr [edx+8], xmm0
   1474     movdqa     xmm0, [eax+16]        // pixels 16..23
   1475     lea        eax, [eax+32]
   1476     pshufb     xmm0, xmm4
   1477     pmaddubsw  xmm0, xmm7
   1478     paddsw     xmm0, xmm1
   1479     psrlw      xmm0, 2
   1480     packuswb   xmm0, xmm0
   1481     movq       qword ptr [edx+16], xmm0
   1482     lea        edx, [edx+24]
   1483     sub        ecx, 24
   1484     ja         wloop
   1485     ret
   1486   }
   1487 }
   1488 
   1489 #elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
   1490 
   1491 // GCC versions of row functions are verbatim conversions from Visual C.
   1492 // Generated using gcc disassembly on Visual C object file:
   1493 // objdump -D yuvscaler.obj >yuvscaler.txt
   1494 #define HAS_SCALEROWDOWN2_SSE2
   1495 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
   1496                                uint8* dst_ptr, int dst_width) {
   1497   asm volatile (
   1498   "pcmpeqb    %%xmm5,%%xmm5                    \n"
   1499   "psrlw      $0x8,%%xmm5                      \n"
   1500 "1:"
   1501   "movdqa     (%0),%%xmm0                      \n"
   1502   "movdqa     0x10(%0),%%xmm1                  \n"
   1503   "lea        0x20(%0),%0                      \n"
   1504   "pand       %%xmm5,%%xmm0                    \n"
   1505   "pand       %%xmm5,%%xmm1                    \n"
   1506   "packuswb   %%xmm1,%%xmm0                    \n"
   1507   "movdqa     %%xmm0,(%1)                      \n"
   1508   "lea        0x10(%1),%1                      \n"
   1509   "sub        $0x10,%2                         \n"
   1510   "ja         1b                               \n"
   1511   : "+r"(src_ptr),    // %0
   1512     "+r"(dst_ptr),    // %1
   1513     "+r"(dst_width)   // %2
   1514   :
   1515   : "memory", "cc"
   1516 );
   1517 }
   1518 
   1519 static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   1520                                   uint8* dst_ptr, int dst_width) {
   1521   asm volatile (
   1522   "pcmpeqb    %%xmm5,%%xmm5                    \n"
   1523   "psrlw      $0x8,%%xmm5                      \n"
   1524 "1:"
   1525   "movdqa     (%0),%%xmm0                      \n"
   1526   "movdqa     0x10(%0),%%xmm1                  \n"
   1527   "movdqa     (%0,%3,1),%%xmm2                 \n"
   1528   "movdqa     0x10(%0,%3,1),%%xmm3             \n"
   1529   "lea        0x20(%0),%0                      \n"
   1530   "pavgb      %%xmm2,%%xmm0                    \n"
   1531   "pavgb      %%xmm3,%%xmm1                    \n"
   1532   "movdqa     %%xmm0,%%xmm2                    \n"
   1533   "psrlw      $0x8,%%xmm0                      \n"
   1534   "movdqa     %%xmm1,%%xmm3                    \n"
   1535   "psrlw      $0x8,%%xmm1                      \n"
   1536   "pand       %%xmm5,%%xmm2                    \n"
   1537   "pand       %%xmm5,%%xmm3                    \n"
   1538   "pavgw      %%xmm2,%%xmm0                    \n"
   1539   "pavgw      %%xmm3,%%xmm1                    \n"
   1540   "packuswb   %%xmm1,%%xmm0                    \n"
   1541   "movdqa     %%xmm0,(%1)                      \n"
   1542   "lea        0x10(%1),%1                      \n"
   1543   "sub        $0x10,%2                         \n"
   1544   "ja         1b                               \n"
   1545   : "+r"(src_ptr),    // %0
   1546     "+r"(dst_ptr),    // %1
   1547     "+r"(dst_width)   // %2
   1548   : "r"((intptr_t)(src_stride))   // %3
   1549   : "memory", "cc"
   1550 );
   1551 }
   1552 
   1553 #define HAS_SCALEROWDOWN4_SSE2
   1554 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
   1555                                uint8* dst_ptr, int dst_width) {
   1556   asm volatile (
   1557   "pcmpeqb    %%xmm5,%%xmm5                    \n"
   1558   "psrld      $0x18,%%xmm5                     \n"
   1559 "1:"
   1560   "movdqa     (%0),%%xmm0                      \n"
   1561   "movdqa     0x10(%0),%%xmm1                  \n"
   1562   "lea        0x20(%0),%0                      \n"
   1563   "pand       %%xmm5,%%xmm0                    \n"
   1564   "pand       %%xmm5,%%xmm1                    \n"
   1565   "packuswb   %%xmm1,%%xmm0                    \n"
   1566   "packuswb   %%xmm0,%%xmm0                    \n"
   1567   "movq       %%xmm0,(%1)                      \n"
   1568   "lea        0x8(%1),%1                       \n"
   1569   "sub        $0x8,%2                          \n"
   1570   "ja         1b                               \n"
   1571   : "+r"(src_ptr),    // %0
   1572     "+r"(dst_ptr),    // %1
   1573     "+r"(dst_width)   // %2
   1574   :
   1575   : "memory", "cc"
   1576 );
   1577 }
   1578 
   1579 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
   1580                                   uint8* dst_ptr, int dst_width) {
   1581   intptr_t temp = 0;
   1582   asm volatile (
   1583   "pcmpeqb    %%xmm7,%%xmm7                    \n"
   1584   "psrlw      $0x8,%%xmm7                      \n"
   1585   "lea        (%4,%4,2),%3                     \n"
   1586 "1:"
   1587   "movdqa     (%0),%%xmm0                      \n"
   1588   "movdqa     0x10(%0),%%xmm1                  \n"
   1589   "movdqa     (%0,%4,1),%%xmm2                 \n"
   1590   "movdqa     0x10(%0,%4,1),%%xmm3             \n"
   1591   "pavgb      %%xmm2,%%xmm0                    \n"
   1592   "pavgb      %%xmm3,%%xmm1                    \n"
   1593   "movdqa     (%0,%4,2),%%xmm2                 \n"
   1594   "movdqa     0x10(%0,%4,2),%%xmm3             \n"
   1595   "movdqa     (%0,%3,1),%%xmm4                 \n"
   1596   "movdqa     0x10(%0,%3,1),%%xmm5             \n"
   1597   "lea        0x20(%0),%0                      \n"
   1598   "pavgb      %%xmm4,%%xmm2                    \n"
   1599   "pavgb      %%xmm2,%%xmm0                    \n"
   1600   "pavgb      %%xmm5,%%xmm3                    \n"
   1601   "pavgb      %%xmm3,%%xmm1                    \n"
   1602   "movdqa     %%xmm0,%%xmm2                    \n"
   1603   "psrlw      $0x8,%%xmm0                      \n"
   1604   "movdqa     %%xmm1,%%xmm3                    \n"
   1605   "psrlw      $0x8,%%xmm1                      \n"
   1606   "pand       %%xmm7,%%xmm2                    \n"
   1607   "pand       %%xmm7,%%xmm3                    \n"
   1608   "pavgw      %%xmm2,%%xmm0                    \n"
   1609   "pavgw      %%xmm3,%%xmm1                    \n"
   1610   "packuswb   %%xmm1,%%xmm0                    \n"
   1611   "movdqa     %%xmm0,%%xmm2                    \n"
   1612   "psrlw      $0x8,%%xmm0                      \n"
   1613   "pand       %%xmm7,%%xmm2                    \n"
   1614   "pavgw      %%xmm2,%%xmm0                    \n"
   1615   "packuswb   %%xmm0,%%xmm0                    \n"
   1616   "movq       %%xmm0,(%1)                      \n"
   1617   "lea        0x8(%1),%1                       \n"
   1618   "sub        $0x8,%2                          \n"
   1619   "ja         1b                               \n"
   1620   : "+r"(src_ptr),     // %0
   1621     "+r"(dst_ptr),     // %1
   1622     "+r"(dst_width),   // %2
   1623     "+r"(temp)         // %3
   1624   : "r"((intptr_t)(src_stride))    // %4
   1625   : "memory", "cc"
   1626 #if defined(__x86_64__)
   1627     , "xmm6", "xmm7"
   1628 #endif
   1629 );
   1630 }
   1631 
   1632 #define HAS_SCALEROWDOWN8_SSE2
   1633 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
   1634                                uint8* dst_ptr, int dst_width) {
   1635   asm volatile (
   1636   "pcmpeqb    %%xmm5,%%xmm5                    \n"
   1637   "psrlq      $0x38,%%xmm5                     \n"
   1638 "1:"
   1639   "movdqa     (%0),%%xmm0                      \n"
   1640   "movdqa     0x10(%0),%%xmm1                  \n"
   1641   "lea        0x20(%0),%0                      \n"
   1642   "pand       %%xmm5,%%xmm0                    \n"
   1643   "pand       %%xmm5,%%xmm1                    \n"
   1644   "packuswb   %%xmm1,%%xmm0                    \n"
   1645   "packuswb   %%xmm0,%%xmm0                    \n"
   1646   "packuswb   %%xmm0,%%xmm0                    \n"
   1647   "movd       %%xmm0,(%1)                      \n"
   1648   "lea        0x4(%1),%1                       \n"
   1649   "sub        $0x4,%2                          \n"
   1650   "ja         1b                               \n"
   1651   : "+r"(src_ptr),    // %0
   1652     "+r"(dst_ptr),    // %1
   1653     "+r"(dst_width)   // %2
   1654   :
   1655   : "memory", "cc"
   1656 );
   1657 }
   1658 
   1659 #if defined(__i386__)
   1660 void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
   1661                                       uint8* dst_ptr, int dst_width);
   1662   asm(
   1663     DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
   1664     "pusha                                     \n"
   1665     "mov    0x24(%esp),%esi                    \n"
   1666     "mov    0x28(%esp),%ebx                    \n"
   1667     "mov    0x2c(%esp),%edi                    \n"
   1668     "mov    0x30(%esp),%ecx                    \n"
   1669     "lea    (%ebx,%ebx,2),%edx                 \n"
   1670     "pxor   %xmm7,%xmm7                        \n"
   1671 
   1672 "1:"
   1673     "movdqa (%esi),%xmm0                       \n"
   1674     "movdqa 0x10(%esi),%xmm1                   \n"
   1675     "movdqa (%esi,%ebx,1),%xmm2                \n"
   1676     "movdqa 0x10(%esi,%ebx,1),%xmm3            \n"
   1677     "pavgb  %xmm2,%xmm0                        \n"
   1678     "pavgb  %xmm3,%xmm1                        \n"
   1679     "movdqa (%esi,%ebx,2),%xmm2                \n"
   1680     "movdqa 0x10(%esi,%ebx,2),%xmm3            \n"
   1681     "movdqa (%esi,%edx,1),%xmm4                \n"
   1682     "movdqa 0x10(%esi,%edx,1),%xmm5            \n"
   1683     "lea    (%esi,%ebx,4),%ebp                 \n"
   1684     "lea    0x20(%esi),%esi                    \n"
   1685     "pavgb  %xmm4,%xmm2                        \n"
   1686     "pavgb  %xmm5,%xmm3                        \n"
   1687     "pavgb  %xmm2,%xmm0                        \n"
   1688     "pavgb  %xmm3,%xmm1                        \n"
   1689     "movdqa 0x0(%ebp),%xmm2                    \n"
   1690     "movdqa 0x10(%ebp),%xmm3                   \n"
   1691     "movdqa 0x0(%ebp,%ebx,1),%xmm4             \n"
   1692     "movdqa 0x10(%ebp,%ebx,1),%xmm5            \n"
   1693     "pavgb  %xmm4,%xmm2                        \n"
   1694     "pavgb  %xmm5,%xmm3                        \n"
   1695     "movdqa 0x0(%ebp,%ebx,2),%xmm4             \n"
   1696     "movdqa 0x10(%ebp,%ebx,2),%xmm5            \n"
   1697     "movdqa 0x0(%ebp,%edx,1),%xmm6             \n"
   1698     "pavgb  %xmm6,%xmm4                        \n"
   1699     "movdqa 0x10(%ebp,%edx,1),%xmm6            \n"
   1700     "pavgb  %xmm6,%xmm5                        \n"
   1701     "pavgb  %xmm4,%xmm2                        \n"
   1702     "pavgb  %xmm5,%xmm3                        \n"
   1703     "pavgb  %xmm2,%xmm0                        \n"
   1704     "pavgb  %xmm3,%xmm1                        \n"
   1705     "psadbw %xmm7,%xmm0                        \n"
   1706     "psadbw %xmm7,%xmm1                        \n"
   1707     "pshufd $0xd8,%xmm0,%xmm0                  \n"
   1708     "pshufd $0x8d,%xmm1,%xmm1                  \n"
   1709     "por    %xmm1,%xmm0                        \n"
   1710     "psrlw  $0x3,%xmm0                         \n"
   1711     "packuswb %xmm0,%xmm0                      \n"
   1712     "packuswb %xmm0,%xmm0                      \n"
   1713     "movd   %xmm0,(%edi)                       \n"
   1714     "lea    0x4(%edi),%edi                     \n"
   1715     "sub    $0x4,%ecx                          \n"
   1716     "ja     1b                                 \n"
   1717     "popa                                      \n"
   1718     "ret                                       \n"
   1719 );
   1720 
   1721 // fpic is used for magiccam plugin
   1722 #if !defined(__PIC__)
   1723 #define HAS_SCALEROWDOWN34_SSSE3
   1724 void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
   1725                                      uint8* dst_ptr, int dst_width);
   1726   asm(
   1727     DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
   1728     "pusha                                     \n"
   1729     "mov    0x24(%esp),%esi                    \n"
   1730     "mov    0x2c(%esp),%edi                    \n"
   1731     "mov    0x30(%esp),%ecx                    \n"
   1732     "movdqa _shuf0,%xmm3                       \n"
   1733     "movdqa _shuf1,%xmm4                       \n"
   1734     "movdqa _shuf2,%xmm5                       \n"
   1735 
   1736 "1:"
   1737     "movdqa (%esi),%xmm0                       \n"
   1738     "movdqa 0x10(%esi),%xmm2                   \n"
   1739     "lea    0x20(%esi),%esi                    \n"
   1740     "movdqa %xmm2,%xmm1                        \n"
   1741     "palignr $0x8,%xmm0,%xmm1                  \n"
   1742     "pshufb %xmm3,%xmm0                        \n"
   1743     "pshufb %xmm4,%xmm1                        \n"
   1744     "pshufb %xmm5,%xmm2                        \n"
   1745     "movq   %xmm0,(%edi)                       \n"
   1746     "movq   %xmm1,0x8(%edi)                    \n"
   1747     "movq   %xmm2,0x10(%edi)                   \n"
   1748     "lea    0x18(%edi),%edi                    \n"
   1749     "sub    $0x18,%ecx                         \n"
   1750     "ja     1b                                 \n"
   1751     "popa                                      \n"
   1752     "ret                                       \n"
   1753 );
   1754 
   1755 void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1756                                            uint8* dst_ptr, int dst_width);
   1757   asm(
   1758     DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
   1759     "pusha                                     \n"
   1760     "mov    0x24(%esp),%esi                    \n"
   1761     "mov    0x28(%esp),%ebp                    \n"
   1762     "mov    0x2c(%esp),%edi                    \n"
   1763     "mov    0x30(%esp),%ecx                    \n"
   1764     "movdqa _shuf01,%xmm2                      \n"
   1765     "movdqa _shuf11,%xmm3                      \n"
   1766     "movdqa _shuf21,%xmm4                      \n"
   1767     "movdqa _madd01,%xmm5                      \n"
   1768     "movdqa _madd11,%xmm6                      \n"
   1769     "movdqa _round34,%xmm7                     \n"
   1770 
   1771 "1:"
   1772     "movdqa (%esi),%xmm0                       \n"
   1773     "movdqa (%esi,%ebp),%xmm1                  \n"
   1774     "pavgb  %xmm1,%xmm0                        \n"
   1775     "pshufb %xmm2,%xmm0                        \n"
   1776     "pmaddubsw %xmm5,%xmm0                     \n"
   1777     "paddsw %xmm7,%xmm0                        \n"
   1778     "psrlw  $0x2,%xmm0                         \n"
   1779     "packuswb %xmm0,%xmm0                      \n"
   1780     "movq   %xmm0,(%edi)                       \n"
   1781     "movdqu 0x8(%esi),%xmm0                    \n"
   1782     "movdqu 0x8(%esi,%ebp),%xmm1               \n"
   1783     "pavgb  %xmm1,%xmm0                        \n"
   1784     "pshufb %xmm3,%xmm0                        \n"
   1785     "pmaddubsw %xmm6,%xmm0                     \n"
   1786     "paddsw %xmm7,%xmm0                        \n"
   1787     "psrlw  $0x2,%xmm0                         \n"
   1788     "packuswb %xmm0,%xmm0                      \n"
   1789     "movq   %xmm0,0x8(%edi)                    \n"
   1790     "movdqa 0x10(%esi),%xmm0                   \n"
   1791     "movdqa 0x10(%esi,%ebp),%xmm1              \n"
   1792     "lea    0x20(%esi),%esi                    \n"
   1793     "pavgb  %xmm1,%xmm0                        \n"
   1794     "pshufb %xmm4,%xmm0                        \n"
   1795     "movdqa  _madd21,%xmm1                     \n"
   1796     "pmaddubsw %xmm1,%xmm0                     \n"
   1797     "paddsw %xmm7,%xmm0                        \n"
   1798     "psrlw  $0x2,%xmm0                         \n"
   1799     "packuswb %xmm0,%xmm0                      \n"
   1800     "movq   %xmm0,0x10(%edi)                   \n"
   1801     "lea    0x18(%edi),%edi                    \n"
   1802     "sub    $0x18,%ecx                         \n"
   1803     "ja     1b                                 \n"
   1804 
   1805     "popa                                      \n"
   1806     "ret                                       \n"
   1807 );
   1808 
   1809 void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1810                                            uint8* dst_ptr, int dst_width);
   1811   asm(
   1812     DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
   1813     "pusha                                     \n"
   1814     "mov    0x24(%esp),%esi                    \n"
   1815     "mov    0x28(%esp),%ebp                    \n"
   1816     "mov    0x2c(%esp),%edi                    \n"
   1817     "mov    0x30(%esp),%ecx                    \n"
   1818     "movdqa _shuf01,%xmm2                      \n"
   1819     "movdqa _shuf11,%xmm3                      \n"
   1820     "movdqa _shuf21,%xmm4                      \n"
   1821     "movdqa _madd01,%xmm5                      \n"
   1822     "movdqa _madd11,%xmm6                      \n"
   1823     "movdqa _round34,%xmm7                     \n"
   1824 
   1825 "1:"
   1826     "movdqa (%esi),%xmm0                       \n"
   1827     "movdqa (%esi,%ebp,1),%xmm1                \n"
   1828     "pavgb  %xmm0,%xmm1                        \n"
   1829     "pavgb  %xmm1,%xmm0                        \n"
   1830     "pshufb %xmm2,%xmm0                        \n"
   1831     "pmaddubsw %xmm5,%xmm0                     \n"
   1832     "paddsw %xmm7,%xmm0                        \n"
   1833     "psrlw  $0x2,%xmm0                         \n"
   1834     "packuswb %xmm0,%xmm0                      \n"
   1835     "movq   %xmm0,(%edi)                       \n"
   1836     "movdqu 0x8(%esi),%xmm0                    \n"
   1837     "movdqu 0x8(%esi,%ebp,1),%xmm1             \n"
   1838     "pavgb  %xmm0,%xmm1                        \n"
   1839     "pavgb  %xmm1,%xmm0                        \n"
   1840     "pshufb %xmm3,%xmm0                        \n"
   1841     "pmaddubsw %xmm6,%xmm0                     \n"
   1842     "paddsw %xmm7,%xmm0                        \n"
   1843     "psrlw  $0x2,%xmm0                         \n"
   1844     "packuswb %xmm0,%xmm0                      \n"
   1845     "movq   %xmm0,0x8(%edi)                    \n"
   1846     "movdqa 0x10(%esi),%xmm0                   \n"
   1847     "movdqa 0x10(%esi,%ebp,1),%xmm1            \n"
   1848     "lea    0x20(%esi),%esi                    \n"
   1849     "pavgb  %xmm0,%xmm1                        \n"
   1850     "pavgb  %xmm1,%xmm0                        \n"
   1851     "pshufb %xmm4,%xmm0                        \n"
   1852     "movdqa  _madd21,%xmm1                     \n"
   1853     "pmaddubsw %xmm1,%xmm0                     \n"
   1854     "paddsw %xmm7,%xmm0                        \n"
   1855     "psrlw  $0x2,%xmm0                         \n"
   1856     "packuswb %xmm0,%xmm0                      \n"
   1857     "movq   %xmm0,0x10(%edi)                   \n"
   1858     "lea    0x18(%edi),%edi                    \n"
   1859     "sub    $0x18,%ecx                         \n"
   1860     "ja     1b                                 \n"
   1861     "popa                                      \n"
   1862     "ret                                       \n"
   1863 );
   1864 
   1865 #define HAS_SCALEROWDOWN38_SSSE3
   1866 void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
   1867                                      uint8* dst_ptr, int dst_width);
   1868   asm(
   1869     DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
   1870     "pusha                                     \n"
   1871     "mov    0x24(%esp),%esi                    \n"
   1872     "mov    0x28(%esp),%edx                    \n"
   1873     "mov    0x2c(%esp),%edi                    \n"
   1874     "mov    0x30(%esp),%ecx                    \n"
   1875     "movdqa _shuf38a ,%xmm4                    \n"
   1876     "movdqa _shuf38b ,%xmm5                    \n"
   1877 
   1878 "1:"
   1879     "movdqa (%esi),%xmm0                       \n"
   1880     "movdqa 0x10(%esi),%xmm1                   \n"
   1881     "lea    0x20(%esi),%esi                    \n"
   1882     "pshufb %xmm4,%xmm0                        \n"
   1883     "pshufb %xmm5,%xmm1                        \n"
   1884     "paddusb %xmm1,%xmm0                       \n"
   1885     "movq   %xmm0,(%edi)                       \n"
   1886     "movhlps %xmm0,%xmm1                       \n"
   1887     "movd   %xmm1,0x8(%edi)                    \n"
   1888     "lea    0xc(%edi),%edi                     \n"
   1889     "sub    $0xc,%ecx                          \n"
   1890     "ja     1b                                 \n"
   1891     "popa                                      \n"
   1892     "ret                                       \n"
   1893 );
   1894 
   1895 void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1896                                            uint8* dst_ptr, int dst_width);
   1897   asm(
   1898     DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
   1899     "pusha                                     \n"
   1900     "mov    0x24(%esp),%esi                    \n"
   1901     "mov    0x28(%esp),%edx                    \n"
   1902     "mov    0x2c(%esp),%edi                    \n"
   1903     "mov    0x30(%esp),%ecx                    \n"
   1904     "movdqa _shufac0,%xmm4                     \n"
   1905     "movdqa _shufac3,%xmm5                     \n"
   1906     "movdqa _scaleac3,%xmm6                    \n"
   1907     "pxor   %xmm7,%xmm7                        \n"
   1908 
   1909 "1:"
   1910     "movdqa (%esi),%xmm0                       \n"
   1911     "movdqa (%esi,%edx,1),%xmm2                \n"
   1912     "movhlps %xmm0,%xmm1                       \n"
   1913     "movhlps %xmm2,%xmm3                       \n"
   1914     "punpcklbw %xmm7,%xmm0                     \n"
   1915     "punpcklbw %xmm7,%xmm1                     \n"
   1916     "punpcklbw %xmm7,%xmm2                     \n"
   1917     "punpcklbw %xmm7,%xmm3                     \n"
   1918     "paddusw %xmm2,%xmm0                       \n"
   1919     "paddusw %xmm3,%xmm1                       \n"
   1920     "movdqa (%esi,%edx,2),%xmm2                \n"
   1921     "lea    0x10(%esi),%esi                    \n"
   1922     "movhlps %xmm2,%xmm3                       \n"
   1923     "punpcklbw %xmm7,%xmm2                     \n"
   1924     "punpcklbw %xmm7,%xmm3                     \n"
   1925     "paddusw %xmm2,%xmm0                       \n"
   1926     "paddusw %xmm3,%xmm1                       \n"
   1927     "movdqa %xmm0,%xmm2                        \n"
   1928     "psrldq $0x2,%xmm0                         \n"
   1929     "paddusw %xmm0,%xmm2                       \n"
   1930     "psrldq $0x2,%xmm0                         \n"
   1931     "paddusw %xmm0,%xmm2                       \n"
   1932     "pshufb %xmm4,%xmm2                        \n"
   1933     "movdqa %xmm1,%xmm3                        \n"
   1934     "psrldq $0x2,%xmm1                         \n"
   1935     "paddusw %xmm1,%xmm3                       \n"
   1936     "psrldq $0x2,%xmm1                         \n"
   1937     "paddusw %xmm1,%xmm3                       \n"
   1938     "pshufb %xmm5,%xmm3                        \n"
   1939     "paddusw %xmm3,%xmm2                       \n"
   1940     "pmulhuw %xmm6,%xmm2                       \n"
   1941     "packuswb %xmm2,%xmm2                      \n"
   1942     "movd   %xmm2,(%edi)                       \n"
   1943     "pextrw $0x2,%xmm2,%eax                    \n"
   1944     "mov    %ax,0x4(%edi)                      \n"
   1945     "lea    0x6(%edi),%edi                     \n"
   1946     "sub    $0x6,%ecx                          \n"
   1947     "ja     1b                                 \n"
   1948     "popa                                      \n"
   1949     "ret                                       \n"
   1950 );
   1951 
   1952 void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1953                                            uint8* dst_ptr, int dst_width);
   1954   asm(
   1955     DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
   1956     "pusha                                     \n"
   1957     "mov    0x24(%esp),%esi                    \n"
   1958     "mov    0x28(%esp),%edx                    \n"
   1959     "mov    0x2c(%esp),%edi                    \n"
   1960     "mov    0x30(%esp),%ecx                    \n"
   1961     "movdqa _shufab0,%xmm4                     \n"
   1962     "movdqa _shufab1,%xmm5                     \n"
   1963     "movdqa _shufab2,%xmm6                     \n"
   1964     "movdqa _scaleab2,%xmm7                    \n"
   1965 
   1966 "1:"
   1967     "movdqa (%esi),%xmm2                       \n"
   1968     "pavgb  (%esi,%edx,1),%xmm2                \n"
   1969     "lea    0x10(%esi),%esi                    \n"
   1970     "movdqa %xmm2,%xmm0                        \n"
   1971     "pshufb %xmm4,%xmm0                        \n"
   1972     "movdqa %xmm2,%xmm1                        \n"
   1973     "pshufb %xmm5,%xmm1                        \n"
   1974     "paddusw %xmm1,%xmm0                       \n"
   1975     "pshufb %xmm6,%xmm2                        \n"
   1976     "paddusw %xmm2,%xmm0                       \n"
   1977     "pmulhuw %xmm7,%xmm0                       \n"
   1978     "packuswb %xmm0,%xmm0                      \n"
   1979     "movd   %xmm0,(%edi)                       \n"
   1980     "pextrw $0x2,%xmm0,%eax                    \n"
   1981     "mov    %ax,0x4(%edi)                      \n"
   1982     "lea    0x6(%edi),%edi                     \n"
   1983     "sub    $0x6,%ecx                          \n"
   1984     "ja     1b                                 \n"
   1985     "popa                                      \n"
   1986     "ret                                       \n"
   1987 );
   1988 #endif // __PIC__
   1989 
   1990 #define HAS_SCALEADDROWS_SSE2
   1991 void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   1992                                   uint16* dst_ptr, int src_width,
   1993                                   int src_height);
   1994   asm(
   1995     DECLARE_FUNCTION(ScaleAddRows_SSE2)
   1996     "pusha                                     \n"
   1997     "mov    0x24(%esp),%esi                    \n"
   1998     "mov    0x28(%esp),%edx                    \n"
   1999     "mov    0x2c(%esp),%edi                    \n"
   2000     "mov    0x30(%esp),%ecx                    \n"
   2001     "mov    0x34(%esp),%ebx                    \n"
   2002     "pxor   %xmm5,%xmm5                        \n"
   2003 
   2004 "1:"
   2005     "movdqa (%esi),%xmm2                       \n"
   2006     "lea    (%esi,%edx,1),%eax                 \n"
   2007     "movhlps %xmm2,%xmm3                       \n"
   2008     "lea    -0x1(%ebx),%ebp                    \n"
   2009     "punpcklbw %xmm5,%xmm2                     \n"
   2010     "punpcklbw %xmm5,%xmm3                     \n"
   2011 
   2012 "2:"
   2013     "movdqa (%eax),%xmm0                       \n"
   2014     "lea    (%eax,%edx,1),%eax                 \n"
   2015     "movhlps %xmm0,%xmm1                       \n"
   2016     "punpcklbw %xmm5,%xmm0                     \n"
   2017     "punpcklbw %xmm5,%xmm1                     \n"
   2018     "paddusw %xmm0,%xmm2                       \n"
   2019     "paddusw %xmm1,%xmm3                       \n"
   2020     "sub    $0x1,%ebp                          \n"
   2021     "ja     2b                                 \n"
   2022 
   2023     "movdqa %xmm2,(%edi)                       \n"
   2024     "movdqa %xmm3,0x10(%edi)                   \n"
   2025     "lea    0x20(%edi),%edi                    \n"
   2026     "lea    0x10(%esi),%esi                    \n"
   2027     "sub    $0x10,%ecx                         \n"
   2028     "ja     1b                                 \n"
   2029     "popa                                      \n"
   2030     "ret                                       \n"
   2031 );
   2032 
   2033 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
   2034 #define HAS_SCALEFILTERROWS_SSE2
   2035 void ScaleFilterRows_SSE2(uint8* dst_ptr,
   2036                                      const uint8* src_ptr, int src_stride,
   2037                                      int dst_width, int source_y_fraction);
   2038   asm(
   2039     DECLARE_FUNCTION(ScaleFilterRows_SSE2)
   2040     "push   %esi                               \n"
   2041     "push   %edi                               \n"
   2042     "mov    0xc(%esp),%edi                     \n"
   2043     "mov    0x10(%esp),%esi                    \n"
   2044     "mov    0x14(%esp),%edx                    \n"
   2045     "mov    0x18(%esp),%ecx                    \n"
   2046     "mov    0x1c(%esp),%eax                    \n"
   2047     "cmp    $0x0,%eax                          \n"
   2048     "je     2f                                 \n"
   2049     "cmp    $0x80,%eax                         \n"
   2050     "je     3f                                 \n"
   2051     "movd   %eax,%xmm6                         \n"
   2052     "punpcklwd %xmm6,%xmm6                     \n"
   2053     "pshufd $0x0,%xmm6,%xmm6                   \n"
   2054     "neg    %eax                               \n"
   2055     "add    $0x100,%eax                        \n"
   2056     "movd   %eax,%xmm5                         \n"
   2057     "punpcklwd %xmm5,%xmm5                     \n"
   2058     "pshufd $0x0,%xmm5,%xmm5                   \n"
   2059     "pxor   %xmm7,%xmm7                        \n"
   2060 
   2061 "1:"
   2062     "movdqa (%esi),%xmm0                       \n"
   2063     "movdqa (%esi,%edx,1),%xmm2                \n"
   2064     "lea    0x10(%esi),%esi                    \n"
   2065     "movdqa %xmm0,%xmm1                        \n"
   2066     "movdqa %xmm2,%xmm3                        \n"
   2067     "punpcklbw %xmm7,%xmm0                     \n"
   2068     "punpcklbw %xmm7,%xmm2                     \n"
   2069     "punpckhbw %xmm7,%xmm1                     \n"
   2070     "punpckhbw %xmm7,%xmm3                     \n"
   2071     "pmullw %xmm5,%xmm0                        \n"
   2072     "pmullw %xmm5,%xmm1                        \n"
   2073     "pmullw %xmm6,%xmm2                        \n"
   2074     "pmullw %xmm6,%xmm3                        \n"
   2075     "paddusw %xmm2,%xmm0                       \n"
   2076     "paddusw %xmm3,%xmm1                       \n"
   2077     "psrlw  $0x8,%xmm0                         \n"
   2078     "psrlw  $0x8,%xmm1                         \n"
   2079     "packuswb %xmm1,%xmm0                      \n"
   2080     "movdqa %xmm0,(%edi)                       \n"
   2081     "lea    0x10(%edi),%edi                    \n"
   2082     "sub    $0x10,%ecx                         \n"
   2083     "ja     1b                                 \n"
   2084     "mov    -0x1(%edi),%al                     \n"
   2085     "mov    %al,(%edi)                         \n"
   2086     "pop    %edi                               \n"
   2087     "pop    %esi                               \n"
   2088     "ret                                       \n"
   2089 
   2090 "2:"
   2091     "movdqa (%esi),%xmm0                       \n"
   2092     "lea    0x10(%esi),%esi                    \n"
   2093     "movdqa %xmm0,(%edi)                       \n"
   2094     "lea    0x10(%edi),%edi                    \n"
   2095     "sub    $0x10,%ecx                         \n"
   2096     "ja     2b                                 \n"
   2097 
   2098     "mov    -0x1(%edi),%al                     \n"
   2099     "mov    %al,(%edi)                         \n"
   2100     "pop    %edi                               \n"
   2101     "pop    %esi                               \n"
   2102     "ret                                       \n"
   2103 
   2104 "3:"
   2105     "movdqa (%esi),%xmm0                       \n"
   2106     "movdqa (%esi,%edx,1),%xmm2                \n"
   2107     "lea    0x10(%esi),%esi                    \n"
   2108     "pavgb  %xmm2,%xmm0                        \n"
   2109     "movdqa %xmm0,(%edi)                       \n"
   2110     "lea    0x10(%edi),%edi                    \n"
   2111     "sub    $0x10,%ecx                         \n"
   2112     "ja     3b                                 \n"
   2113 
   2114     "mov    -0x1(%edi),%al                     \n"
   2115     "mov    %al,(%edi)                         \n"
   2116     "pop    %edi                               \n"
   2117     "pop    %esi                               \n"
   2118     "ret                                       \n"
   2119 );
   2120 
   2121 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
   2122 #define HAS_SCALEFILTERROWS_SSSE3
   2123 void ScaleFilterRows_SSSE3(uint8* dst_ptr,
   2124                                       const uint8* src_ptr, int src_stride,
   2125                                       int dst_width, int source_y_fraction);
   2126   asm(
   2127     DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
   2128     "push   %esi                               \n"
   2129     "push   %edi                               \n"
   2130     "mov    0xc(%esp),%edi                     \n"
   2131     "mov    0x10(%esp),%esi                    \n"
   2132     "mov    0x14(%esp),%edx                    \n"
   2133     "mov    0x18(%esp),%ecx                    \n"
   2134     "mov    0x1c(%esp),%eax                    \n"
   2135     "shr    %eax                               \n"
   2136     "cmp    $0x0,%eax                          \n"
   2137     "je     2f                                 \n"
   2138     "cmp    $0x40,%eax                         \n"
   2139     "je     3f                                 \n"
   2140     "mov    %al,%ah                            \n"
   2141     "neg    %al                                \n"
   2142     "add    $0x80,%al                          \n"
   2143     "movd   %eax,%xmm5                         \n"
   2144     "punpcklwd %xmm5,%xmm5                     \n"
   2145     "pshufd $0x0,%xmm5,%xmm5                   \n"
   2146 
   2147 "1:"
   2148     "movdqa (%esi),%xmm0                       \n"
   2149     "movdqa (%esi,%edx,1),%xmm2                \n"
   2150     "lea    0x10(%esi),%esi                    \n"
   2151     "movdqa %xmm0,%xmm1                        \n"
   2152     "punpcklbw %xmm2,%xmm0                     \n"
   2153     "punpckhbw %xmm2,%xmm1                     \n"
   2154     "pmaddubsw %xmm5,%xmm0                     \n"
   2155     "pmaddubsw %xmm5,%xmm1                     \n"
   2156     "psrlw  $0x7,%xmm0                         \n"
   2157     "psrlw  $0x7,%xmm1                         \n"
   2158     "packuswb %xmm1,%xmm0                      \n"
   2159     "movdqa %xmm0,(%edi)                       \n"
   2160     "lea    0x10(%edi),%edi                    \n"
   2161     "sub    $0x10,%ecx                         \n"
   2162     "ja     1b                                 \n"
   2163     "mov    -0x1(%edi),%al                     \n"
   2164     "mov    %al,(%edi)                         \n"
   2165     "pop    %edi                               \n"
   2166     "pop    %esi                               \n"
   2167     "ret                                       \n"
   2168 
   2169 "2:"
   2170     "movdqa (%esi),%xmm0                       \n"
   2171     "lea    0x10(%esi),%esi                    \n"
   2172     "movdqa %xmm0,(%edi)                       \n"
   2173     "lea    0x10(%edi),%edi                    \n"
   2174     "sub    $0x10,%ecx                         \n"
   2175     "ja     2b                                 \n"
   2176     "mov    -0x1(%edi),%al                     \n"
   2177     "mov    %al,(%edi)                         \n"
   2178     "pop    %edi                               \n"
   2179     "pop    %esi                               \n"
   2180     "ret                                       \n"
   2181 
   2182 "3:"
   2183     "movdqa (%esi),%xmm0                       \n"
   2184     "movdqa (%esi,%edx,1),%xmm2                \n"
   2185     "lea    0x10(%esi),%esi                    \n"
   2186     "pavgb  %xmm2,%xmm0                        \n"
   2187     "movdqa %xmm0,(%edi)                       \n"
   2188     "lea    0x10(%edi),%edi                    \n"
   2189     "sub    $0x10,%ecx                         \n"
   2190     "ja     3b                                 \n"
   2191     "mov    -0x1(%edi),%al                     \n"
   2192     "mov    %al,(%edi)                         \n"
   2193     "pop    %edi                               \n"
   2194     "pop    %esi                               \n"
   2195     "ret                                       \n"
   2196 );
   2197 
   2198 #elif defined(__x86_64__)
   2199 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
   2200                                   uint8* dst_ptr, int dst_width) {
   2201   asm volatile (
   2202   "lea        (%3,%3,2),%%r10                  \n"
   2203   "pxor       %%xmm7,%%xmm7                    \n"
   2204 "1:"
   2205   "movdqa     (%0),%%xmm0                      \n"
   2206   "movdqa     0x10(%0),%%xmm1                  \n"
   2207   "movdqa     (%0,%3,1),%%xmm2                 \n"
   2208   "movdqa     0x10(%0,%3,1),%%xmm3             \n"
   2209   "pavgb      %%xmm2,%%xmm0                    \n"
   2210   "pavgb      %%xmm3,%%xmm1                    \n"
   2211   "movdqa     (%0,%3,2),%%xmm2                 \n"
   2212   "movdqa     0x10(%0,%3,2),%%xmm3             \n"
   2213   "movdqa     (%0,%%r10,1),%%xmm4              \n"
   2214   "movdqa     0x10(%0,%%r10,1),%%xmm5          \n"
   2215   "lea        (%0,%3,4),%%r11                  \n"
   2216   "lea        0x20(%0),%0                      \n"
   2217   "pavgb      %%xmm4,%%xmm2                    \n"
   2218   "pavgb      %%xmm5,%%xmm3                    \n"
   2219   "pavgb      %%xmm2,%%xmm0                    \n"
   2220   "pavgb      %%xmm3,%%xmm1                    \n"
   2221   "movdqa     0x0(%%r11),%%xmm2                \n"
   2222   "movdqa     0x10(%%r11),%%xmm3               \n"
   2223   "movdqa     0x0(%%r11,%3,1),%%xmm4           \n"
   2224   "movdqa     0x10(%%r11,%3,1),%%xmm5          \n"
   2225   "pavgb      %%xmm4,%%xmm2                    \n"
   2226   "pavgb      %%xmm5,%%xmm3                    \n"
   2227   "movdqa     0x0(%%r11,%3,2),%%xmm4           \n"
   2228   "movdqa     0x10(%%r11,%3,2),%%xmm5          \n"
   2229   "movdqa     0x0(%%r11,%%r10,1),%%xmm6        \n"
   2230   "pavgb      %%xmm6,%%xmm4                    \n"
   2231   "movdqa     0x10(%%r11,%%r10,1),%%xmm6       \n"
   2232   "pavgb      %%xmm6,%%xmm5                    \n"
   2233   "pavgb      %%xmm4,%%xmm2                    \n"
   2234   "pavgb      %%xmm5,%%xmm3                    \n"
   2235   "pavgb      %%xmm2,%%xmm0                    \n"
   2236   "pavgb      %%xmm3,%%xmm1                    \n"
   2237   "psadbw     %%xmm7,%%xmm0                    \n"
   2238   "psadbw     %%xmm7,%%xmm1                    \n"
   2239   "pshufd     $0xd8,%%xmm0,%%xmm0              \n"
   2240   "pshufd     $0x8d,%%xmm1,%%xmm1              \n"
   2241   "por        %%xmm1,%%xmm0                    \n"
   2242   "psrlw      $0x3,%%xmm0                      \n"
   2243   "packuswb   %%xmm0,%%xmm0                    \n"
   2244   "packuswb   %%xmm0,%%xmm0                    \n"
   2245   "movd       %%xmm0,(%1)                      \n"
   2246   "lea        0x4(%1),%1                       \n"
   2247   "sub        $0x4,%2                          \n"
   2248   "ja         1b                               \n"
   2249   : "+r"(src_ptr),     // %0
   2250     "+r"(dst_ptr),     // %1
   2251     "+r"(dst_width)    // %2
   2252   : "r"((intptr_t)(src_stride))   // %3
   2253   : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
   2254 );
   2255 }
   2256 
   2257 #define HAS_SCALEROWDOWN34_SSSE3
   2258 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
   2259                                  uint8* dst_ptr, int dst_width) {
   2260   asm volatile (
   2261   "movdqa     (%3),%%xmm3                      \n"
   2262   "movdqa     (%4),%%xmm4                      \n"
   2263   "movdqa     (%5),%%xmm5                      \n"
   2264 "1:"
   2265   "movdqa     (%0),%%xmm0                      \n"
   2266   "movdqa     0x10(%0),%%xmm2                  \n"
   2267   "lea        0x20(%0),%0                      \n"
   2268   "movdqa     %%xmm2,%%xmm1                    \n"
   2269   "palignr    $0x8,%%xmm0,%%xmm1               \n"
   2270   "pshufb     %%xmm3,%%xmm0                    \n"
   2271   "pshufb     %%xmm4,%%xmm1                    \n"
   2272   "pshufb     %%xmm5,%%xmm2                    \n"
   2273   "movq       %%xmm0,(%1)                      \n"
   2274   "movq       %%xmm1,0x8(%1)                   \n"
   2275   "movq       %%xmm2,0x10(%1)                  \n"
   2276   "lea        0x18(%1),%1                      \n"
   2277   "sub        $0x18,%2                         \n"
   2278   "ja         1b                               \n"
   2279   : "+r"(src_ptr),     // %0
   2280     "+r"(dst_ptr),     // %1
   2281     "+r"(dst_width)    // %2
   2282   : "r"(_shuf0),   // %3
   2283     "r"(_shuf1),   // %4
   2284     "r"(_shuf2)    // %5
   2285   : "memory", "cc"
   2286 );
   2287 }
   2288 
   2289 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
   2290                                        uint8* dst_ptr, int dst_width) {
   2291   asm volatile (
   2292   "movdqa     (%4),%%xmm2                      \n"  // _shuf01
   2293   "movdqa     (%5),%%xmm3                      \n"  // _shuf11
   2294   "movdqa     (%6),%%xmm4                      \n"  // _shuf21
   2295   "movdqa     (%7),%%xmm5                      \n"  // _madd01
   2296   "movdqa     (%8),%%xmm6                      \n"  // _madd11
   2297   "movdqa     (%9),%%xmm7                      \n"  // _round34
   2298   "movdqa     (%10),%%xmm8                     \n"  // _madd21
   2299 "1:"
   2300   "movdqa     (%0),%%xmm0                      \n"
   2301   "movdqa     (%0,%3),%%xmm1                   \n"
   2302   "pavgb      %%xmm1,%%xmm0                    \n"
   2303   "pshufb     %%xmm2,%%xmm0                    \n"
   2304   "pmaddubsw  %%xmm5,%%xmm0                    \n"
   2305   "paddsw     %%xmm7,%%xmm0                    \n"
   2306   "psrlw      $0x2,%%xmm0                      \n"
   2307   "packuswb   %%xmm0,%%xmm0                    \n"
   2308   "movq       %%xmm0,(%1)                      \n"
   2309   "movdqu     0x8(%0),%%xmm0                   \n"
   2310   "movdqu     0x8(%0,%3),%%xmm1                \n"
   2311   "pavgb      %%xmm1,%%xmm0                    \n"
   2312   "pshufb     %%xmm3,%%xmm0                    \n"
   2313   "pmaddubsw  %%xmm6,%%xmm0                    \n"
   2314   "paddsw     %%xmm7,%%xmm0                    \n"
   2315   "psrlw      $0x2,%%xmm0                      \n"
   2316   "packuswb   %%xmm0,%%xmm0                    \n"
   2317   "movq       %%xmm0,0x8(%1)                   \n"
   2318   "movdqa     0x10(%0),%%xmm0                  \n"
   2319   "movdqa     0x10(%0,%3),%%xmm1               \n"
   2320   "lea        0x20(%0),%0                      \n"
   2321   "pavgb      %%xmm1,%%xmm0                    \n"
   2322   "pshufb     %%xmm4,%%xmm0                    \n"
   2323   "pmaddubsw  %%xmm8,%%xmm0                    \n"
   2324   "paddsw     %%xmm7,%%xmm0                    \n"
   2325   "psrlw      $0x2,%%xmm0                      \n"
   2326   "packuswb   %%xmm0,%%xmm0                    \n"
   2327   "movq       %%xmm0,0x10(%1)                  \n"
   2328   "lea        0x18(%1),%1                      \n"
   2329   "sub        $0x18,%2                         \n"
   2330   "ja         1b                               \n"
   2331   : "+r"(src_ptr),     // %0
   2332     "+r"(dst_ptr),     // %1
   2333     "+r"(dst_width)    // %2
   2334   : "r"((intptr_t)(src_stride)),  // %3
   2335     "r"(_shuf01),   // %4
   2336     "r"(_shuf11),   // %5
   2337     "r"(_shuf21),   // %6
   2338     "r"(_madd01),   // %7
   2339     "r"(_madd11),   // %8
   2340     "r"(_round34),  // %9
   2341     "r"(_madd21)    // %10
   2342   : "memory", "cc", "xmm6", "xmm7", "xmm8"
   2343 );
   2344 }
   2345 
   2346 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
   2347                                        uint8* dst_ptr, int dst_width) {
   2348   asm volatile (
   2349   "movdqa     (%4),%%xmm2                      \n"  // _shuf01
   2350   "movdqa     (%5),%%xmm3                      \n"  // _shuf11
   2351   "movdqa     (%6),%%xmm4                      \n"  // _shuf21
   2352   "movdqa     (%7),%%xmm5                      \n"  // _madd01
   2353   "movdqa     (%8),%%xmm6                      \n"  // _madd11
   2354   "movdqa     (%9),%%xmm7                      \n"  // _round34
   2355   "movdqa     (%10),%%xmm8                     \n"  // _madd21
   2356 "1:"
   2357   "movdqa     (%0),%%xmm0                      \n"
   2358   "movdqa     (%0,%3,1),%%xmm1                 \n"
   2359   "pavgb      %%xmm0,%%xmm1                    \n"
   2360   "pavgb      %%xmm1,%%xmm0                    \n"
   2361   "pshufb     %%xmm2,%%xmm0                    \n"
   2362   "pmaddubsw  %%xmm5,%%xmm0                    \n"
   2363   "paddsw     %%xmm7,%%xmm0                    \n"
   2364   "psrlw      $0x2,%%xmm0                      \n"
   2365   "packuswb   %%xmm0,%%xmm0                    \n"
   2366   "movq       %%xmm0,(%1)                      \n"
   2367   "movdqu     0x8(%0),%%xmm0                   \n"
   2368   "movdqu     0x8(%0,%3,1),%%xmm1              \n"
   2369   "pavgb      %%xmm0,%%xmm1                    \n"
   2370   "pavgb      %%xmm1,%%xmm0                    \n"
   2371   "pshufb     %%xmm3,%%xmm0                    \n"
   2372   "pmaddubsw  %%xmm6,%%xmm0                    \n"
   2373   "paddsw     %%xmm7,%%xmm0                    \n"
   2374   "psrlw      $0x2,%%xmm0                      \n"
   2375   "packuswb   %%xmm0,%%xmm0                    \n"
   2376   "movq       %%xmm0,0x8(%1)                   \n"
   2377   "movdqa     0x10(%0),%%xmm0                  \n"
   2378   "movdqa     0x10(%0,%3,1),%%xmm1             \n"
   2379   "lea        0x20(%0),%0                      \n"
   2380   "pavgb      %%xmm0,%%xmm1                    \n"
   2381   "pavgb      %%xmm1,%%xmm0                    \n"
   2382   "pshufb     %%xmm4,%%xmm0                    \n"
   2383   "pmaddubsw  %%xmm8,%%xmm0                    \n"
   2384   "paddsw     %%xmm7,%%xmm0                    \n"
   2385   "psrlw      $0x2,%%xmm0                      \n"
   2386   "packuswb   %%xmm0,%%xmm0                    \n"
   2387   "movq       %%xmm0,0x10(%1)                  \n"
   2388   "lea        0x18(%1),%1                      \n"
   2389   "sub        $0x18,%2                         \n"
   2390   "ja         1b                               \n"
   2391   : "+r"(src_ptr),     // %0
   2392     "+r"(dst_ptr),     // %1
   2393     "+r"(dst_width)    // %2
   2394   : "r"((intptr_t)(src_stride)),  // %3
   2395     "r"(_shuf01),   // %4
   2396     "r"(_shuf11),   // %5
   2397     "r"(_shuf21),   // %6
   2398     "r"(_madd01),   // %7
   2399     "r"(_madd11),   // %8
   2400     "r"(_round34),  // %9
   2401     "r"(_madd21)    // %10
   2402   : "memory", "cc", "xmm6", "xmm7", "xmm8"
   2403 );
   2404 }
   2405 
   2406 #define HAS_SCALEROWDOWN38_SSSE3
   2407 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
   2408                                  uint8* dst_ptr, int dst_width) {
   2409   asm volatile (
   2410   "movdqa     (%3),%%xmm4                      \n"
   2411   "movdqa     (%4),%%xmm5                      \n"
   2412 "1:"
   2413   "movdqa     (%0),%%xmm0                      \n"
   2414   "movdqa     0x10(%0),%%xmm1                  \n"
   2415   "lea        0x20(%0),%0                      \n"
   2416   "pshufb     %%xmm4,%%xmm0                    \n"
   2417   "pshufb     %%xmm5,%%xmm1                    \n"
   2418   "paddusb    %%xmm1,%%xmm0                    \n"
   2419   "movq       %%xmm0,(%1)                      \n"
   2420   "movhlps    %%xmm0,%%xmm1                    \n"
   2421   "movd       %%xmm1,0x8(%1)                   \n"
   2422   "lea        0xc(%1),%1                       \n"
   2423   "sub        $0xc,%2                          \n"
   2424   "ja         1b                               \n"
   2425   : "+r"(src_ptr),     // %0
   2426     "+r"(dst_ptr),     // %1
   2427     "+r"(dst_width)    // %2
   2428   : "r"(_shuf38a),  // %3
   2429     "r"(_shuf38b)   // %4
   2430   : "memory", "cc"
   2431 );
   2432 }
   2433 
   2434 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
   2435                                        uint8* dst_ptr, int dst_width) {
   2436   asm volatile (
   2437   "movdqa     (%4),%%xmm4                      \n"
   2438   "movdqa     (%5),%%xmm5                      \n"
   2439   "movdqa     (%6),%%xmm6                      \n"
   2440   "pxor       %%xmm7,%%xmm7                    \n"
   2441 "1:"
   2442   "movdqa     (%0),%%xmm0                      \n"
   2443   "movdqa     (%0,%3,1),%%xmm2                 \n"
   2444   "movhlps    %%xmm0,%%xmm1                    \n"
   2445   "movhlps    %%xmm2,%%xmm3                    \n"
   2446   "punpcklbw  %%xmm7,%%xmm0                    \n"
   2447   "punpcklbw  %%xmm7,%%xmm1                    \n"
   2448   "punpcklbw  %%xmm7,%%xmm2                    \n"
   2449   "punpcklbw  %%xmm7,%%xmm3                    \n"
   2450   "paddusw    %%xmm2,%%xmm0                    \n"
   2451   "paddusw    %%xmm3,%%xmm1                    \n"
   2452   "movdqa     (%0,%3,2),%%xmm2                 \n"
   2453   "lea        0x10(%0),%0                      \n"
   2454   "movhlps    %%xmm2,%%xmm3                    \n"
   2455   "punpcklbw  %%xmm7,%%xmm2                    \n"
   2456   "punpcklbw  %%xmm7,%%xmm3                    \n"
   2457   "paddusw    %%xmm2,%%xmm0                    \n"
   2458   "paddusw    %%xmm3,%%xmm1                    \n"
   2459   "movdqa     %%xmm0,%%xmm2                    \n"
   2460   "psrldq     $0x2,%%xmm0                      \n"
   2461   "paddusw    %%xmm0,%%xmm2                    \n"
   2462   "psrldq     $0x2,%%xmm0                      \n"
   2463   "paddusw    %%xmm0,%%xmm2                    \n"
   2464   "pshufb     %%xmm4,%%xmm2                    \n"
   2465   "movdqa     %%xmm1,%%xmm3                    \n"
   2466   "psrldq     $0x2,%%xmm1                      \n"
   2467   "paddusw    %%xmm1,%%xmm3                    \n"
   2468   "psrldq     $0x2,%%xmm1                      \n"
   2469   "paddusw    %%xmm1,%%xmm3                    \n"
   2470   "pshufb     %%xmm5,%%xmm3                    \n"
   2471   "paddusw    %%xmm3,%%xmm2                    \n"
   2472   "pmulhuw    %%xmm6,%%xmm2                    \n"
   2473   "packuswb   %%xmm2,%%xmm2                    \n"
   2474   "movd       %%xmm2,(%1)                      \n"
   2475   "pextrw     $0x2,%%xmm2,%%eax                \n"
   2476   "mov        %%ax,0x4(%1)                     \n"
   2477   "lea        0x6(%1),%1                       \n"
   2478   "sub        $0x6,%2                          \n"
   2479   "ja         1b                               \n"
   2480   : "+r"(src_ptr),     // %0
   2481     "+r"(dst_ptr),     // %1
   2482     "+r"(dst_width)    // %2
   2483   : "r"((intptr_t)(src_stride)),  // %3
   2484     "r"(_shufac0),   // %4
   2485     "r"(_shufac3),   // %5
   2486     "r"(_scaleac3)   // %6
   2487   : "memory", "cc", "rax", "xmm6", "xmm7"
   2488 );
   2489 }
   2490 
   2491 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   2492                                        uint8* dst_ptr, int dst_width) {
   2493   asm volatile (
   2494   "movdqa     (%4),%%xmm4                      \n"
   2495   "movdqa     (%5),%%xmm5                      \n"
   2496   "movdqa     (%6),%%xmm6                      \n"
   2497   "movdqa     (%7),%%xmm7                      \n"
   2498 "1:"
   2499   "movdqa     (%0),%%xmm2                      \n"
   2500   "pavgb      (%0,%3,1),%%xmm2                 \n"
   2501   "lea        0x10(%0),%0                      \n"
   2502   "movdqa     %%xmm2,%%xmm0                    \n"
   2503   "pshufb     %%xmm4,%%xmm0                    \n"
   2504   "movdqa     %%xmm2,%%xmm1                    \n"
   2505   "pshufb     %%xmm5,%%xmm1                    \n"
   2506   "paddusw    %%xmm1,%%xmm0                    \n"
   2507   "pshufb     %%xmm6,%%xmm2                    \n"
   2508   "paddusw    %%xmm2,%%xmm0                    \n"
   2509   "pmulhuw    %%xmm7,%%xmm0                    \n"
   2510   "packuswb   %%xmm0,%%xmm0                    \n"
   2511   "movd       %%xmm0,(%1)                      \n"
   2512   "pextrw     $0x2,%%xmm0,%%eax                \n"
   2513   "mov        %%ax,0x4(%1)                     \n"
   2514   "lea        0x6(%1),%1                       \n"
   2515   "sub        $0x6,%2                          \n"
   2516   "ja         1b                               \n"
   2517   : "+r"(src_ptr),     // %0
   2518     "+r"(dst_ptr),     // %1
   2519     "+r"(dst_width)    // %2
   2520   : "r"((intptr_t)(src_stride)),  // %3
   2521     "r"(_shufab0),   // %4
   2522     "r"(_shufab1),   // %5
   2523     "r"(_shufab2),   // %6
   2524     "r"(_scaleab2)   // %7
   2525   : "memory", "cc", "rax", "xmm6", "xmm7"
   2526 );
   2527 }
   2528 
   2529 #define HAS_SCALEADDROWS_SSE2
   2530 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   2531                               uint16* dst_ptr, int src_width,
   2532                               int src_height) {
   2533   asm volatile (
   2534   "pxor       %%xmm5,%%xmm5                    \n"
   2535 "1:"
   2536   "movdqa     (%0),%%xmm2                      \n"
   2537   "lea        (%0,%4,1),%%r10                  \n"
   2538   "movhlps    %%xmm2,%%xmm3                    \n"
   2539   "lea        -0x1(%3),%%r11                   \n"
   2540   "punpcklbw  %%xmm5,%%xmm2                    \n"
   2541   "punpcklbw  %%xmm5,%%xmm3                    \n"
   2542 
   2543 "2:"
   2544   "movdqa     (%%r10),%%xmm0                   \n"
   2545   "lea        (%%r10,%4,1),%%r10               \n"
   2546   "movhlps    %%xmm0,%%xmm1                    \n"
   2547   "punpcklbw  %%xmm5,%%xmm0                    \n"
   2548   "punpcklbw  %%xmm5,%%xmm1                    \n"
   2549   "paddusw    %%xmm0,%%xmm2                    \n"
   2550   "paddusw    %%xmm1,%%xmm3                    \n"
   2551   "sub        $0x1,%%r11                       \n"
   2552   "ja         2b                               \n"
   2553 
   2554   "movdqa     %%xmm2,(%1)                      \n"
   2555   "movdqa     %%xmm3,0x10(%1)                  \n"
   2556   "lea        0x20(%1),%1                      \n"
   2557   "lea        0x10(%0),%0                      \n"
   2558   "sub        $0x10,%2                         \n"
   2559   "ja         1b                               \n"
   2560   : "+r"(src_ptr),     // %0
   2561     "+r"(dst_ptr),     // %1
   2562     "+r"(src_width),   // %2
   2563     "+r"(src_height)   // %3
   2564   : "r"((intptr_t)(src_stride))  // %4
   2565   : "memory", "cc", "r10", "r11"
   2566 );
   2567 }
   2568 
   2569 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
   2570 #define HAS_SCALEFILTERROWS_SSE2
   2571 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
   2572                                  const uint8* src_ptr, int src_stride,
   2573                                  int dst_width, int source_y_fraction) {
   2574   if (source_y_fraction == 0) {
   2575     asm volatile (
   2576     "1:"
   2577       "movdqa     (%1),%%xmm0                  \n"
   2578       "lea        0x10(%1),%1                  \n"
   2579       "movdqa     %%xmm0,(%0)                  \n"
   2580       "lea        0x10(%0),%0                  \n"
   2581       "sub        $0x10,%2                     \n"
   2582       "ja         1b                           \n"
   2583       "mov        -0x1(%0),%%al                \n"
   2584       "mov        %%al,(%0)                    \n"
   2585       : "+r"(dst_ptr),     // %0
   2586         "+r"(src_ptr),     // %1
   2587         "+r"(dst_width)    // %2
   2588       :
   2589       : "memory", "cc", "rax"
   2590     );
   2591     return;
   2592   } else if (source_y_fraction == 128) {
   2593     asm volatile (
   2594     "1:"
   2595       "movdqa     (%1),%%xmm0                  \n"
   2596       "movdqa     (%1,%3,1),%%xmm2             \n"
   2597       "lea        0x10(%1),%1                  \n"
   2598       "pavgb      %%xmm2,%%xmm0                \n"
   2599       "movdqa     %%xmm0,(%0)                  \n"
   2600       "lea        0x10(%0),%0                  \n"
   2601       "sub        $0x10,%2                     \n"
   2602       "ja         1b                           \n"
   2603       "mov        -0x1(%0),%%al                \n"
   2604       "mov        %%al,(%0)                    \n"
   2605       : "+r"(dst_ptr),     // %0
   2606         "+r"(src_ptr),     // %1
   2607         "+r"(dst_width)    // %2
   2608       : "r"((intptr_t)(src_stride))  // %3
   2609       : "memory", "cc", "rax"
   2610     );
   2611     return;
   2612   } else {
   2613     asm volatile (
   2614       "mov        %3,%%eax                     \n"
   2615       "movd       %%eax,%%xmm6                 \n"
   2616       "punpcklwd  %%xmm6,%%xmm6                \n"
   2617       "pshufd     $0x0,%%xmm6,%%xmm6           \n"
   2618       "neg        %%eax                        \n"
   2619       "add        $0x100,%%eax                 \n"
   2620       "movd       %%eax,%%xmm5                 \n"
   2621       "punpcklwd  %%xmm5,%%xmm5                \n"
   2622       "pshufd     $0x0,%%xmm5,%%xmm5           \n"
   2623       "pxor       %%xmm7,%%xmm7                \n"
   2624     "1:"
   2625       "movdqa     (%1),%%xmm0                  \n"
   2626       "movdqa     (%1,%4,1),%%xmm2             \n"
   2627       "lea        0x10(%1),%1                  \n"
   2628       "movdqa     %%xmm0,%%xmm1                \n"
   2629       "movdqa     %%xmm2,%%xmm3                \n"
   2630       "punpcklbw  %%xmm7,%%xmm0                \n"
   2631       "punpcklbw  %%xmm7,%%xmm2                \n"
   2632       "punpckhbw  %%xmm7,%%xmm1                \n"
   2633       "punpckhbw  %%xmm7,%%xmm3                \n"
   2634       "pmullw     %%xmm5,%%xmm0                \n"
   2635       "pmullw     %%xmm5,%%xmm1                \n"
   2636       "pmullw     %%xmm6,%%xmm2                \n"
   2637       "pmullw     %%xmm6,%%xmm3                \n"
   2638       "paddusw    %%xmm2,%%xmm0                \n"
   2639       "paddusw    %%xmm3,%%xmm1                \n"
   2640       "psrlw      $0x8,%%xmm0                  \n"
   2641       "psrlw      $0x8,%%xmm1                  \n"
   2642       "packuswb   %%xmm1,%%xmm0                \n"
   2643       "movdqa     %%xmm0,(%0)                  \n"
   2644       "lea        0x10(%0),%0                  \n"
   2645       "sub        $0x10,%2                     \n"
   2646       "ja         1b                           \n"
   2647       "mov        -0x1(%0),%%al                \n"
   2648       "mov        %%al,(%0)                    \n"
   2649       : "+r"(dst_ptr),     // %0
   2650         "+r"(src_ptr),     // %1
   2651         "+r"(dst_width),   // %2
   2652         "+r"(source_y_fraction)  // %3
   2653       : "r"((intptr_t)(src_stride))  // %4
   2654       : "memory", "cc", "rax", "xmm6", "xmm7"
   2655     );
   2656   }
   2657   return;
   2658 }
   2659 
   2660 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
   2661 #define HAS_SCALEFILTERROWS_SSSE3
   2662 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
   2663                                   const uint8* src_ptr, int src_stride,
   2664                                   int dst_width, int source_y_fraction) {
   2665   source_y_fraction >>= 1;
   2666   if (source_y_fraction == 0) {
   2667     asm volatile (
   2668    "1:"
   2669       "movdqa     (%1),%%xmm0                  \n"
   2670       "lea        0x10(%1),%1                  \n"
   2671       "movdqa     %%xmm0,(%0)                  \n"
   2672       "lea        0x10(%0),%0                  \n"
   2673       "sub        $0x10,%2                     \n"
   2674       "ja         1b                           \n"
   2675       "mov        -0x1(%0),%%al                \n"
   2676       "mov        %%al,(%0)                    \n"
   2677       : "+r"(dst_ptr),     // %0
   2678         "+r"(src_ptr),     // %1
   2679         "+r"(dst_width)    // %2
   2680       :
   2681       : "memory", "cc", "rax"
   2682     );
   2683     return;
   2684   } else if (source_y_fraction == 64) {
   2685     asm volatile (
   2686     "1:"
   2687       "movdqa     (%1),%%xmm0                  \n"
   2688       "movdqa     (%1,%3,1),%%xmm2             \n"
   2689       "lea        0x10(%1),%1                  \n"
   2690       "pavgb      %%xmm2,%%xmm0                \n"
   2691       "movdqa     %%xmm0,(%0)                  \n"
   2692       "lea        0x10(%0),%0                  \n"
   2693       "sub        $0x10,%2                     \n"
   2694       "ja         1b                           \n"
   2695       "mov        -0x1(%0),%%al                \n"
   2696       "mov        %%al,(%0)                    \n"
   2697       : "+r"(dst_ptr),     // %0
   2698         "+r"(src_ptr),     // %1
   2699         "+r"(dst_width)    // %2
   2700       : "r"((intptr_t)(src_stride))  // %3
   2701      : "memory", "cc", "rax"
   2702     );
   2703     return;
   2704   } else {
   2705     asm volatile (
   2706       "mov        %3,%%eax                     \n"
   2707       "mov        %%al,%%ah                    \n"
   2708       "neg        %%al                         \n"
   2709       "add        $0x80,%%al                   \n"
   2710       "movd       %%eax,%%xmm5                 \n"
   2711       "punpcklwd  %%xmm5,%%xmm5                \n"
   2712       "pshufd     $0x0,%%xmm5,%%xmm5           \n"
   2713     "1:"
   2714       "movdqa     (%1),%%xmm0                  \n"
   2715       "movdqa     (%1,%4,1),%%xmm2             \n"
   2716       "lea        0x10(%1),%1                  \n"
   2717       "movdqa     %%xmm0,%%xmm1                \n"
   2718       "punpcklbw  %%xmm2,%%xmm0                \n"
   2719       "punpckhbw  %%xmm2,%%xmm1                \n"
   2720       "pmaddubsw  %%xmm5,%%xmm0                \n"
   2721       "pmaddubsw  %%xmm5,%%xmm1                \n"
   2722       "psrlw      $0x7,%%xmm0                  \n"
   2723       "psrlw      $0x7,%%xmm1                  \n"
   2724       "packuswb   %%xmm1,%%xmm0                \n"
   2725       "movdqa     %%xmm0,(%0)                  \n"
   2726       "lea        0x10(%0),%0                  \n"
   2727       "sub        $0x10,%2                     \n"
   2728       "ja         1b                           \n"
   2729       "mov        -0x1(%0),%%al                \n"
   2730       "mov        %%al,(%0)                    \n"
   2731       : "+r"(dst_ptr),     // %0
   2732         "+r"(src_ptr),     // %1
   2733         "+r"(dst_width),   // %2
   2734         "+r"(source_y_fraction)  // %3
   2735       : "r"((intptr_t)(src_stride))  // %4
   2736       : "memory", "cc", "rax"
   2737     );
   2738   }
   2739   return;
   2740 }
   2741 #endif
   2742 #endif
   2743 
   2744 // CPU agnostic row functions
   2745 static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride,
   2746                             uint8* dst, int dst_width) {
   2747   int x;
   2748   for (x = 0; x < dst_width; ++x) {
   2749     *dst++ = *src_ptr;
   2750     src_ptr += 2;
   2751   }
   2752 }
   2753 
   2754 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
   2755                                uint8* dst, int dst_width) {
   2756   int x;
   2757   for (x = 0; x < dst_width; ++x) {
   2758     *dst++ = (src_ptr[0] + src_ptr[1] +
   2759               src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
   2760     src_ptr += 2;
   2761   }
   2762 }
   2763 
   2764 static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride,
   2765                             uint8* dst, int dst_width) {
   2766   int x;
   2767   for (x = 0; x < dst_width; ++x) {
   2768     *dst++ = *src_ptr;
   2769     src_ptr += 4;
   2770   }
   2771 }
   2772 
   2773 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
   2774                                uint8* dst, int dst_width) {
   2775   int x;
   2776   for (x = 0; x < dst_width; ++x) {
   2777     *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
   2778               src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
   2779               src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
   2780               src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
   2781               src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
   2782               src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
   2783               src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
   2784               8) >> 4;
   2785     src_ptr += 4;
   2786   }
   2787 }
   2788 
   2789 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
   2790 // Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
   2791 // The following 2 lines cause error on Windows.
   2792 //static const int kMaxOutputWidth = 640;
   2793 //static const int kMaxRow12 = 1280;         //kMaxOutputWidth * 2;
   2794 #define kMaxOutputWidth   640
   2795 #define kMaxRow12         1280
   2796 
   2797 static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride,
   2798                             uint8* dst, int dst_width) {
   2799   int x;
   2800   for (x = 0; x < dst_width; ++x) {
   2801     *dst++ = *src_ptr;
   2802     src_ptr += 8;
   2803   }
   2804 }
   2805 
   2806 // Note calling code checks width is less than max and if not
   2807 // uses ScaleRowDown8_C instead.
   2808 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
   2809                                uint8* dst, int dst_width) {
   2810   ALIGN16(uint8 src_row[kMaxRow12 * 2]);
   2811   assert(dst_width <= kMaxOutputWidth);
   2812   ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
   2813   ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
   2814                      src_row + kMaxOutputWidth,
   2815                      dst_width * 2);
   2816   ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
   2817 }
   2818 
   2819 static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride,
   2820                              uint8* dst, int dst_width) {
   2821   uint8* dend;
   2822   assert((dst_width % 3 == 0) && (dst_width > 0));
   2823   dend = dst + dst_width;
   2824   do {
   2825     dst[0] = src_ptr[0];
   2826     dst[1] = src_ptr[1];
   2827     dst[2] = src_ptr[3];
   2828     dst += 3;
   2829     src_ptr += 4;
   2830   } while (dst < dend);
   2831 }
   2832 
   2833 // Filter rows 0 and 1 together, 3 : 1
   2834 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
   2835                                    uint8* d, int dst_width) {
   2836   uint8* dend;
   2837   const uint8* s;
   2838   const uint8* t;
   2839   assert((dst_width % 3 == 0) && (dst_width > 0));
   2840   dend = d + dst_width;
   2841   s = src_ptr;
   2842   t = src_ptr + src_stride;
   2843   do {
   2844     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2845     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2846     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2847     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
   2848     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
   2849     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
   2850     d[0] = (a0 * 3 + b0 + 2) >> 2;
   2851     d[1] = (a1 * 3 + b1 + 2) >> 2;
   2852     d[2] = (a2 * 3 + b2 + 2) >> 2;
   2853     d += 3;
   2854     s += 4;
   2855     t += 4;
   2856   } while (d < dend);
   2857 }
   2858 
   2859 // Filter rows 1 and 2 together, 1 : 1
   2860 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
   2861                                    uint8* d, int dst_width) {
   2862   uint8* dend;
   2863   const uint8* s;
   2864   const uint8* t;
   2865   assert((dst_width % 3 == 0) && (dst_width > 0));
   2866   dend = d + dst_width;
   2867   s = src_ptr;
   2868   t = src_ptr + src_stride;
   2869   do {
   2870     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2871     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2872     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2873     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
   2874     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
   2875     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
   2876     d[0] = (a0 + b0 + 1) >> 1;
   2877     d[1] = (a1 + b1 + 1) >> 1;
   2878     d[2] = (a2 + b2 + 1) >> 1;
   2879     d += 3;
   2880     s += 4;
   2881     t += 4;
   2882   } while (d < dend);
   2883 }
   2884 
   2885 #if defined(HAS_SCALEFILTERROWS_SSE2)
   2886 // Filter row to 3/4
   2887 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
   2888                                 int dst_width) {
   2889   uint8* dend;
   2890   const uint8* s;
   2891   assert((dst_width % 3 == 0) && (dst_width > 0));
   2892   dend = dst_ptr + dst_width;
   2893   s = src_ptr;
   2894   do {
   2895     dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2896     dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2897     dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2898     dst_ptr += 3;
   2899     s += 4;
   2900   } while (dst_ptr < dend);
   2901 }
   2902 #endif
   2903 
   2904 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
   2905                               int dst_width, int dx) {
   2906   int x = 0;
   2907   int j;
   2908   for (j = 0; j < dst_width; ++j) {
   2909     int xi = x >> 16;
   2910     int xf1 = x & 0xffff;
   2911     int xf0 = 65536 - xf1;
   2912 
   2913     *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
   2914     x += dx;
   2915   }
   2916 }
   2917 
   2918 //Not work on Windows
   2919 //static const int kMaxInputWidth = 2560;
   2920 #define kMaxInputWidth    2560
   2921 #if defined(HAS_SCALEFILTERROWS_SSE2)
   2922 #define HAS_SCALEROWDOWN34_SSE2
   2923 // Filter rows 0 and 1 together, 3 : 1
   2924 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
   2925                                       uint8* dst_ptr, int dst_width) {
   2926   ALIGN16(uint8 row[kMaxInputWidth]);
   2927   assert((dst_width % 3 == 0) && (dst_width > 0));
   2928   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
   2929   ScaleFilterCols34_C(dst_ptr, row, dst_width);
   2930 }
   2931 
   2932 // Filter rows 1 and 2 together, 1 : 1
   2933 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
   2934                                       uint8* dst_ptr, int dst_width) {
   2935   ALIGN16(uint8 row[kMaxInputWidth]);
   2936   assert((dst_width % 3 == 0) && (dst_width > 0));
   2937   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
   2938   ScaleFilterCols34_C(dst_ptr, row, dst_width);
   2939 }
   2940 #endif
   2941 
   2942 static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride,
   2943                              uint8* dst, int dst_width) {
   2944   int x;
   2945   assert(dst_width % 3 == 0);
   2946   for (x = 0; x < dst_width; x += 3) {
   2947     dst[0] = src_ptr[0];
   2948     dst[1] = src_ptr[3];
   2949     dst[2] = src_ptr[6];
   2950     dst += 3;
   2951     src_ptr += 8;
   2952   }
   2953 }
   2954 
   2955 // 8x3 -> 3x1
   2956 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
   2957                                    uint8* dst_ptr, int dst_width) {
   2958   int i;
   2959   assert((dst_width % 3 == 0) && (dst_width > 0));
   2960   for (i = 0; i < dst_width; i+=3) {
   2961     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
   2962         src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
   2963         src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
   2964         src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
   2965         (65536 / 9) >> 16;
   2966     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
   2967         src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
   2968         src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
   2969         src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
   2970         (65536 / 9) >> 16;
   2971     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
   2972         src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
   2973         src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
   2974         (65536 / 6) >> 16;
   2975     src_ptr += 8;
   2976     dst_ptr += 3;
   2977   }
   2978 }
   2979 
   2980 // 8x2 -> 3x1
   2981 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
   2982                                    uint8* dst_ptr, int dst_width) {
   2983   int i;
   2984   assert((dst_width % 3 == 0) && (dst_width > 0));
   2985   for (i = 0; i < dst_width; i+=3) {
   2986     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
   2987         src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
   2988         src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
   2989     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
   2990         src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
   2991         src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
   2992     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
   2993         src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
   2994         (65536 / 4) >> 16;
   2995     src_ptr += 8;
   2996     dst_ptr += 3;
   2997   }
   2998 }
   2999 
   3000 // C version 8x2 -> 8x1
   3001 static void ScaleFilterRows_C(uint8* dst_ptr,
   3002                               const uint8* src_ptr, int src_stride,
   3003                               int dst_width, int source_y_fraction) {
   3004   int y1_fraction;
   3005   int y0_fraction;
   3006   const uint8* src_ptr1;
   3007   uint8* end;
   3008   assert(dst_width > 0);
   3009   y1_fraction = source_y_fraction;
   3010   y0_fraction = 256 - y1_fraction;
   3011   src_ptr1 = src_ptr + src_stride;
   3012   end = dst_ptr + dst_width;
   3013   do {
   3014     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
   3015     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
   3016     dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
   3017     dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
   3018     dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
   3019     dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
   3020     dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
   3021     dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
   3022     src_ptr += 8;
   3023     src_ptr1 += 8;
   3024     dst_ptr += 8;
   3025   } while (dst_ptr < end);
   3026   dst_ptr[0] = dst_ptr[-1];
   3027 }
   3028 
   3029 void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
   3030                     uint16* dst_ptr, int src_width, int src_height) {
   3031   int x,y;
   3032   assert(src_width > 0);
   3033   assert(src_height > 0);
   3034   for (x = 0; x < src_width; ++x) {
   3035     const uint8* s = src_ptr + x;
   3036     int sum = 0;
   3037     for (y = 0; y < src_height; ++y) {
   3038       sum += s[0];
   3039       s += src_stride;
   3040     }
   3041     dst_ptr[x] = sum;
   3042   }
   3043 }
   3044 
   3045 /**
   3046  * Scale plane, 1/2
   3047  *
   3048  * This is an optimized version for scaling down a plane to 1/2 of
   3049  * its original size.
   3050  *
   3051  */
   3052 static void ScalePlaneDown2(int src_width, int src_height,
   3053                             int dst_width, int dst_height,
   3054                             int src_stride, int dst_stride,
   3055                             const uint8* src_ptr, uint8* dst_ptr,
   3056                             FilterMode filtering) {
   3057   void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
   3058                         uint8* dst_ptr, int dst_width);
   3059   assert(IS_ALIGNED(src_width, 2));
   3060   assert(IS_ALIGNED(src_height, 2));
   3061 
   3062 #if defined(HAS_SCALEROWDOWN2_NEON)
   3063   if (TestCpuFlag(kCpuHasNEON) &&
   3064       IS_ALIGNED(dst_width, 16)) {
   3065     ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
   3066   } else
   3067 #endif
   3068 #if defined(HAS_SCALEROWDOWN2_SSE2)
   3069   if (TestCpuFlag(kCpuHasSSE2) &&
   3070       IS_ALIGNED(dst_width, 16) &&
   3071       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
   3072       IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
   3073     ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
   3074   } else
   3075 #endif
   3076   {
   3077     ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
   3078   }
   3079 
   3080   {
   3081     int y;
   3082     for (y = 0; y < dst_height; ++y) {
   3083       ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
   3084       src_ptr += (src_stride << 1);
   3085       dst_ptr += dst_stride;
   3086     }
   3087   }
   3088 }
   3089 
   3090 /**
   3091  * Scale plane, 1/4
   3092  *
   3093  * This is an optimized version for scaling down a plane to 1/4 of
   3094  * its original size.
   3095  */
   3096 static void ScalePlaneDown4(int src_width, int src_height,
   3097                             int dst_width, int dst_height,
   3098                             int src_stride, int dst_stride,
   3099                             const uint8* src_ptr, uint8* dst_ptr,
   3100                             FilterMode filtering) {
   3101   void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
   3102                         uint8* dst_ptr, int dst_width);
   3103   assert(IS_ALIGNED(src_width, 4));
   3104   assert(IS_ALIGNED(src_height, 4));
   3105 
   3106 #if defined(HAS_SCALEROWDOWN4_NEON)
   3107   if (TestCpuFlag(kCpuHasNEON) &&
   3108       IS_ALIGNED(dst_width, 4)) {
   3109     ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
   3110   } else
   3111 #endif
   3112 #if defined(HAS_SCALEROWDOWN4_SSE2)
   3113   if (TestCpuFlag(kCpuHasSSE2) &&
   3114       IS_ALIGNED(dst_width, 8) &&
   3115       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
   3116       IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
   3117     ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
   3118   } else
   3119 #endif
   3120   {
   3121     ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
   3122   }
   3123 
   3124   {
   3125     int y;
   3126     for (y = 0; y < dst_height; ++y) {
   3127       ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
   3128       src_ptr += (src_stride << 2);
   3129       dst_ptr += dst_stride;
   3130     }
   3131   }
   3132 }
   3133 
   3134 /**
   3135  * Scale plane, 1/8
   3136  *
   3137  * This is an optimized version for scaling down a plane to 1/8
   3138  * of its original size.
   3139  *
   3140  */
   3141 static void ScalePlaneDown8(int src_width, int src_height,
   3142                             int dst_width, int dst_height,
   3143                             int src_stride, int dst_stride,
   3144                             const uint8* src_ptr, uint8* dst_ptr,
   3145                             FilterMode filtering) {
   3146   void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
   3147                         uint8* dst_ptr, int dst_width);
   3148   assert(IS_ALIGNED(src_width, 8));
   3149   assert(IS_ALIGNED(src_height, 8));
   3150 
   3151 #if defined(HAS_SCALEROWDOWN8_SSE2)
   3152   if (TestCpuFlag(kCpuHasSSE2) &&
   3153       IS_ALIGNED(dst_width, 4) &&
   3154       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
   3155       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
   3156     ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
   3157   } else
   3158 #endif
   3159   {
   3160     ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
   3161         ScaleRowDown8Int_C : ScaleRowDown8_C;
   3162   }
   3163 
   3164   {
   3165     int y;
   3166     for (y = 0; y < dst_height; ++y) {
   3167       ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
   3168       src_ptr += (src_stride << 3);
   3169       dst_ptr += dst_stride;
   3170     }
   3171   }
   3172 }
   3173 
   3174 /**
   3175  * Scale plane down, 3/4
   3176  *
   3177  * Provided by Frank Barchard (fbarchard (at) google.com)
   3178  *
   3179  */
   3180 static void ScalePlaneDown34(int src_width, int src_height,
   3181                              int dst_width, int dst_height,
   3182                              int src_stride, int dst_stride,
   3183                              const uint8* src_ptr, uint8* dst_ptr,
   3184                              FilterMode filtering) {
   3185   void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
   3186                            uint8* dst_ptr, int dst_width);
   3187   void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
   3188                            uint8* dst_ptr, int dst_width);
   3189   assert(dst_width % 3 == 0);
   3190 #if defined(HAS_SCALEROWDOWN34_NEON)
   3191   if (TestCpuFlag(kCpuHasNEON) &&
   3192       (dst_width % 24 == 0)) {
   3193     if (!filtering) {
   3194       ScaleRowDown34_0 = ScaleRowDown34_NEON;
   3195       ScaleRowDown34_1 = ScaleRowDown34_NEON;
   3196     } else {
   3197       ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
   3198       ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
   3199     }
   3200   } else
   3201 #endif
   3202 
   3203 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   3204   if (TestCpuFlag(kCpuHasSSSE3) &&
   3205       (dst_width % 24 == 0) &&
   3206       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
   3207       IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
   3208     if (!filtering) {
   3209       ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
   3210       ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
   3211     } else {
   3212       ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
   3213       ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
   3214     }
   3215   } else
   3216 #endif
   3217 #if defined(HAS_SCALEROWDOWN34_SSE2)
   3218   if (TestCpuFlag(kCpuHasSSE2) &&
   3219       (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
   3220       IS_ALIGNED(dst_stride, 8) &&
   3221       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
   3222       filtering) {
   3223     ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
   3224     ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
   3225   } else
   3226 #endif
   3227   {
   3228     if (!filtering) {
   3229       ScaleRowDown34_0 = ScaleRowDown34_C;
   3230       ScaleRowDown34_1 = ScaleRowDown34_C;
   3231     } else {
   3232       ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
   3233       ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
   3234     }
   3235   }
   3236   {
   3237   int src_row = 0;
   3238     int y;
   3239     for (y = 0; y < dst_height; ++y) {
   3240     switch (src_row) {
   3241       case 0:
   3242         ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
   3243         break;
   3244 
   3245       case 1:
   3246         ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
   3247         break;
   3248 
   3249       case 2:
   3250         ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
   3251                          dst_ptr, dst_width);
   3252         break;
   3253     }
   3254     ++src_row;
   3255     src_ptr += src_stride;
   3256     dst_ptr += dst_stride;
   3257     if (src_row >= 3) {
   3258       src_ptr += src_stride;
   3259       src_row = 0;
   3260     }
   3261   }
   3262 }
   3263 }
   3264 
   3265 /**
   3266  * Scale plane, 3/8
   3267  *
   3268  * This is an optimized version for scaling down a plane to 3/8
   3269  * of its original size.
   3270  *
   3271  * Reduces 16x3 to 6x1
   3272  */
   3273 static void ScalePlaneDown38(int src_width, int src_height,
   3274                              int dst_width, int dst_height,
   3275                              int src_stride, int dst_stride,
   3276                              const uint8* src_ptr, uint8* dst_ptr,
   3277                              FilterMode filtering) {
   3278   void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
   3279                            uint8* dst_ptr, int dst_width);
   3280   void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
   3281                            uint8* dst_ptr, int dst_width);
   3282   assert(dst_width % 3 == 0);
   3283 #if defined(HAS_SCALEROWDOWN38_NEON)
   3284   if (TestCpuFlag(kCpuHasNEON) &&
   3285       (dst_width % 12 == 0)) {
   3286     if (!filtering) {
   3287       ScaleRowDown38_3 = ScaleRowDown38_NEON;
   3288       ScaleRowDown38_2 = ScaleRowDown38_NEON;
   3289     } else {
   3290       ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
   3291       ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
   3292     }
   3293   } else
   3294 #endif
   3295 
   3296 #if defined(HAS_SCALEROWDOWN38_SSSE3)
   3297   if (TestCpuFlag(kCpuHasSSSE3) &&
   3298       (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
   3299       IS_ALIGNED(dst_stride, 8) &&
   3300       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
   3301     if (!filtering) {
   3302       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
   3303       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
   3304     } else {
   3305       ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
   3306       ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
   3307     }
   3308   } else
   3309 #endif
   3310   {
   3311     if (!filtering) {
   3312       ScaleRowDown38_3 = ScaleRowDown38_C;
   3313       ScaleRowDown38_2 = ScaleRowDown38_C;
   3314     } else {
   3315       ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
   3316       ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
   3317     }
   3318   }
   3319   {
   3320   int src_row = 0;
   3321     int y;
   3322     for (y = 0; y < dst_height; ++y) {
   3323     switch (src_row) {
   3324       case 0:
   3325       case 1:
   3326         ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
   3327         src_ptr += src_stride * 3;
   3328         ++src_row;
   3329         break;
   3330 
   3331       case 2:
   3332         ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
   3333         src_ptr += src_stride * 2;
   3334         src_row = 0;
   3335         break;
   3336     }
   3337     dst_ptr += dst_stride;
   3338   }
   3339 }
   3340 }
   3341 
   3342 __inline static uint32 SumBox(int iboxwidth, int iboxheight,
   3343                             int src_stride, const uint8* src_ptr) {
   3344   int x, y;
   3345   uint32 sum;
   3346   assert(iboxwidth > 0);
   3347   assert(iboxheight > 0);
   3348   sum = 0u;
   3349   for (y = 0; y < iboxheight; ++y) {
   3350     for (x = 0; x < iboxwidth; ++x) {
   3351       sum += src_ptr[x];
   3352     }
   3353     src_ptr += src_stride;
   3354   }
   3355   return sum;
   3356 }
   3357 
   3358 static void ScalePlaneBoxRow(int dst_width, int boxheight,
   3359                              int dx, int src_stride,
   3360                              const uint8* src_ptr, uint8* dst_ptr) {
   3361   int x = 0;
   3362   int i;
   3363   for (i = 0; i < dst_width; ++i) {
   3364     int ix = x >> 16;
   3365     int boxwidth;
   3366     x += dx;
   3367     boxwidth = (x >> 16) - ix;
   3368     *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
   3369         (boxwidth * boxheight);
   3370   }
   3371 }
   3372 
   3373 __inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
   3374   uint32 sum;
   3375   int x;
   3376   assert(iboxwidth > 0);
   3377   sum = 0u;
   3378   for (x = 0; x < iboxwidth; ++x) {
   3379     sum += src_ptr[x];
   3380   }
   3381   return sum;
   3382 }
   3383 
   3384 static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
   3385                             const uint16* src_ptr, uint8* dst_ptr) {
   3386   int scaletbl[2];
   3387   int minboxwidth = (dx >> 16);
   3388   scaletbl[0] = 65536 / (minboxwidth * boxheight);
   3389   scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
   3390   {
   3391   int *scaleptr = scaletbl - minboxwidth;
   3392   int x = 0;
   3393     int i;
   3394     for (i = 0; i < dst_width; ++i) {
   3395     int ix = x >> 16;
   3396       int boxwidth;
   3397     x += dx;
   3398       boxwidth = (x >> 16) - ix;
   3399     *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
   3400     }
   3401   }
   3402 }
   3403 
   3404 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
   3405                             const uint16* src_ptr, uint8* dst_ptr) {
   3406   int boxwidth = (dx >> 16);
   3407   int scaleval = 65536 / (boxwidth * boxheight);
   3408   int x = 0;
   3409   int i;
   3410   for (i = 0; i < dst_width; ++i) {
   3411     *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
   3412     x += boxwidth;
   3413   }
   3414 }
   3415 
   3416 /**
   3417  * Scale plane down to any dimensions, with interpolation.
   3418  * (boxfilter).
   3419  *
   3420  * Same method as SimpleScale, which is fixed point, outputting
   3421  * one pixel of destination using fixed point (16.16) to step
   3422  * through source, sampling a box of pixel with simple
   3423  * averaging.
   3424  */
   3425 static void ScalePlaneBox(int src_width, int src_height,
   3426                           int dst_width, int dst_height,
   3427                           int src_stride, int dst_stride,
   3428                           const uint8* src_ptr, uint8* dst_ptr) {
   3429   int dx, dy;
   3430   assert(dst_width > 0);
   3431   assert(dst_height > 0);
   3432   dy = (src_height << 16) / dst_height;
   3433   dx = (src_width << 16) / dst_width;
   3434   if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
   3435       dst_height * 2 > src_height) {
   3436     uint8* dst = dst_ptr;
   3437     int dy = (src_height << 16) / dst_height;
   3438     int dx = (src_width << 16) / dst_width;
   3439     int y = 0;
   3440     int j;
   3441     for (j = 0; j < dst_height; ++j) {
   3442       int iy = y >> 16;
   3443       const uint8* const src = src_ptr + iy * src_stride;
   3444       int boxheight;
   3445       y += dy;
   3446       if (y > (src_height << 16)) {
   3447         y = (src_height << 16);
   3448       }
   3449       boxheight = (y >> 16) - iy;
   3450       ScalePlaneBoxRow(dst_width, boxheight,
   3451                        dx, src_stride,
   3452                        src, dst);
   3453 
   3454       dst += dst_stride;
   3455     }
   3456   } else {
   3457     ALIGN16(uint16 row[kMaxInputWidth]);
   3458     void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
   3459                          uint16* dst_ptr, int src_width, int src_height);
   3460     void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
   3461                          const uint16* src_ptr, uint8* dst_ptr);
   3462 #if defined(HAS_SCALEADDROWS_SSE2)
   3463     if (TestCpuFlag(kCpuHasSSE2) &&
   3464         IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
   3465         IS_ALIGNED(src_width, 16)) {
   3466       ScaleAddRows = ScaleAddRows_SSE2;
   3467     } else
   3468 #endif
   3469     {
   3470       ScaleAddRows = ScaleAddRows_C;
   3471     }
   3472     if (dx & 0xffff) {
   3473       ScaleAddCols = ScaleAddCols2_C;
   3474     } else {
   3475       ScaleAddCols = ScaleAddCols1_C;
   3476     }
   3477 
   3478     {
   3479     int y = 0;
   3480       int j;
   3481       for (j = 0; j < dst_height; ++j) {
   3482       int iy = y >> 16;
   3483       const uint8* const src = src_ptr + iy * src_stride;
   3484         int boxheight;
   3485       y += dy;
   3486       if (y > (src_height << 16)) {
   3487         y = (src_height << 16);
   3488       }
   3489         boxheight = (y >> 16) - iy;
   3490       ScaleAddRows(src, src_stride, row, src_width, boxheight);
   3491       ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
   3492       dst_ptr += dst_stride;
   3493       }
   3494     }
   3495   }
   3496 }
   3497 
   3498 /**
   3499  * Scale plane to/from any dimensions, with interpolation.
   3500  */
   3501 static void ScalePlaneBilinearSimple(int src_width, int src_height,
   3502                                      int dst_width, int dst_height,
   3503                                      int src_stride, int dst_stride,
   3504                                      const uint8* src_ptr, uint8* dst_ptr) {
   3505   int i, j;
   3506   uint8* dst = dst_ptr;
   3507   int dx = (src_width << 16) / dst_width;
   3508   int dy = (src_height << 16) / dst_height;
   3509   int maxx = ((src_width - 1) << 16) - 1;
   3510   int maxy = ((src_height - 1) << 16) - 1;
   3511   int y = (dst_height < src_height) ? 32768 :
   3512       (src_height << 16) / dst_height - 32768;
   3513   for (i = 0; i < dst_height; ++i) {
   3514     int cy = (y < 0) ? 0 : y;
   3515     int yi = cy >> 16;
   3516     int yf = cy & 0xffff;
   3517     const uint8* const src = src_ptr + yi * src_stride;
   3518     int x = (dst_width < src_width) ? 32768 :
   3519         (src_width << 16) / dst_width - 32768;
   3520     for (j = 0; j < dst_width; ++j) {
   3521       int cx = (x < 0) ? 0 : x;
   3522       int xi = cx >> 16;
   3523       int xf = cx & 0xffff;
   3524       int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
   3525       int r1 = (src[xi + src_stride] * (65536 - xf) +
   3526           src[xi + src_stride + 1] * xf) >> 16;
   3527       *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
   3528       x += dx;
   3529       if (x > maxx)
   3530         x = maxx;
   3531     }
   3532     dst += dst_stride - dst_width;
   3533     y += dy;
   3534     if (y > maxy)
   3535       y = maxy;
   3536   }
   3537 }
   3538 
   3539 /**
   3540  * Scale plane to/from any dimensions, with bilinear
   3541  * interpolation.
   3542  */
   3543 static void ScalePlaneBilinear(int src_width, int src_height,
   3544                                int dst_width, int dst_height,
   3545                                int src_stride, int dst_stride,
   3546                                const uint8* src_ptr, uint8* dst_ptr) {
   3547   int dy;
   3548   int dx;
   3549   assert(dst_width > 0);
   3550   assert(dst_height > 0);
   3551   dy = (src_height << 16) / dst_height;
   3552   dx = (src_width << 16) / dst_width;
   3553   if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
   3554     ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
   3555                              src_stride, dst_stride, src_ptr, dst_ptr);
   3556 
   3557   } else {
   3558     ALIGN16(uint8 row[kMaxInputWidth + 1]);
   3559     void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
   3560                             int src_stride,
   3561                             int dst_width, int source_y_fraction);
   3562     void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
   3563                             int dst_width, int dx);
   3564 #if defined(HAS_SCALEFILTERROWS_SSSE3)
   3565     if (TestCpuFlag(kCpuHasSSSE3) &&
   3566         IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
   3567         IS_ALIGNED(src_width, 16)) {
   3568       ScaleFilterRows = ScaleFilterRows_SSSE3;
   3569     } else
   3570 #endif
   3571 #if defined(HAS_SCALEFILTERROWS_SSE2)
   3572     if (TestCpuFlag(kCpuHasSSE2) &&
   3573         IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
   3574         IS_ALIGNED(src_width, 16)) {
   3575       ScaleFilterRows = ScaleFilterRows_SSE2;
   3576     } else
   3577 #endif
   3578     {
   3579       ScaleFilterRows = ScaleFilterRows_C;
   3580     }
   3581     ScaleFilterCols = ScaleFilterCols_C;
   3582 
   3583     {
   3584     int y = 0;
   3585     int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
   3586       int j;
   3587       for (j = 0; j < dst_height; ++j) {
   3588       int iy = y >> 16;
   3589       int fy = (y >> 8) & 255;
   3590       const uint8* const src = src_ptr + iy * src_stride;
   3591       ScaleFilterRows(row, src, src_stride, src_width, fy);
   3592       ScaleFilterCols(dst_ptr, row, dst_width, dx);
   3593       dst_ptr += dst_stride;
   3594       y += dy;
   3595       if (y > maxy) {
   3596         y = maxy;
   3597       }
   3598     }
   3599   }
   3600 }
   3601 }
   3602 
   3603 /**
   3604  * Scale plane to/from any dimensions, without interpolation.
   3605  * Fixed point math is used for performance: The upper 16 bits
   3606  * of x and dx is the integer part of the source position and
   3607  * the lower 16 bits are the fixed decimal part.
   3608  */
   3609 static void ScalePlaneSimple(int src_width, int src_height,
   3610                              int dst_width, int dst_height,
   3611                              int src_stride, int dst_stride,
   3612                              const uint8* src_ptr, uint8* dst_ptr) {
   3613   uint8* dst = dst_ptr;
   3614   int dx = (src_width << 16) / dst_width;
   3615   int y;
   3616   for (y = 0; y < dst_height; ++y) {
   3617     const uint8* const src = src_ptr + (y * src_height / dst_height) *
   3618         src_stride;
   3619     // TODO(fbarchard): Round X coordinate by setting x=0x8000.
   3620     int x = 0;
   3621     int i;
   3622     for (i = 0; i < dst_width; ++i) {
   3623       *dst++ = src[x >> 16];
   3624       x += dx;
   3625     }
   3626     dst += dst_stride - dst_width;
   3627   }
   3628 }
   3629 
   3630 /**
   3631  * Scale plane to/from any dimensions.
   3632  */
   3633 static void ScalePlaneAnySize(int src_width, int src_height,
   3634                               int dst_width, int dst_height,
   3635                               int src_stride, int dst_stride,
   3636                               const uint8* src_ptr, uint8* dst_ptr,
   3637                               FilterMode filtering) {
   3638   if (!filtering) {
   3639     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
   3640                      src_stride, dst_stride, src_ptr, dst_ptr);
   3641   } else {
   3642     // fall back to non-optimized version
   3643     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
   3644                        src_stride, dst_stride, src_ptr, dst_ptr);
   3645   }
   3646 }
   3647 
   3648 /**
   3649  * Scale plane down, any size
   3650  *
   3651  * This is an optimized version for scaling down a plane to any size.
   3652  * The current implementation is ~10 times faster compared to the
   3653  * reference implementation for e.g. XGA->LowResPAL
   3654  *
   3655  */
   3656 static void ScalePlaneDown(int src_width, int src_height,
   3657                            int dst_width, int dst_height,
   3658                            int src_stride, int dst_stride,
   3659                            const uint8* src_ptr, uint8* dst_ptr,
   3660                            FilterMode filtering) {
   3661   if (!filtering) {
   3662     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
   3663                      src_stride, dst_stride, src_ptr, dst_ptr);
   3664   } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
   3665     // between 1/2x and 1x use bilinear
   3666     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
   3667                        src_stride, dst_stride, src_ptr, dst_ptr);
   3668   } else {
   3669     ScalePlaneBox(src_width, src_height, dst_width, dst_height,
   3670                   src_stride, dst_stride, src_ptr, dst_ptr);
   3671   }
   3672 }
   3673 
   3674 /**
   3675  * Copy plane, no scaling
   3676  *
   3677  * This simply copies the given plane without scaling.
   3678  * The current implementation is ~115 times faster
   3679  * compared to the reference implementation.
   3680  *
   3681  */
   3682 static void CopyPlane(int src_width, int src_height,
   3683                       int dst_width, int dst_height,
   3684                       int src_stride, int dst_stride,
   3685                       const uint8* src_ptr, uint8* dst_ptr) {
   3686   if (src_stride == src_width && dst_stride == dst_width) {
   3687     // All contiguous, so can use REALLY fast path.
   3688     memcpy(dst_ptr, src_ptr, src_width * src_height);
   3689   } else {
   3690     // Not all contiguous; must copy scanlines individually
   3691     const uint8* src = src_ptr;
   3692     uint8* dst = dst_ptr;
   3693     int i;
   3694     for (i = 0; i < src_height; ++i) {
   3695       memcpy(dst, src, src_width);
   3696       dst += dst_stride;
   3697       src += src_stride;
   3698     }
   3699   }
   3700 }
   3701 
   3702 static void ScalePlane(const uint8* src, int src_stride,
   3703                        int src_width, int src_height,
   3704                        uint8* dst, int dst_stride,
   3705                        int dst_width, int dst_height,
   3706                        FilterMode filtering, int use_ref) {
   3707   // Use specialized scales to improve performance for common resolutions.
   3708   // For example, all the 1/2 scalings will use ScalePlaneDown2()
   3709   if (dst_width == src_width && dst_height == src_height) {
   3710     // Straight copy.
   3711     CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
   3712               dst_stride, src, dst);
   3713   } else if (dst_width <= src_width && dst_height <= src_height) {
   3714     // Scale down.
   3715     if (use_ref) {
   3716       // For testing, allow the optimized versions to be disabled.
   3717       ScalePlaneDown(src_width, src_height, dst_width, dst_height,
   3718                      src_stride, dst_stride, src, dst, filtering);
   3719     } else if (4 * dst_width == 3 * src_width &&
   3720                4 * dst_height == 3 * src_height) {
   3721       // optimized, 3/4
   3722       ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
   3723                        src_stride, dst_stride, src, dst, filtering);
   3724     } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
   3725       // optimized, 1/2
   3726       ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
   3727                       src_stride, dst_stride, src, dst, filtering);
   3728     // 3/8 rounded up for odd sized chroma height.
   3729     } else if (8 * dst_width == 3 * src_width &&
   3730                dst_height == ((src_height * 3 + 7) / 8)) {
   3731       // optimized, 3/8
   3732       ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
   3733                        src_stride, dst_stride, src, dst, filtering);
   3734     } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
   3735       // optimized, 1/4
   3736       ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
   3737                       src_stride, dst_stride, src, dst, filtering);
   3738     } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
   3739       // optimized, 1/8
   3740       ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
   3741                       src_stride, dst_stride, src, dst, filtering);
   3742     } else {
   3743       // Arbitrary downsample
   3744       ScalePlaneDown(src_width, src_height, dst_width, dst_height,
   3745                      src_stride, dst_stride, src, dst, filtering);
   3746     }
   3747   } else {
   3748     // Arbitrary scale up and/or down.
   3749     ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
   3750                       src_stride, dst_stride, src, dst, filtering);
   3751   }
   3752 }
   3753 
   3754 /**
   3755  * Scale a plane.
   3756  *
   3757  * This function in turn calls a scaling function
   3758  * suitable for handling the desired resolutions.
   3759  *
   3760  */
   3761 
   3762 int I420Scale(const uint8* src_y, int src_stride_y,
   3763               const uint8* src_u, int src_stride_u,
   3764               const uint8* src_v, int src_stride_v,
   3765               int src_width, int src_height,
   3766               uint8* dst_y, int dst_stride_y,
   3767               uint8* dst_u, int dst_stride_u,
   3768               uint8* dst_v, int dst_stride_v,
   3769               int dst_width, int dst_height,
   3770               FilterMode filtering) {
   3771   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
   3772       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
   3773     return -1;
   3774   }
   3775   // Negative height means invert the image.
   3776   if (src_height < 0) {
   3777     int halfheight;
   3778     src_height = -src_height;
   3779     halfheight = (src_height + 1) >> 1;
   3780     src_y = src_y + (src_height - 1) * src_stride_y;
   3781     src_u = src_u + (halfheight - 1) * src_stride_u;
   3782     src_v = src_v + (halfheight - 1) * src_stride_v;
   3783     src_stride_y = -src_stride_y;
   3784     src_stride_u = -src_stride_u;
   3785     src_stride_v = -src_stride_v;
   3786   }
   3787   {
   3788   int src_halfwidth = (src_width + 1) >> 1;
   3789   int src_halfheight = (src_height + 1) >> 1;
   3790   int dst_halfwidth = (dst_width + 1) >> 1;
   3791   int dst_halfheight = (dst_height + 1) >> 1;
   3792 
   3793     ScalePlane(src_y, src_stride_y, src_width, src_height,
   3794                dst_y, dst_stride_y, dst_width, dst_height,
   3795                filtering, use_reference_impl_);
   3796   ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
   3797              dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
   3798              filtering, use_reference_impl_);
   3799   ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
   3800              dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
   3801              filtering, use_reference_impl_);
   3802   }
   3803   return 0;
   3804 }
   3805 
   3806 // Deprecated api
   3807 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
   3808           int src_stride_y, int src_stride_u, int src_stride_v,
   3809           int src_width, int src_height,
   3810           uint8* dst_y, uint8* dst_u, uint8* dst_v,
   3811           int dst_stride_y, int dst_stride_u, int dst_stride_v,
   3812           int dst_width, int dst_height,
   3813           int interpolate) {
   3814   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
   3815       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
   3816     return -1;
   3817   }
   3818   // Negative height means invert the image.
   3819   if (src_height < 0) {
   3820     int halfheight;
   3821     src_height = -src_height;
   3822     halfheight = (src_height + 1) >> 1;
   3823     src_y = src_y + (src_height - 1) * src_stride_y;
   3824     src_u = src_u + (halfheight - 1) * src_stride_u;
   3825     src_v = src_v + (halfheight - 1) * src_stride_v;
   3826     src_stride_y = -src_stride_y;
   3827     src_stride_u = -src_stride_u;
   3828     src_stride_v = -src_stride_v;
   3829   }
   3830   {
   3831   int src_halfwidth = (src_width + 1) >> 1;
   3832   int src_halfheight = (src_height + 1) >> 1;
   3833   int dst_halfwidth = (dst_width + 1) >> 1;
   3834   int dst_halfheight = (dst_height + 1) >> 1;
   3835   FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
   3836 
   3837   ScalePlane(src_y, src_stride_y, src_width, src_height,
   3838              dst_y, dst_stride_y, dst_width, dst_height,
   3839              filtering, use_reference_impl_);
   3840   ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
   3841              dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
   3842              filtering, use_reference_impl_);
   3843   ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
   3844              dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
   3845              filtering, use_reference_impl_);
   3846   }
   3847   return 0;
   3848 }
   3849 
   3850 // Deprecated api
   3851 int ScaleOffset(const uint8* src, int src_width, int src_height,
   3852                 uint8* dst, int dst_width, int dst_height, int dst_yoffset,
   3853           int interpolate) {
   3854   if (!src || src_width <= 0 || src_height <= 0 ||
   3855       !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
   3856       dst_yoffset >= dst_height) {
   3857     return -1;
   3858   }
   3859   dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
   3860   {
   3861   int src_halfwidth = (src_width + 1) >> 1;
   3862   int src_halfheight = (src_height + 1) >> 1;
   3863   int dst_halfwidth = (dst_width + 1) >> 1;
   3864   int dst_halfheight = (dst_height + 1) >> 1;
   3865   int aheight = dst_height - dst_yoffset * 2;  // actual output height
   3866   const uint8* const src_y = src;
   3867   const uint8* const src_u = src + src_width * src_height;
   3868   const uint8* const src_v = src + src_width * src_height +
   3869                              src_halfwidth * src_halfheight;
   3870   uint8* dst_y = dst + dst_yoffset * dst_width;
   3871   uint8* dst_u = dst + dst_width * dst_height +
   3872                  (dst_yoffset >> 1) * dst_halfwidth;
   3873   uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
   3874                  (dst_yoffset >> 1) * dst_halfwidth;
   3875   return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
   3876                src_width, src_height, dst_y, dst_u, dst_v, dst_width,
   3877                dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
   3878   }
   3879 }
   3880 
   3881 #ifdef __cplusplus
   3882 }  // extern "C"
   3883 }  // namespace libyuv
   3884 #endif
   3885