/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"

#include <assert.h>
#include <string.h>

#include "libyuv/cpu_id.h"

#if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var
#else
#define ALIGN16(var) var __attribute__((aligned(16)))
#endif
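
// Illustrative use of the macro above (hypothetical buffer, not used in this
// file): both expansions give the compiler a 16-byte aligned definition, so
// the buffer can be used with aligned SSE2/NEON loads such as movdqa.
//
//   ALIGN16(uint8 row_buffer[64]);  // 16-byte aligned on MSVC and GCC/Clang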

// Note: A Neon reference manual
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf

namespace libyuv {

// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
// when comparing the quality of the resulting YUV planes
// as produced by the optimized and non-optimized versions.

static bool use_reference_impl_ = false;

void SetUseReferenceImpl(bool use) {
  use_reference_impl_ = use;
}

/**
 * NEON downscalers with interpolation.
 *
 * Provided by Fritz Koenig
 *
 */

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
                        uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "1:\n"
    "vld2.u8    {q0,q1}, [%0]!    \n"  // load even pixels into q0, odd into q1
    "vst1.u8    {q0}, [%1]!       \n"  // store even pixels
    "subs       %2, %2, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst),              // %1
      "+r"(dst_width)         // %2
    :
    : "q0", "q1"              // Clobber List
  );
}
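
// For reference, a scalar sketch of what the NEON kernel above computes
// (hypothetical helper, shown only to document the behavior): 2x point
// sampling that keeps the even pixels and discards the odd ones.
//
//   static void ScaleRowDown2_Ref(const uint8* src_ptr, uint8* dst,
//                                 int dst_width) {
//     for (int x = 0; x < dst_width; ++x) {
//       dst[x] = src_ptr[x * 2];  // keep even pixel, drop odd pixel
//     }
//   }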

void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                           uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #2            \n"  // rounding constant
    "add        %1, %0            \n"  // change the stride to row 2 pointer
    "vdup.16    q4, r4            \n"
    "1:\n"
    "vld1.u8    {q0,q1}, [%0]!    \n"  // load row 1 and post increment
    "vld1.u8    {q2,q3}, [%1]!    \n"  // load row 2 and post increment
    "vpaddl.u8  q0, q0            \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1            \n"
    "vpadal.u8  q0, q2            \n"  // row 2 add adjacent, add row 1 to row 2
    "vpadal.u8  q1, q3            \n"
    "vadd.u16   q0, q4            \n"  // rounding
    "vadd.u16   q1, q4            \n"
    "vshrn.u16  d0, q0, #2        \n"  // downshift and pack
    "vshrn.u16  d1, q1, #2        \n"
    "vst1.u8    {q0}, [%2]!       \n"
    "subs       %3, %3, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(src_stride),       // %1
      "+r"(dst),              // %2
      "+r"(dst_width)         // %3
    :
    : "r4", "q0", "q1", "q2", "q3", "q4"              // Clobber List
  );
}
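
// The kernel above is a 2x2 box filter with rounding; a scalar sketch
// (hypothetical helper, for documentation only):
//
//   static void ScaleRowDown2Box_Ref(const uint8* src_ptr, int src_stride,
//                                    uint8* dst, int dst_width) {
//     const uint8* s = src_ptr;
//     const uint8* t = src_ptr + src_stride;  // row below
//     for (int x = 0; x < dst_width; ++x) {
//       dst[x] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // average 4 pixels
//       s += 2;
//       t += 2;
//     }
//   }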

#define HAS_SCALEROWDOWN4_NEON
// Expecting widths on arm devices to be smaller.  Went with 8x4 blocks
//  to get most coverage.  Plan to go back and evaluate 16x4 blocks with
//  handling of leftovers.
static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
                               uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #4            \n"
    "1:                           \n"
    "vld1.u8    {d0[0]}, [%0],r4  \n"   // load up only 2 pixels of data to
    "vld1.u8    {d0[1]}, [%0],r4  \n"   //  represent the entire 8x4 block

    "vst1.u16   {d0[0]}, [%1]!    \n"

    "subs       %2, #2            \n"   // dst_width -= 2
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "r4", "q0", "q1", "memory", "cc"
  );
}

static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "1:                           \n"
    "mov        r4, %0            \n"
    "vld1.u8    {d0}, [r4],%3     \n"   // load up 8x4 block of input data
    "vld1.u8    {d1}, [r4],%3     \n"
    "vld1.u8    {d2}, [r4],%3     \n"
    "vld1.u8    {d3}, [r4]        \n"

    // data is loaded up into q0 and q1
    // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
    // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
    // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
    "vpaddl.u8  q0, q0            \n"

    // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
    // d1 = a10+a11+a30+a31 a12+a13+a32+a33 b10+b11+b30+b31 b12+b13+b32+b33
    "vpadal.u8  q0, q1            \n"

    // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
    // d1 = a10+a11+a30+a31+a12+a13+a32+a33 b10+b11+b30+b31+b12+b13+b32+b33
    "vpaddl.u16 q0, q0            \n"

    // d0 = sum of all 16 a pixels (rows 0..3, columns 0..3)
    //      sum of all 16 b pixels
    "vadd.u32   d0, d1            \n"

    "vrshr.u32  d0, d0, #4        \n"   // divide by 16 w/rounding

    "vst1.u8    {d0[0]}, [%1]!    \n"
    "vst1.u8    {d0[4]}, [%1]!    \n"

    "add        %0, #8            \n"   // move src pointer to next 8 pixels
    "subs       %2, #2            \n"   // dst_width -= 2
    "bhi        1b                \n"

    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    : "r"(src_stride)         // %3
    : "r4", "q0", "q1", "memory", "cc"
  );
}
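
// Scalar sketch of the 4x4 box reduction above (hypothetical helper): each
// output pixel is the rounded average of a 4x4 block of input pixels.
//
//   static void ScaleRowDown4Box_Ref(const uint8* src_ptr, int src_stride,
//                                    uint8* dst_ptr, int dst_width) {
//     for (int x = 0; x < dst_width; ++x) {
//       uint32 sum = 0;
//       for (int dy = 0; dy < 4; ++dy) {
//         for (int dx = 0; dx < 4; ++dx) {
//           sum += src_ptr[dy * src_stride + x * 4 + dx];
//         }
//       }
//       dst_ptr[x] = (sum + 8) >> 4;  // divide by 16 with rounding
//     }
//   }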

/**
 * SSE2 downscalers with interpolation.
 *
 * Provided by Frank Barchard (fbarchard@google.com)
 *
 */

// Constants for SSE2 code
#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(OSX)
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif

// Offsets for source bytes 0 to 9
extern "C" TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
extern "C" TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
extern "C" TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
extern "C" TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
extern "C" TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
extern "C" TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
extern "C" TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant, added before the final shift right by 2
extern "C" TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

extern "C" TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

extern "C" TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
extern "C" TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
extern "C" TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
extern "C" TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
extern "C" TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
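
// How the shuffle masks above are used: pshufb replaces each byte of its
// destination with the source byte selected by the mask, and any index with
// the high bit set (the 128 entries) yields zero. A scalar sketch of that
// semantic (hypothetical helper):
//
//   static void PShufB_Ref(const uint8 src[16], const uint8 mask[16],
//                          uint8 dst[16]) {
//     for (int i = 0; i < 16; ++i) {
//       dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
//     }
//   }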

#if defined(WIN32) && !defined(COVERAGE_ENABLED)

#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    pop        esi
    ret
  }
}

#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x000000ff
    psrld      xmm7, 24

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8
    lea        edx, [ebx + ebx * 2]  // src_stride * 3

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}

#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask isolating 1 byte per 8 bytes
    psrlq      xmm7, 56

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1  // 32->16
    packuswb   xmm0, xmm0  // 16->8
    packuswb   xmm0, xmm0  // 8->4
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}

// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    lea        edx, [ebx + ebx * 2]  // src_stride * 3
    pxor       xmm7, xmm7

  wloop:
    movdqa     xmm0, [esi]           // average 8 rows to 1
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        ebp, [esi + ebx * 4]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, [ebp]
    movdqa     xmm3, [ebp + 16]
    movdqa     xmm4, [ebp + ebx]
    movdqa     xmm5, [ebp + ebx + 16]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    movdqa     xmm4, [ebp + ebx * 2]
    movdqa     xmm5, [ebp + ebx * 2 + 16]
    movdqa     xmm6, [ebp + edx]
    pavgb      xmm4, xmm6
    movdqa     xmm6, [ebp + edx + 16]
    pavgb      xmm5, xmm6
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    psadbw     xmm0, xmm7            // average 32 pixels to 4
    psadbw     xmm1, xmm7
    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    por        xmm0, xmm1            //      -> 3201
    psrlw      xmm0, 3
    packuswb   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0

    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
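
// Note on the psadbw trick above: psadbw against a zero register sums the
// absolute values of 8 bytes into one 16-bit result per qword, so each of
// the four sums covers an 8-pixel strip that pavgb has already reduced from
// 8 rows; the psrlw by 3 then divides by 8 to finish the 8x8 box average.
// A scalar sketch of one output pixel (hypothetical helper):
//
//   static uint8 Box8x8_Ref(const uint8* src, int src_stride) {
//     uint32 sum = 0;
//     for (int y = 0; y < 8; ++y) {
//       for (int x = 0; x < 8; ++x) {
//         sum += src[y * src_stride + x];
//       }
//     }
//     return static_cast<uint8>(sum / 64);  // asm approximates the row
//   }                                       // averages via rounded pavgb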

#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm3, _shuf0
    movdqa     xmm4, _shuf1
    movdqa     xmm5, _shuf2

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + 16]
    lea        esi,  [esi + 32]
    movdqa     xmm1, xmm2
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edi], xmm0
    movq       qword ptr [edi + 8], xmm1
    movq       qword ptr [edi + 16], xmm2
    lea        edi, [edi + 24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 round34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
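
// The 3/4 filters above produce 3 output pixels from every 4 input pixels
// using the weights in madd01/madd11/madd21: (3,1), (2,2) and (1,3), with
// +2 rounding and a shift right by 2. A scalar sketch for one group of 4
// pixels (hypothetical helper, single-row case):
//
//   static void ScaleRowDown34_Ref(const uint8* s, uint8* d) {
//     d[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
//     d[1] = (s[1] * 2 + s[2] * 2 + 2) >> 2;
//     d[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
//   }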

#define HAS_SCALEROWDOWN38_SSSE3
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm5, _shuf38a
    movdqa     xmm6, _shuf38b
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        esi, [esi + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm6
    paddusb    xmm0, xmm1

    movq       qword ptr [edi], xmm0 // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edi + 8], xmm1
    lea        edi, [edi + 12]
    sub        ecx, 12
    ja         xloop

    popad
    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm4, _shufac0
    movdqa     xmm5, _shufac3
    movdqa     xmm6, _scaleac3
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
    movdqa     xmm2, [esi + edx]
    movhlps    xmm1, xmm0
    movhlps    xmm3, xmm2
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3
    movdqa     xmm2, [esi + edx * 2]
    lea        esi, [esi + 16]
    movhlps    xmm3, xmm2
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3

    movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
    psrldq     xmm0, 2
    paddusw    xmm2, xmm0
    psrldq     xmm0, 2
    paddusw    xmm2, xmm0
    pshufb     xmm2, xmm4

    movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    pshufb     xmm3, xmm5
    paddusw    xmm2, xmm3

    pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
    packuswb   xmm2, xmm2

    movd       [edi], xmm2           // write 6 pixels
    pextrw     eax, xmm2, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm4, _shufab0
    movdqa     xmm5, _shufab1
    movdqa     xmm6, _shufab2
    movdqa     xmm7, _scaleab2

  xloop:
    movdqa     xmm2, [esi]           // average 2 rows into xmm2
    pavgb      xmm2, [esi + edx]
    lea        esi, [esi + 16]

    movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmm2
    pshufb     xmm1, xmm5
    paddusw    xmm0, xmm1
    pshufb     xmm2, xmm6
    paddusw    xmm0, xmm2

    pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
    packuswb   xmm0, xmm0

    movd       [edi], xmm0           // write 6 pixels
    pextrw     eax, xmm0, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}
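
// The pmulhuw "divides" above use 16-bit fixed-point reciprocals: scaleac3
// and scaleab2 hold values such as 65536 / 9 and 65536 / 3, and pmulhuw
// keeps the high 16 bits of the 32-bit product, so
//
//   result = (sum * (65536 / n)) >> 16   // approximately sum / n
//
// avoiding an integer divide per pixel. Because 65536 / n is truncated, the
// result can come out one less than an exact division when the sum is an
// exact multiple of n; for the small sums involved here the error is at
// most 1.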

#define HAS_SCALEADDROWS_SSE2

// Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // src_width
    mov        ebx, [esp + 32 + 20]  // src_height
    pxor       xmm7, xmm7
    dec        ebx

  xloop:
    // first row
    movdqa     xmm2, [esi]
    lea        eax, [esi + edx]
    movhlps    xmm3, xmm2
    mov        ebp, ebx
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7

    // sum remaining rows
  yloop:
    movdqa     xmm0, [eax]       // read 16 pixels
    lea        eax, [eax + edx]  // advance to next row
    movhlps    xmm1, xmm0
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    paddusw    xmm2, xmm0        // sum 16 words
    paddusw    xmm3, xmm1
    sub        ebp, 1
    ja         yloop

    movdqa     [edi], xmm2
    movdqa     [edi + 16], xmm3
    lea        edi, [edi + 32]
    lea        esi, [esi + 16]

    sub        ecx, 16
    ja         xloop

    popad
    ret
  }
}
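
// Scalar sketch of the column summing above (hypothetical helper): each
// destination short is the sum of src_height vertically adjacent bytes,
// which the caller can later divide to complete a box filter.
//
//   static void ScaleAddRows_Ref(const uint8* src_ptr, int src_stride,
//                                uint16* dst_ptr, int src_width,
//                                int src_height) {
//     for (int x = 0; x < src_width; ++x) {
//       uint16 sum = 0;
//       for (int y = 0; y < src_height; ++y) {
//         sum += src_ptr[y * src_stride + x];  // asm saturates via paddusw
//       }
//       dst_ptr[x] = sum;
//     }
//   }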

// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked)
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                 int src_stride, int dst_width,
                                 int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1               // 0 fraction: copy row 0
    cmp        eax, 128
    je         xloop2               // 1/2 fraction: average the two rows

    movd       xmm6, eax            // xmm6 = y fraction
    punpcklwd  xmm6, xmm6
    pshufd     xmm6, xmm6, 0
    neg        eax                  // xmm5 = 256 - y fraction
    add        eax, 256
    movd       xmm5, eax
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm2, xmm7
    punpckhbw  xmm1, xmm7
    punpckhbw  xmm3, xmm7
    pmullw     xmm0, xmm5           // scale row 0
    pmullw     xmm1, xmm5
    pmullw     xmm2, xmm6           // scale row 1
    pmullw     xmm3, xmm6
    paddusw    xmm0, xmm2           // sum rows
    paddusw    xmm1, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]        // duplicate last pixel past the end
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]        // duplicate last pixel past the end
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]        // duplicate last pixel past the end
    mov        [edi], al
    pop        edi
    pop        esi
    ret
  }
}
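
// Scalar sketch of the general bilinear case above (hypothetical helper):
// each output pixel blends the two source rows with 8-bit fixed-point
// weights (256 - f) and f, then shifts the 16-bit sum back down.
//
//   static void ScaleFilterRows_Ref(uint8* dst_ptr, const uint8* src_ptr,
//                                   int src_stride, int dst_width,
//                                   int source_y_fraction) {
//     int f1 = source_y_fraction;        // weight of the lower row
//     int f0 = 256 - f1;                 // weight of the upper row
//     for (int x = 0; x < dst_width; ++x) {
//       dst_ptr[x] = (src_ptr[x] * f0 + src_ptr[src_stride + x] * f1) >> 8;
//     }
//     dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // duplicate last pixel
//   }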

// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked)
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1               // 0 fraction: copy row 0
    cmp        eax, 128
    je         xloop2               // 1/2 fraction: average the two rows

    shr        eax, 1               // 7 bit fraction for pmaddubsw
    mov        ah, al               // ah = f/2 (weight of row 1)
    neg        al
    add        al, 128              // al = 128 - f/2 (weight of row 0)
    movd       xmm7, eax
    punpcklwd  xmm7, xmm7
    pshufd     xmm7, xmm7, 0

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]        // duplicate last pixel past the end
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]        // duplicate last pixel past the end
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]        // duplicate last pixel past the end
    mov        [edi], al
    pop        edi
    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    int dst_width) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width
    movdqa     xmm1, _round34
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _madd21

  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax+8]         // pixels 8..15
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+8], xmm0
    movdqa     xmm0, [eax+16]        // pixels 16..23
    lea        eax, [eax+32]
    pshufb     xmm0, xmm4
    pmaddubsw  xmm0, xmm7
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+16], xmm0
    lea        edx, [edx+24]
    sub        ecx, 24
    ja         wloop
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEROWDOWN2_SSE2
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%3,1),%%xmm2\n"
  "movdqa     0x10(%0,%3,1),%%xmm3\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "psrlw      $0x8,%%xmm1\n"
  "pand       %%xmm7,%%xmm2\n"
  "pand       %%xmm7,%%xmm3\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "pavgw      %%xmm3,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"(static_cast<intptr_t>(src_stride))   // %3
  : "memory"
);
}

#define HAS_SCALEROWDOWN4_SSE2
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrld      $0x18,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  intptr_t temp = 0;
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
  "lea        (%4,%4,2),%3\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%4,1),%%xmm2\n"
  "movdqa     0x10(%0,%4,1),%%xmm3\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     (%0,%4,2),%%xmm2\n"
  "movdqa     0x10(%0,%4,2),%%xmm3\n"
  "movdqa     (%0,%3,1),%%xmm4\n"
  "movdqa     0x10(%0,%3,1),%%xmm5\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm4,%%xmm2\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm5,%%xmm3\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "psrlw      $0x8,%%xmm1\n"
  "pand       %%xmm7,%%xmm2\n"
  "pand       %%xmm7,%%xmm3\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "pavgw      %%xmm3,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "pand       %%xmm7,%%xmm2\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(temp)         // %3
  : "r"(static_cast<intptr_t>(src_stride))    // %4
  : "memory"
);
}

#define HAS_SCALEROWDOWN8_SSE2
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlq      $0x38,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movd       %%xmm0,(%1)\n"
  "lea        0x4(%1),%1\n"
  "sub        $0x4,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

#if defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown8Int_SSE2\n"
"_ScaleRowDown8Int_SSE2:\n"
#else
    ".global ScaleRowDown8Int_SSE2\n"
"ScaleRowDown8Int_SSE2:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%ebx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "lea    (%ebx,%ebx,2),%edx\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa 0x10(%esi),%xmm1\n"
    "movdqa (%esi,%ebx,1),%xmm2\n"
    "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "movdqa (%esi,%ebx,2),%xmm2\n"
    "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
    "movdqa (%esi,%edx,1),%xmm4\n"
    "movdqa 0x10(%esi,%edx,1),%xmm5\n"
    "lea    (%esi,%ebx,4),%ebp\n"
    "lea    0x20(%esi),%esi\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "movdqa 0x0(%ebp),%xmm2\n"
    "movdqa 0x10(%ebp),%xmm3\n"
    "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
    "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
    "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
    "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
    "pavgb  %xmm6,%xmm4\n"
    "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
    "pavgb  %xmm6,%xmm5\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "psadbw %xmm7,%xmm0\n"
    "psadbw %xmm7,%xmm1\n"
    "pshufd $0xd8,%xmm0,%xmm0\n"
    "pshufd $0x8d,%xmm1,%xmm1\n"
    "por    %xmm1,%xmm0\n"
    "psrlw  $0x3,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movd   %xmm0,(%edi)\n"
    "lea    0x4(%edi),%edi\n"
    "sub    $0x4,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);
   1355 
   1356 // fpic is used for magiccam plugin
   1357 #if !defined(__PIC__)
   1358 #define HAS_SCALEROWDOWN34_SSSE3
   1359 extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
   1360                                      uint8* dst_ptr, int dst_width);
   1361   asm(
   1362     ".text\n"
   1363 #if defined(OSX)
   1364     ".globl _ScaleRowDown34_SSSE3\n"
   1365 "_ScaleRowDown34_SSSE3:\n"
   1366 #else
   1367     ".global ScaleRowDown34_SSSE3\n"
   1368 "ScaleRowDown34_SSSE3:\n"
   1369 #endif
   1370     "pusha\n"
   1371     "mov    0x24(%esp),%esi\n"
   1372     "mov    0x2c(%esp),%edi\n"
   1373     "mov    0x30(%esp),%ecx\n"
   1374     "movdqa _shuf0,%xmm3\n"
   1375     "movdqa _shuf1,%xmm4\n"
   1376     "movdqa _shuf2,%xmm5\n"
   1377 
   1378 "1:"
   1379     "movdqa (%esi),%xmm0\n"
   1380     "movdqa 0x10(%esi),%xmm2\n"
   1381     "lea    0x20(%esi),%esi\n"
   1382     "movdqa %xmm2,%xmm1\n"
   1383     "palignr $0x8,%xmm0,%xmm1\n"
   1384     "pshufb %xmm3,%xmm0\n"
   1385     "pshufb %xmm4,%xmm1\n"
   1386     "pshufb %xmm5,%xmm2\n"
   1387     "movq   %xmm0,(%edi)\n"
   1388     "movq   %xmm1,0x8(%edi)\n"
   1389     "movq   %xmm2,0x10(%edi)\n"
   1390     "lea    0x18(%edi),%edi\n"
   1391     "sub    $0x18,%ecx\n"
   1392     "ja     1b\n"
   1393     "popa\n"
   1394     "ret\n"
   1395 );
   1396 
   1397 extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1398                                            uint8* dst_ptr, int dst_width);
   1399   asm(
   1400     ".text\n"
   1401 #if defined(OSX)
   1402     ".globl _ScaleRowDown34_1_Int_SSSE3\n"
   1403 "_ScaleRowDown34_1_Int_SSSE3:\n"
   1404 #else
   1405     ".global ScaleRowDown34_1_Int_SSSE3\n"
   1406 "ScaleRowDown34_1_Int_SSSE3:\n"
   1407 #endif
   1408     "pusha\n"
   1409     "mov    0x24(%esp),%esi\n"
   1410     "mov    0x28(%esp),%ebp\n"
   1411     "mov    0x2c(%esp),%edi\n"
   1412     "mov    0x30(%esp),%ecx\n"
   1413     "movdqa _shuf01,%xmm2\n"
   1414     "movdqa _shuf11,%xmm3\n"
   1415     "movdqa _shuf21,%xmm4\n"
   1416     "movdqa _madd01,%xmm5\n"
   1417     "movdqa _madd11,%xmm6\n"
   1418     "movdqa _round34,%xmm7\n"
   1419 
   1420 "1:"
   1421     "movdqa (%esi),%xmm0\n"
   1422     "movdqa (%esi,%ebp),%xmm1\n"
   1423     "pavgb  %xmm1,%xmm0\n"
   1424     "pshufb %xmm2,%xmm0\n"
   1425     "pmaddubsw %xmm5,%xmm0\n"
   1426     "paddsw %xmm7,%xmm0\n"
   1427     "psrlw  $0x2,%xmm0\n"
   1428     "packuswb %xmm0,%xmm0\n"
   1429     "movq   %xmm0,(%edi)\n"
   1430     "movdqu 0x8(%esi),%xmm0\n"
   1431     "movdqu 0x8(%esi,%ebp),%xmm1\n"
   1432     "pavgb  %xmm1,%xmm0\n"
   1433     "pshufb %xmm3,%xmm0\n"
   1434     "pmaddubsw %xmm6,%xmm0\n"
   1435     "paddsw %xmm7,%xmm0\n"
   1436     "psrlw  $0x2,%xmm0\n"
   1437     "packuswb %xmm0,%xmm0\n"
   1438     "movq   %xmm0,0x8(%edi)\n"
   1439     "movdqa 0x10(%esi),%xmm0\n"
   1440     "movdqa 0x10(%esi,%ebp),%xmm1\n"
   1441     "lea    0x20(%esi),%esi\n"
   1442     "pavgb  %xmm1,%xmm0\n"
   1443     "pshufb %xmm4,%xmm0\n"
   1444     "movdqa  _madd21,%xmm1\n"
   1445     "pmaddubsw %xmm1,%xmm0\n"
   1446     "paddsw %xmm7,%xmm0\n"
   1447     "psrlw  $0x2,%xmm0\n"
   1448     "packuswb %xmm0,%xmm0\n"
   1449     "movq   %xmm0,0x10(%edi)\n"
   1450     "lea    0x18(%edi),%edi\n"
   1451     "sub    $0x18,%ecx\n"
   1452     "ja     1b\n"
   1453 
   1454     "popa\n"
   1455     "ret\n"
   1456 );
   1457 
   1458 extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1459                                            uint8* dst_ptr, int dst_width);
   1460   asm(
   1461     ".text\n"
   1462 #if defined(OSX)
   1463     ".globl _ScaleRowDown34_0_Int_SSSE3\n"
   1464 "_ScaleRowDown34_0_Int_SSSE3:\n"
   1465 #else
   1466     ".global ScaleRowDown34_0_Int_SSSE3\n"
   1467 "ScaleRowDown34_0_Int_SSSE3:\n"
   1468 #endif
   1469     "pusha\n"
   1470     "mov    0x24(%esp),%esi\n"
   1471     "mov    0x28(%esp),%ebp\n"
   1472     "mov    0x2c(%esp),%edi\n"
   1473     "mov    0x30(%esp),%ecx\n"
   1474     "movdqa _shuf01,%xmm2\n"
   1475     "movdqa _shuf11,%xmm3\n"
   1476     "movdqa _shuf21,%xmm4\n"
   1477     "movdqa _madd01,%xmm5\n"
   1478     "movdqa _madd11,%xmm6\n"
   1479     "movdqa _round34,%xmm7\n"
   1480 
   1481 "1:"
   1482     "movdqa (%esi),%xmm0\n"
   1483     "movdqa (%esi,%ebp,1),%xmm1\n"
   1484     "pavgb  %xmm0,%xmm1\n"
   1485     "pavgb  %xmm1,%xmm0\n"
   1486     "pshufb %xmm2,%xmm0\n"
   1487     "pmaddubsw %xmm5,%xmm0\n"
   1488     "paddsw %xmm7,%xmm0\n"
   1489     "psrlw  $0x2,%xmm0\n"
   1490     "packuswb %xmm0,%xmm0\n"
   1491     "movq   %xmm0,(%edi)\n"
   1492     "movdqu 0x8(%esi),%xmm0\n"
   1493     "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
   1494     "pavgb  %xmm0,%xmm1\n"
   1495     "pavgb  %xmm1,%xmm0\n"
   1496     "pshufb %xmm3,%xmm0\n"
   1497     "pmaddubsw %xmm6,%xmm0\n"
   1498     "paddsw %xmm7,%xmm0\n"
   1499     "psrlw  $0x2,%xmm0\n"
   1500     "packuswb %xmm0,%xmm0\n"
   1501     "movq   %xmm0,0x8(%edi)\n"
   1502     "movdqa 0x10(%esi),%xmm0\n"
   1503     "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
   1504     "lea    0x20(%esi),%esi\n"
   1505     "pavgb  %xmm0,%xmm1\n"
   1506     "pavgb  %xmm1,%xmm0\n"
   1507     "pshufb %xmm4,%xmm0\n"
   1508     "movdqa  _madd21,%xmm1\n"
   1509     "pmaddubsw %xmm1,%xmm0\n"
   1510     "paddsw %xmm7,%xmm0\n"
   1511     "psrlw  $0x2,%xmm0\n"
   1512     "packuswb %xmm0,%xmm0\n"
   1513     "movq   %xmm0,0x10(%edi)\n"
   1514     "lea    0x18(%edi),%edi\n"
   1515     "sub    $0x18,%ecx\n"
   1516     "ja     1b\n"
   1517     "popa\n"
   1518     "ret\n"
   1519 );
   1520 
   1521 #define HAS_SCALEROWDOWN38_SSSE3
   1522 extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
   1523                                      uint8* dst_ptr, int dst_width);
   1524   asm(
   1525     ".text\n"
   1526 #if defined(OSX)
   1527     ".globl _ScaleRowDown38_SSSE3\n"
   1528 "_ScaleRowDown38_SSSE3:\n"
   1529 #else
   1530     ".global ScaleRowDown38_SSSE3\n"
   1531 "ScaleRowDown38_SSSE3:\n"
   1532 #endif
   1533     "pusha\n"
   1534     "mov    0x24(%esp),%esi\n"
   1535     "mov    0x28(%esp),%edx\n"
   1536     "mov    0x2c(%esp),%edi\n"
   1537     "mov    0x30(%esp),%ecx\n"
   1538     "movdqa _shuf38a ,%xmm5\n"
   1539     "movdqa _shuf38b ,%xmm6\n"
   1540     "pxor   %xmm7,%xmm7\n"
   1541 
   1542 "1:"
   1543     "movdqa (%esi),%xmm0\n"
   1544     "movdqa 0x10(%esi),%xmm1\n"
   1545     "lea    0x20(%esi),%esi\n"
   1546     "pshufb %xmm5,%xmm0\n"
   1547     "pshufb %xmm6,%xmm1\n"
   1548     "paddusb %xmm1,%xmm0\n"
   1549     "movq   %xmm0,(%edi)\n"
   1550     "movhlps %xmm0,%xmm1\n"
   1551     "movd   %xmm1,0x8(%edi)\n"
   1552     "lea    0xc(%edi),%edi\n"
   1553     "sub    $0xc,%ecx\n"
   1554     "ja     1b\n"
   1555     "popa\n"
   1556     "ret\n"
   1557 );
   1558 
   1559 extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1560                                            uint8* dst_ptr, int dst_width);
   1561   asm(
   1562     ".text\n"
   1563 #if defined(OSX)
   1564     ".globl _ScaleRowDown38_3_Int_SSSE3\n"
   1565 "_ScaleRowDown38_3_Int_SSSE3:\n"
   1566 #else
   1567     ".global ScaleRowDown38_3_Int_SSSE3\n"
   1568 "ScaleRowDown38_3_Int_SSSE3:\n"
   1569 #endif
   1570     "pusha\n"
   1571     "mov    0x24(%esp),%esi\n"
   1572     "mov    0x28(%esp),%edx\n"
   1573     "mov    0x2c(%esp),%edi\n"
   1574     "mov    0x30(%esp),%ecx\n"
   1575     "movdqa _shufac0,%xmm4\n"
   1576     "movdqa _shufac3,%xmm5\n"
   1577     "movdqa _scaleac3,%xmm6\n"
   1578     "pxor   %xmm7,%xmm7\n"
   1579 
   1580 "1:"
   1581     "movdqa (%esi),%xmm0\n"
   1582     "movdqa (%esi,%edx,1),%xmm2\n"
   1583     "movhlps %xmm0,%xmm1\n"
   1584     "movhlps %xmm2,%xmm3\n"
   1585     "punpcklbw %xmm7,%xmm0\n"
   1586     "punpcklbw %xmm7,%xmm1\n"
   1587     "punpcklbw %xmm7,%xmm2\n"
   1588     "punpcklbw %xmm7,%xmm3\n"
   1589     "paddusw %xmm2,%xmm0\n"
   1590     "paddusw %xmm3,%xmm1\n"
   1591     "movdqa (%esi,%edx,2),%xmm2\n"
   1592     "lea    0x10(%esi),%esi\n"
   1593     "movhlps %xmm2,%xmm3\n"
   1594     "punpcklbw %xmm7,%xmm2\n"
   1595     "punpcklbw %xmm7,%xmm3\n"
   1596     "paddusw %xmm2,%xmm0\n"
   1597     "paddusw %xmm3,%xmm1\n"
   1598     "movdqa %xmm0,%xmm2\n"
   1599     "psrldq $0x2,%xmm0\n"
   1600     "paddusw %xmm0,%xmm2\n"
   1601     "psrldq $0x2,%xmm0\n"
   1602     "paddusw %xmm0,%xmm2\n"
   1603     "pshufb %xmm4,%xmm2\n"
   1604     "movdqa %xmm1,%xmm3\n"
   1605     "psrldq $0x2,%xmm1\n"
   1606     "paddusw %xmm1,%xmm3\n"
   1607     "psrldq $0x2,%xmm1\n"
   1608     "paddusw %xmm1,%xmm3\n"
   1609     "pshufb %xmm5,%xmm3\n"
   1610     "paddusw %xmm3,%xmm2\n"
   1611     "pmulhuw %xmm6,%xmm2\n"
   1612     "packuswb %xmm2,%xmm2\n"
   1613     "movd   %xmm2,(%edi)\n"
   1614     "pextrw $0x2,%xmm2,%eax\n"
   1615     "mov    %ax,0x4(%edi)\n"
   1616     "lea    0x6(%edi),%edi\n"
   1617     "sub    $0x6,%ecx\n"
   1618     "ja     1b\n"
   1619     "popa\n"
   1620     "ret\n"
   1621 );
   1622 
   1623 extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1624                                            uint8* dst_ptr, int dst_width);
   1625   asm(
   1626     ".text\n"
   1627 #if defined(OSX)
   1628     ".globl _ScaleRowDown38_2_Int_SSSE3\n"
   1629 "_ScaleRowDown38_2_Int_SSSE3:\n"
   1630 #else
   1631     ".global ScaleRowDown38_2_Int_SSSE3\n"
   1632 "ScaleRowDown38_2_Int_SSSE3:\n"
   1633 #endif
   1634     "pusha\n"
   1635     "mov    0x24(%esp),%esi\n"
   1636     "mov    0x28(%esp),%edx\n"
   1637     "mov    0x2c(%esp),%edi\n"
   1638     "mov    0x30(%esp),%ecx\n"
   1639     "movdqa _shufab0,%xmm4\n"
   1640     "movdqa _shufab1,%xmm5\n"
   1641     "movdqa _shufab2,%xmm6\n"
   1642     "movdqa _scaleab2,%xmm7\n"
   1643 
   1644 "1:"
   1645     "movdqa (%esi),%xmm2\n"
   1646     "pavgb  (%esi,%edx,1),%xmm2\n"
   1647     "lea    0x10(%esi),%esi\n"
   1648     "movdqa %xmm2,%xmm0\n"
   1649     "pshufb %xmm4,%xmm0\n"
   1650     "movdqa %xmm2,%xmm1\n"
   1651     "pshufb %xmm5,%xmm1\n"
   1652     "paddusw %xmm1,%xmm0\n"
   1653     "pshufb %xmm6,%xmm2\n"
   1654     "paddusw %xmm2,%xmm0\n"
   1655     "pmulhuw %xmm7,%xmm0\n"
   1656     "packuswb %xmm0,%xmm0\n"
   1657     "movd   %xmm0,(%edi)\n"
   1658     "pextrw $0x2,%xmm0,%eax\n"
   1659     "mov    %ax,0x4(%edi)\n"
   1660     "lea    0x6(%edi),%edi\n"
   1661     "sub    $0x6,%ecx\n"
   1662     "ja     1b\n"
   1663     "popa\n"
   1664     "ret\n"
   1665 );
   1666 #endif // __PIC__
   1667 
   1668 #define HAS_SCALEADDROWS_SSE2
   1669 extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   1670                                   uint16* dst_ptr, int src_width,
   1671                                   int src_height);
   1672   asm(
   1673     ".text\n"
   1674 #if defined(OSX)
   1675     ".globl _ScaleAddRows_SSE2\n"
   1676 "_ScaleAddRows_SSE2:\n"
   1677 #else
   1678     ".global ScaleAddRows_SSE2\n"
   1679 "ScaleAddRows_SSE2:\n"
   1680 #endif
   1681     "pusha\n"
   1682     "mov    0x24(%esp),%esi\n"
   1683     "mov    0x28(%esp),%edx\n"
   1684     "mov    0x2c(%esp),%edi\n"
   1685     "mov    0x30(%esp),%ecx\n"
   1686     "mov    0x34(%esp),%ebx\n"
   1687     "pxor   %xmm7,%xmm7\n"
   1688 
   1689 "1:"
   1690     "movdqa (%esi),%xmm2\n"
   1691     "lea    (%esi,%edx,1),%eax\n"
   1692     "movhlps %xmm2,%xmm3\n"
   1693     "lea    -0x1(%ebx),%ebp\n"
   1694     "punpcklbw %xmm7,%xmm2\n"
   1695     "punpcklbw %xmm7,%xmm3\n"
   1696 
   1697 "2:"
   1698     "movdqa (%eax),%xmm0\n"
   1699     "lea    (%eax,%edx,1),%eax\n"
   1700     "movhlps %xmm0,%xmm1\n"
   1701     "punpcklbw %xmm7,%xmm0\n"
   1702     "punpcklbw %xmm7,%xmm1\n"
   1703     "paddusw %xmm0,%xmm2\n"
   1704     "paddusw %xmm1,%xmm3\n"
   1705     "sub    $0x1,%ebp\n"
   1706     "ja     2b\n"
   1707 
   1708     "movdqa %xmm2,(%edi)\n"
   1709     "movdqa %xmm3,0x10(%edi)\n"
   1710     "lea    0x20(%edi),%edi\n"
   1711     "lea    0x10(%esi),%esi\n"
   1712     "sub    $0x10,%ecx\n"
   1713     "ja     1b\n"
   1714     "popa\n"
   1715     "ret\n"
   1716 );
   1717 
   1718 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
   1719 #define HAS_SCALEFILTERROWS_SSE2
   1720 extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
   1721                                      const uint8* src_ptr, int src_stride,
   1722                                      int dst_width, int source_y_fraction);
   1723   asm(
   1724     ".text\n"
   1725 #if defined(OSX)
   1726     ".globl _ScaleFilterRows_SSE2\n"
   1727 "_ScaleFilterRows_SSE2:\n"
   1728 #else
   1729     ".global ScaleFilterRows_SSE2\n"
   1730 "ScaleFilterRows_SSE2:\n"
   1731 #endif
   1732     "push   %esi\n"
   1733     "push   %edi\n"
   1734     "mov    0xc(%esp),%edi\n"
   1735     "mov    0x10(%esp),%esi\n"
   1736     "mov    0x14(%esp),%edx\n"
   1737     "mov    0x18(%esp),%ecx\n"
   1738     "mov    0x1c(%esp),%eax\n"
   1739     "cmp    $0x0,%eax\n"
   1740     "je     2f\n"
   1741     "cmp    $0x80,%eax\n"
   1742     "je     3f\n"
   1743     "movd   %eax,%xmm6\n"
   1744     "punpcklwd %xmm6,%xmm6\n"
   1745     "pshufd $0x0,%xmm6,%xmm6\n"
   1746     "neg    %eax\n"
   1747     "add    $0x100,%eax\n"
   1748     "movd   %eax,%xmm5\n"
   1749     "punpcklwd %xmm5,%xmm5\n"
   1750     "pshufd $0x0,%xmm5,%xmm5\n"
   1751     "pxor   %xmm7,%xmm7\n"
   1752 
   1753 "1:"
   1754     "movdqa (%esi),%xmm0\n"
   1755     "movdqa (%esi,%edx,1),%xmm2\n"
   1756     "lea    0x10(%esi),%esi\n"
   1757     "movdqa %xmm0,%xmm1\n"
   1758     "movdqa %xmm2,%xmm3\n"
   1759     "punpcklbw %xmm7,%xmm0\n"
   1760     "punpcklbw %xmm7,%xmm2\n"
   1761     "punpckhbw %xmm7,%xmm1\n"
   1762     "punpckhbw %xmm7,%xmm3\n"
   1763     "pmullw %xmm5,%xmm0\n"
   1764     "pmullw %xmm5,%xmm1\n"
   1765     "pmullw %xmm6,%xmm2\n"
   1766     "pmullw %xmm6,%xmm3\n"
   1767     "paddusw %xmm2,%xmm0\n"
   1768     "paddusw %xmm3,%xmm1\n"
   1769     "psrlw  $0x8,%xmm0\n"
   1770     "psrlw  $0x8,%xmm1\n"
   1771     "packuswb %xmm1,%xmm0\n"
   1772     "movdqa %xmm0,(%edi)\n"
   1773     "lea    0x10(%edi),%edi\n"
   1774     "sub    $0x10,%ecx\n"
   1775     "ja     1b\n"
   1776     "mov    -0x1(%edi),%al\n"
   1777     "mov    %al,(%edi)\n"
   1778     "pop    %edi\n"
   1779     "pop    %esi\n"
   1780     "ret\n"
   1781 
   1782 "2:"
   1783     "movdqa (%esi),%xmm0\n"
   1784     "lea    0x10(%esi),%esi\n"
   1785     "movdqa %xmm0,(%edi)\n"
   1786     "lea    0x10(%edi),%edi\n"
   1787     "sub    $0x10,%ecx\n"
   1788     "ja     2b\n"
   1789 
   1790     "mov    -0x1(%edi),%al\n"
   1791     "mov    %al,(%edi)\n"
   1792     "pop    %edi\n"
   1793     "pop    %esi\n"
   1794     "ret\n"
   1795 
   1796 "3:"
   1797     "movdqa (%esi),%xmm0\n"
   1798     "movdqa (%esi,%edx,1),%xmm2\n"
   1799     "lea    0x10(%esi),%esi\n"
   1800     "pavgb  %xmm2,%xmm0\n"
   1801     "movdqa %xmm0,(%edi)\n"
   1802     "lea    0x10(%edi),%edi\n"
   1803     "sub    $0x10,%ecx\n"
   1804     "ja     3b\n"
   1805 
   1806     "mov    -0x1(%edi),%al\n"
   1807     "mov    %al,(%edi)\n"
   1808     "pop    %edi\n"
   1809     "pop    %esi\n"
   1810     "ret\n"
   1811 );
   1812 
   1813 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
   1814 #define HAS_SCALEFILTERROWS_SSSE3
   1815 extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
   1816                                       const uint8* src_ptr, int src_stride,
   1817                                       int dst_width, int source_y_fraction);
   1818   asm(
   1819     ".text\n"
   1820 #if defined(OSX)
   1821     ".globl _ScaleFilterRows_SSSE3\n"
   1822 "_ScaleFilterRows_SSSE3:\n"
   1823 #else
   1824     ".global ScaleFilterRows_SSSE3\n"
   1825 "ScaleFilterRows_SSSE3:\n"
   1826 #endif
   1827     "push   %esi\n"
   1828     "push   %edi\n"
   1829     "mov    0xc(%esp),%edi\n"
   1830     "mov    0x10(%esp),%esi\n"
   1831     "mov    0x14(%esp),%edx\n"
   1832     "mov    0x18(%esp),%ecx\n"
   1833     "mov    0x1c(%esp),%eax\n"
   1834     "cmp    $0x0,%eax\n"
   1835     "je     2f\n"
   1836     "cmp    $0x80,%eax\n"
   1837     "je     3f\n"
   1838     "shr    %eax\n"
   1839     "mov    %al,%ah\n"
   1840     "neg    %al\n"
   1841     "add    $0x80,%al\n"
   1842     "movd   %eax,%xmm7\n"
   1843     "punpcklwd %xmm7,%xmm7\n"
   1844     "pshufd $0x0,%xmm7,%xmm7\n"
   1845 
   1846 "1:"
   1847     "movdqa (%esi),%xmm0\n"
   1848     "movdqa (%esi,%edx,1),%xmm2\n"
   1849     "lea    0x10(%esi),%esi\n"
   1850     "movdqa %xmm0,%xmm1\n"
   1851     "punpcklbw %xmm2,%xmm0\n"
   1852     "punpckhbw %xmm2,%xmm1\n"
   1853     "pmaddubsw %xmm7,%xmm0\n"
   1854     "pmaddubsw %xmm7,%xmm1\n"
   1855     "psrlw  $0x7,%xmm0\n"
   1856     "psrlw  $0x7,%xmm1\n"
   1857     "packuswb %xmm1,%xmm0\n"
   1858     "movdqa %xmm0,(%edi)\n"
   1859     "lea    0x10(%edi),%edi\n"
   1860     "sub    $0x10,%ecx\n"
   1861     "ja     1b\n"
   1862     "mov    -0x1(%edi),%al\n"
   1863     "mov    %al,(%edi)\n"
   1864     "pop    %edi\n"
   1865     "pop    %esi\n"
   1866     "ret\n"
   1867 
   1868 "2:"
   1869     "movdqa (%esi),%xmm0\n"
   1870     "lea    0x10(%esi),%esi\n"
   1871     "movdqa %xmm0,(%edi)\n"
   1872     "lea    0x10(%edi),%edi\n"
   1873     "sub    $0x10,%ecx\n"
   1874     "ja     2b\n"
   1875     "mov    -0x1(%edi),%al\n"
   1876     "mov    %al,(%edi)\n"
   1877     "pop    %edi\n"
   1878     "pop    %esi\n"
   1879     "ret\n"
   1880 
   1881 "3:"
   1882     "movdqa (%esi),%xmm0\n"
   1883     "movdqa (%esi,%edx,1),%xmm2\n"
   1884     "lea    0x10(%esi),%esi\n"
   1885     "pavgb  %xmm2,%xmm0\n"
   1886     "movdqa %xmm0,(%edi)\n"
   1887     "lea    0x10(%edi),%edi\n"
   1888     "sub    $0x10,%ecx\n"
   1889     "ja     3b\n"
   1890     "mov    -0x1(%edi),%al\n"
   1891     "mov    %al,(%edi)\n"
   1892     "pop    %edi\n"
   1893     "pop    %esi\n"
   1894     "ret\n"
   1895 );
   1896 
   1897 #elif defined(__x86_64__)
   1898 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
   1899                                   uint8* dst_ptr, int dst_width) {
   1900   asm volatile(
   1901   "lea        (%3,%3,2),%%r10\n"
   1902   "pxor       %%xmm7,%%xmm7\n"
   1903 "1:"
   1904   "movdqa     (%0),%%xmm0\n"
   1905   "movdqa     0x10(%0),%%xmm1\n"
   1906   "movdqa     (%0,%3,1),%%xmm2\n"
   1907   "movdqa     0x10(%0,%3,1),%%xmm3\n"
   1908   "pavgb      %%xmm2,%%xmm0\n"
   1909   "pavgb      %%xmm3,%%xmm1\n"
   1910   "movdqa     (%0,%3,2),%%xmm2\n"
   1911   "movdqa     0x10(%0,%3,2),%%xmm3\n"
   1912   "movdqa     (%0,%%r10,1),%%xmm4\n"
   1913   "movdqa     0x10(%0,%%r10,1),%%xmm5\n"
   1914   "lea        (%0,%3,4),%%r11\n"
   1915   "lea        0x20(%0),%0\n"
   1916   "pavgb      %%xmm4,%%xmm2\n"
   1917   "pavgb      %%xmm5,%%xmm3\n"
   1918   "pavgb      %%xmm2,%%xmm0\n"
   1919   "pavgb      %%xmm3,%%xmm1\n"
   1920   "movdqa     0x0(%%r11),%%xmm2\n"
   1921   "movdqa     0x10(%%r11),%%xmm3\n"
   1922   "movdqa     0x0(%%r11,%3,1),%%xmm4\n"
   1923   "movdqa     0x10(%%r11,%3,1),%%xmm5\n"
   1924   "pavgb      %%xmm4,%%xmm2\n"
   1925   "pavgb      %%xmm5,%%xmm3\n"
   1926   "movdqa     0x0(%%r11,%3,2),%%xmm4\n"
   1927   "movdqa     0x10(%%r11,%3,2),%%xmm5\n"
   1928   "movdqa     0x0(%%r11,%%r10,1),%%xmm6\n"
   1929   "pavgb      %%xmm6,%%xmm4\n"
   1930   "movdqa     0x10(%%r11,%%r10,1),%%xmm6\n"
   1931   "pavgb      %%xmm6,%%xmm5\n"
   1932   "pavgb      %%xmm4,%%xmm2\n"
   1933   "pavgb      %%xmm5,%%xmm3\n"
   1934   "pavgb      %%xmm2,%%xmm0\n"
   1935   "pavgb      %%xmm3,%%xmm1\n"
   1936   "psadbw     %%xmm7,%%xmm0\n"
   1937   "psadbw     %%xmm7,%%xmm1\n"
   1938   "pshufd     $0xd8,%%xmm0,%%xmm0\n"
   1939   "pshufd     $0x8d,%%xmm1,%%xmm1\n"
   1940   "por        %%xmm1,%%xmm0\n"
   1941   "psrlw      $0x3,%%xmm0\n"
   1942   "packuswb   %%xmm0,%%xmm0\n"
   1943   "packuswb   %%xmm0,%%xmm0\n"
   1944   "movd       %%xmm0,(%1)\n"
   1945   "lea        0x4(%1),%1\n"
   1946   "sub        $0x4,%2\n"
   1947   "ja         1b\n"
   1948   : "+r"(src_ptr),     // %0
   1949     "+r"(dst_ptr),     // %1
   1950     "+r"(dst_width)    // %2
   1951   : "r"(static_cast<intptr_t>(src_stride))   // %3
   1952   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
   1953     "xmm4", "xmm5", "xmm6", "xmm7"
   1954 );
   1955 }
   1956 
   1957 #define HAS_SCALEROWDOWN34_SSSE3
   1958 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
   1959                                  uint8* dst_ptr, int dst_width) {
   1960   asm volatile(
   1961   "movdqa     (%3),%%xmm3\n"
   1962   "movdqa     (%4),%%xmm4\n"
   1963   "movdqa     (%5),%%xmm5\n"
   1964 "1:"
   1965   "movdqa     (%0),%%xmm0\n"
   1966   "movdqa     0x10(%0),%%xmm2\n"
   1967   "lea        0x20(%0),%0\n"
   1968   "movdqa     %%xmm2,%%xmm1\n"
   1969   "palignr    $0x8,%%xmm0,%%xmm1\n"
   1970   "pshufb     %%xmm3,%%xmm0\n"
   1971   "pshufb     %%xmm4,%%xmm1\n"
   1972   "pshufb     %%xmm5,%%xmm2\n"
   1973   "movq       %%xmm0,(%1)\n"
   1974   "movq       %%xmm1,0x8(%1)\n"
   1975   "movq       %%xmm2,0x10(%1)\n"
   1976   "lea        0x18(%1),%1\n"
   1977   "sub        $0x18,%2\n"
   1978   "ja         1b\n"
   1979   : "+r"(src_ptr),     // %0
   1980     "+r"(dst_ptr),     // %1
   1981     "+r"(dst_width)    // %2
   1982   : "r"(_shuf0),   // %3
   1983     "r"(_shuf1),   // %4
   1984     "r"(_shuf2)    // %5
   1985   : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1986 );
   1987 }
   1988 
   1989 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
   1990                                        uint8* dst_ptr, int dst_width) {
   1991   asm volatile(
   1992   "movdqa     (%4),%%xmm2\n"  // _shuf01
   1993   "movdqa     (%5),%%xmm3\n"  // _shuf11
   1994   "movdqa     (%6),%%xmm4\n"  // _shuf21
   1995   "movdqa     (%7),%%xmm5\n"  // _madd01
   1996   "movdqa     (%8),%%xmm6\n"  // _madd11
   1997   "movdqa     (%9),%%xmm7\n"  // _round34
   1998   "movdqa     (%10),%%xmm8\n"  // _madd21
   1999 "1:"
   2000   "movdqa     (%0),%%xmm0\n"
   2001   "movdqa     (%0,%3),%%xmm1\n"
   2002   "pavgb      %%xmm1,%%xmm0\n"
   2003   "pshufb     %%xmm2,%%xmm0\n"
   2004   "pmaddubsw  %%xmm5,%%xmm0\n"
   2005   "paddsw     %%xmm7,%%xmm0\n"
   2006   "psrlw      $0x2,%%xmm0\n"
   2007   "packuswb   %%xmm0,%%xmm0\n"
   2008   "movq       %%xmm0,(%1)\n"
   2009   "movdqu     0x8(%0),%%xmm0\n"
   2010   "movdqu     0x8(%0,%3),%%xmm1\n"
   2011   "pavgb      %%xmm1,%%xmm0\n"
   2012   "pshufb     %%xmm3,%%xmm0\n"
   2013   "pmaddubsw  %%xmm6,%%xmm0\n"
   2014   "paddsw     %%xmm7,%%xmm0\n"
   2015   "psrlw      $0x2,%%xmm0\n"
   2016   "packuswb   %%xmm0,%%xmm0\n"
   2017   "movq       %%xmm0,0x8(%1)\n"
   2018   "movdqa     0x10(%0),%%xmm0\n"
   2019   "movdqa     0x10(%0,%3),%%xmm1\n"
   2020   "lea        0x20(%0),%0\n"
   2021   "pavgb      %%xmm1,%%xmm0\n"
   2022   "pshufb     %%xmm4,%%xmm0\n"
   2023   "pmaddubsw  %%xmm8,%%xmm0\n"
   2024   "paddsw     %%xmm7,%%xmm0\n"
   2025   "psrlw      $0x2,%%xmm0\n"
   2026   "packuswb   %%xmm0,%%xmm0\n"
   2027   "movq       %%xmm0,0x10(%1)\n"
   2028   "lea        0x18(%1),%1\n"
   2029   "sub        $0x18,%2\n"
   2030   "ja         1b\n"
   2031   : "+r"(src_ptr),     // %0
   2032     "+r"(dst_ptr),     // %1
   2033     "+r"(dst_width)    // %2
   2034   : "r"(static_cast<intptr_t>(src_stride)),  // %3
   2035     "r"(_shuf01),   // %4
   2036     "r"(_shuf11),   // %5
   2037     "r"(_shuf21),   // %6
   2038     "r"(_madd01),   // %7
   2039     "r"(_madd11),   // %8
   2040     "r"(_round34),  // %9
   2041     "r"(_madd21)    // %10
   2042   : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
   2043     "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
   2044 );
   2045 }
   2046 
   2047 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
   2048                                        uint8* dst_ptr, int dst_width) {
   2049   asm volatile(
   2050   "movdqa     (%4),%%xmm2\n"  // _shuf01
   2051   "movdqa     (%5),%%xmm3\n"  // _shuf11
   2052   "movdqa     (%6),%%xmm4\n"  // _shuf21
   2053   "movdqa     (%7),%%xmm5\n"  // _madd01
   2054   "movdqa     (%8),%%xmm6\n"  // _madd11
   2055   "movdqa     (%9),%%xmm7\n"  // _round34
   2056   "movdqa     (%10),%%xmm8\n"  // _madd21
   2057 "1:"
   2058   "movdqa     (%0),%%xmm0\n"
   2059   "movdqa     (%0,%3,1),%%xmm1\n"
   2060   "pavgb      %%xmm0,%%xmm1\n"
   2061   "pavgb      %%xmm1,%%xmm0\n"
   2062   "pshufb     %%xmm2,%%xmm0\n"
   2063   "pmaddubsw  %%xmm5,%%xmm0\n"
   2064   "paddsw     %%xmm7,%%xmm0\n"
   2065   "psrlw      $0x2,%%xmm0\n"
   2066   "packuswb   %%xmm0,%%xmm0\n"
   2067   "movq       %%xmm0,(%1)\n"
   2068   "movdqu     0x8(%0),%%xmm0\n"
   2069   "movdqu     0x8(%0,%3,1),%%xmm1\n"
   2070   "pavgb      %%xmm0,%%xmm1\n"
   2071   "pavgb      %%xmm1,%%xmm0\n"
   2072   "pshufb     %%xmm3,%%xmm0\n"
   2073   "pmaddubsw  %%xmm6,%%xmm0\n"
   2074   "paddsw     %%xmm7,%%xmm0\n"
   2075   "psrlw      $0x2,%%xmm0\n"
   2076   "packuswb   %%xmm0,%%xmm0\n"
   2077   "movq       %%xmm0,0x8(%1)\n"
   2078   "movdqa     0x10(%0),%%xmm0\n"
   2079   "movdqa     0x10(%0,%3,1),%%xmm1\n"
   2080   "lea        0x20(%0),%0\n"
   2081   "pavgb      %%xmm0,%%xmm1\n"
   2082   "pavgb      %%xmm1,%%xmm0\n"
   2083   "pshufb     %%xmm4,%%xmm0\n"
   2084   "pmaddubsw  %%xmm8,%%xmm0\n"
   2085   "paddsw     %%xmm7,%%xmm0\n"
   2086   "psrlw      $0x2,%%xmm0\n"
   2087   "packuswb   %%xmm0,%%xmm0\n"
   2088   "movq       %%xmm0,0x10(%1)\n"
   2089   "lea        0x18(%1),%1\n"
   2090   "sub        $0x18,%2\n"
   2091   "ja         1b\n"
   2092   : "+r"(src_ptr),     // %0
   2093     "+r"(dst_ptr),     // %1
   2094     "+r"(dst_width)    // %2
   2095   : "r"(static_cast<intptr_t>(src_stride)),  // %3
   2096     "r"(_shuf01),   // %4
   2097     "r"(_shuf11),   // %5
   2098     "r"(_shuf21),   // %6
   2099     "r"(_madd01),   // %7
   2100     "r"(_madd11),   // %8
   2101     "r"(_round34),  // %9
   2102     "r"(_madd21)    // %10
   2103   : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
   2104     "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
   2105 );
   2106 }
   2107 
   2108 #define HAS_SCALEROWDOWN38_SSSE3
   2109 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
   2110                                  uint8* dst_ptr, int dst_width) {
   2111   asm volatile(
   2112   "movdqa     (%3),%%xmm5\n"
   2113   "movdqa     (%4),%%xmm6\n"
   2114   "pxor       %%xmm7,%%xmm7\n"
   2115 "1:"
   2116   "movdqa     (%0),%%xmm0\n"
   2117   "movdqa     0x10(%0),%%xmm1\n"
   2118   "lea        0x20(%0),%0\n"
   2119   "pshufb     %%xmm5,%%xmm0\n"
   2120   "pshufb     %%xmm6,%%xmm1\n"
   2121   "paddusb    %%xmm1,%%xmm0\n"
   2122   "movq       %%xmm0,(%1)\n"
   2123   "movhlps    %%xmm0,%%xmm1\n"
   2124   "movd       %%xmm1,0x8(%1)\n"
   2125   "lea        0xc(%1),%1\n"
   2126   "sub        $0xc,%2\n"
   2127   "ja         1b\n"
   2128   : "+r"(src_ptr),     // %0
   2129     "+r"(dst_ptr),     // %1
   2130     "+r"(dst_width)    // %2
   2131   : "r"(_shuf38a),  // %3
   2132     "r"(_shuf38b)   // %4
   2133   : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
   2134 );
   2135 }
   2136 
   2137 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
   2138                                        uint8* dst_ptr, int dst_width) {
   2139   asm volatile(
   2140   "movdqa     (%4),%%xmm4\n"
   2141   "movdqa     (%5),%%xmm5\n"
   2142   "movdqa     (%6),%%xmm6\n"
   2143   "pxor       %%xmm7,%%xmm7\n"
   2144 "1:"
   2145   "movdqa     (%0),%%xmm0\n"
   2146   "movdqa     (%0,%3,1),%%xmm2\n"
   2147   "movhlps    %%xmm0,%%xmm1\n"
   2148   "movhlps    %%xmm2,%%xmm3\n"
   2149   "punpcklbw  %%xmm7,%%xmm0\n"
   2150   "punpcklbw  %%xmm7,%%xmm1\n"
   2151   "punpcklbw  %%xmm7,%%xmm2\n"
   2152   "punpcklbw  %%xmm7,%%xmm3\n"
   2153   "paddusw    %%xmm2,%%xmm0\n"
   2154   "paddusw    %%xmm3,%%xmm1\n"
   2155   "movdqa     (%0,%3,2),%%xmm2\n"
   2156   "lea        0x10(%0),%0\n"
   2157   "movhlps    %%xmm2,%%xmm3\n"
   2158   "punpcklbw  %%xmm7,%%xmm2\n"
   2159   "punpcklbw  %%xmm7,%%xmm3\n"
   2160   "paddusw    %%xmm2,%%xmm0\n"
   2161   "paddusw    %%xmm3,%%xmm1\n"
   2162   "movdqa     %%xmm0,%%xmm2\n"
   2163   "psrldq     $0x2,%%xmm0\n"
   2164   "paddusw    %%xmm0,%%xmm2\n"
   2165   "psrldq     $0x2,%%xmm0\n"
   2166   "paddusw    %%xmm0,%%xmm2\n"
   2167   "pshufb     %%xmm4,%%xmm2\n"
   2168   "movdqa     %%xmm1,%%xmm3\n"
   2169   "psrldq     $0x2,%%xmm1\n"
   2170   "paddusw    %%xmm1,%%xmm3\n"
   2171   "psrldq     $0x2,%%xmm1\n"
   2172   "paddusw    %%xmm1,%%xmm3\n"
   2173   "pshufb     %%xmm5,%%xmm3\n"
   2174   "paddusw    %%xmm3,%%xmm2\n"
   2175   "pmulhuw    %%xmm6,%%xmm2\n"
   2176   "packuswb   %%xmm2,%%xmm2\n"
   2177   "movd       %%xmm2,(%1)\n"
   2178   "pextrw     $0x2,%%xmm2,%%eax\n"
   2179   "mov        %%ax,0x4(%1)\n"
   2180   "lea        0x6(%1),%1\n"
   2181   "sub        $0x6,%2\n"
   2182   "ja         1b\n"
   2183   : "+r"(src_ptr),     // %0
   2184     "+r"(dst_ptr),     // %1
   2185     "+r"(dst_width)    // %2
   2186   : "r"(static_cast<intptr_t>(src_stride)),  // %3
   2187     "r"(_shufac0),   // %4
   2188     "r"(_shufac3),   // %5
   2189     "r"(_scaleac3)   // %6
   2190   : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
   2191     "xmm4", "xmm5", "xmm6", "xmm7"
   2192 );
   2193 }
   2194 
   2195 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   2196                                        uint8* dst_ptr, int dst_width) {
   2197   asm volatile(
   2198   "movdqa     (%4),%%xmm4\n"
   2199   "movdqa     (%5),%%xmm5\n"
   2200   "movdqa     (%6),%%xmm6\n"
   2201   "movdqa     (%7),%%xmm7\n"
   2202 "1:"
   2203   "movdqa     (%0),%%xmm2\n"
   2204   "pavgb      (%0,%3,1),%%xmm2\n"
   2205   "lea        0x10(%0),%0\n"
   2206   "movdqa     %%xmm2,%%xmm0\n"
   2207   "pshufb     %%xmm4,%%xmm0\n"
   2208   "movdqa     %%xmm2,%%xmm1\n"
   2209   "pshufb     %%xmm5,%%xmm1\n"
   2210   "paddusw    %%xmm1,%%xmm0\n"
   2211   "pshufb     %%xmm6,%%xmm2\n"
   2212   "paddusw    %%xmm2,%%xmm0\n"
   2213   "pmulhuw    %%xmm7,%%xmm0\n"
   2214   "packuswb   %%xmm0,%%xmm0\n"
   2215   "movd       %%xmm0,(%1)\n"
   2216   "pextrw     $0x2,%%xmm0,%%eax\n"
   2217   "mov        %%ax,0x4(%1)\n"
   2218   "lea        0x6(%1),%1\n"
   2219   "sub        $0x6,%2\n"
   2220   "ja         1b\n"
   2221   : "+r"(src_ptr),     // %0
   2222     "+r"(dst_ptr),     // %1
   2223     "+r"(dst_width)    // %2
   2224   : "r"(static_cast<intptr_t>(src_stride)),  // %3
   2225     "r"(_shufab0),   // %4
   2226     "r"(_shufab1),   // %5
   2227     "r"(_shufab2),   // %6
   2228     "r"(_scaleab2)   // %7
   2229   : "memory", "rax", "xmm0", "xmm1", "xmm2",
   2230     "xmm4", "xmm5", "xmm6", "xmm7"
   2231 );
   2232 }
   2233 
   2234 #define HAS_SCALEADDROWS_SSE2
   2235 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   2236                               uint16* dst_ptr, int src_width,
   2237                               int src_height) {
   2238   asm volatile(
   2239   "pxor       %%xmm7,%%xmm7\n"
   2240 "1:"
   2241   "movdqa     (%0),%%xmm2\n"
   2242   "lea        (%0,%4,1),%%r10\n"
   2243   "movhlps    %%xmm2,%%xmm3\n"
   2244   "lea        -0x1(%3),%%r11\n"
   2245   "punpcklbw  %%xmm7,%%xmm2\n"
   2246   "punpcklbw  %%xmm7,%%xmm3\n"
   2247 
   2248 "2:"
   2249   "movdqa     (%%r10),%%xmm0\n"
   2250   "lea        (%%r10,%4,1),%%r10\n"
   2251   "movhlps    %%xmm0,%%xmm1\n"
   2252   "punpcklbw  %%xmm7,%%xmm0\n"
   2253   "punpcklbw  %%xmm7,%%xmm1\n"
   2254   "paddusw    %%xmm0,%%xmm2\n"
   2255   "paddusw    %%xmm1,%%xmm3\n"
   2256   "sub        $0x1,%%r11\n"
   2257   "ja         2b\n"
   2258 
   2259   "movdqa     %%xmm2,(%1)\n"
   2260   "movdqa     %%xmm3,0x10(%1)\n"
   2261   "lea        0x20(%1),%1\n"
   2262   "lea        0x10(%0),%0\n"
   2263   "sub        $0x10,%2\n"
   2264   "ja         1b\n"
   2265   : "+r"(src_ptr),     // %0
   2266     "+r"(dst_ptr),     // %1
   2267     "+r"(src_width),   // %2
   2268     "+r"(src_height)   // %3
   2269   : "r"(static_cast<intptr_t>(src_stride))  // %4
   2270   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
   2271 );
   2272 }
   2273 
   2274 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
   2275 #define HAS_SCALEFILTERROWS_SSE2
   2276 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
   2277                                  const uint8* src_ptr, int src_stride,
   2278                                  int dst_width, int source_y_fraction) {
   2279   if (source_y_fraction == 0) {
   2280     asm volatile(
   2281     "1:"
   2282       "movdqa     (%1),%%xmm0\n"
   2283       "lea        0x10(%1),%1\n"
   2284       "movdqa     %%xmm0,(%0)\n"
   2285       "lea        0x10(%0),%0\n"
   2286       "sub        $0x10,%2\n"
   2287       "ja         1b\n"
   2288       "mov        -0x1(%0),%%al\n"
   2289       "mov        %%al,(%0)\n"
   2290       : "+r"(dst_ptr),     // %0
   2291         "+r"(src_ptr),     // %1
   2292         "+r"(dst_width)    // %2
   2293       :
   2294       : "memory", "rax", "xmm0"
   2295     );
   2296     return;
   2297   } else if (source_y_fraction == 128) {
   2298     asm volatile(
   2299     "1:"
   2300       "movdqa     (%1),%%xmm0\n"
   2301       "movdqa     (%1,%3,1),%%xmm2\n"
   2302       "lea        0x10(%1),%1\n"
   2303       "pavgb      %%xmm2,%%xmm0\n"
   2304       "movdqa     %%xmm0,(%0)\n"
   2305       "lea        0x10(%0),%0\n"
   2306       "sub        $0x10,%2\n"
   2307       "ja         1b\n"
   2308       "mov        -0x1(%0),%%al\n"
   2309       "mov        %%al,(%0)\n"
   2310       : "+r"(dst_ptr),     // %0
   2311         "+r"(src_ptr),     // %1
   2312         "+r"(dst_width)    // %2
   2313       : "r"(static_cast<intptr_t>(src_stride))  // %3
   2314       : "memory", "rax", "xmm0", "xmm2"
   2315     );
   2316     return;
   2317   } else {
   2318     asm volatile(
   2319       "mov        %3,%%eax\n"
   2320       "movd       %%eax,%%xmm6\n"
   2321       "punpcklwd  %%xmm6,%%xmm6\n"
   2322       "pshufd     $0x0,%%xmm6,%%xmm6\n"
   2323       "neg        %%eax\n"
   2324       "add        $0x100,%%eax\n"
   2325       "movd       %%eax,%%xmm5\n"
   2326       "punpcklwd  %%xmm5,%%xmm5\n"
   2327       "pshufd     $0x0,%%xmm5,%%xmm5\n"
   2328       "pxor       %%xmm7,%%xmm7\n"
   2329     "1:"
   2330       "movdqa     (%1),%%xmm0\n"
   2331       "movdqa     (%1,%4,1),%%xmm2\n"
   2332       "lea        0x10(%1),%1\n"
   2333       "movdqa     %%xmm0,%%xmm1\n"
   2334       "movdqa     %%xmm2,%%xmm3\n"
   2335       "punpcklbw  %%xmm7,%%xmm0\n"
   2336       "punpcklbw  %%xmm7,%%xmm2\n"
   2337       "punpckhbw  %%xmm7,%%xmm1\n"
   2338       "punpckhbw  %%xmm7,%%xmm3\n"
   2339       "pmullw     %%xmm5,%%xmm0\n"
   2340       "pmullw     %%xmm5,%%xmm1\n"
   2341       "pmullw     %%xmm6,%%xmm2\n"
   2342       "pmullw     %%xmm6,%%xmm3\n"
   2343       "paddusw    %%xmm2,%%xmm0\n"
   2344       "paddusw    %%xmm3,%%xmm1\n"
   2345       "psrlw      $0x8,%%xmm0\n"
   2346       "psrlw      $0x8,%%xmm1\n"
   2347       "packuswb   %%xmm1,%%xmm0\n"
   2348       "movdqa     %%xmm0,(%0)\n"
   2349       "lea        0x10(%0),%0\n"
   2350       "sub        $0x10,%2\n"
   2351       "ja         1b\n"
   2352       "mov        -0x1(%0),%%al\n"
   2353       "mov        %%al,(%0)\n"
   2354       : "+r"(dst_ptr),     // %0
   2355         "+r"(src_ptr),     // %1
   2356         "+r"(dst_width),   // %2
   2357         "+r"(source_y_fraction)  // %3
   2358       : "r"(static_cast<intptr_t>(src_stride))  // %4
   2359       : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
   2360         "xmm5", "xmm6", "xmm7"
   2361     );
   2362   }
   2364 }
   2365 
   2366 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
   2367 #define HAS_SCALEFILTERROWS_SSSE3
   2368 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
   2369                                   const uint8* src_ptr, int src_stride,
   2370                                   int dst_width, int source_y_fraction) {
   2371   if (source_y_fraction == 0) {
   2372     asm volatile(
    2373     "1:"
   2374       "movdqa     (%1),%%xmm0\n"
   2375       "lea        0x10(%1),%1\n"
   2376       "movdqa     %%xmm0,(%0)\n"
   2377       "lea        0x10(%0),%0\n"
   2378       "sub        $0x10,%2\n"
   2379       "ja         1b\n"
   2380       "mov        -0x1(%0),%%al\n"
   2381       "mov        %%al,(%0)\n"
   2382       : "+r"(dst_ptr),     // %0
   2383         "+r"(src_ptr),     // %1
   2384         "+r"(dst_width)    // %2
   2385       :
   2386       : "memory", "rax", "xmm0"
   2387     );
   2388     return;
   2389   } else if (source_y_fraction == 128) {
   2390     asm volatile(
   2391     "1:"
   2392       "movdqa     (%1),%%xmm0\n"
   2393       "movdqa     (%1,%3,1),%%xmm2\n"
   2394       "lea        0x10(%1),%1\n"
   2395       "pavgb      %%xmm2,%%xmm0\n"
   2396       "movdqa     %%xmm0,(%0)\n"
   2397       "lea        0x10(%0),%0\n"
   2398       "sub        $0x10,%2\n"
   2399       "ja         1b\n"
   2400       "mov        -0x1(%0),%%al\n"
   2401       "mov        %%al,(%0)\n"
   2402       : "+r"(dst_ptr),     // %0
   2403         "+r"(src_ptr),     // %1
   2404         "+r"(dst_width)    // %2
   2405       : "r"(static_cast<intptr_t>(src_stride))  // %3
    2406       : "memory", "rax", "xmm0", "xmm2"
   2407     );
   2408     return;
   2409   } else {
   2410     asm volatile(
   2411       "mov        %3,%%eax\n"
   2412       "shr        %%eax\n"
   2413       "mov        %%al,%%ah\n"
   2414       "neg        %%al\n"
   2415       "add        $0x80,%%al\n"
   2416       "movd       %%eax,%%xmm7\n"
   2417       "punpcklwd  %%xmm7,%%xmm7\n"
   2418       "pshufd     $0x0,%%xmm7,%%xmm7\n"
   2419     "1:"
   2420       "movdqa     (%1),%%xmm0\n"
   2421       "movdqa     (%1,%4,1),%%xmm2\n"
   2422       "lea        0x10(%1),%1\n"
   2423       "movdqa     %%xmm0,%%xmm1\n"
   2424       "punpcklbw  %%xmm2,%%xmm0\n"
   2425       "punpckhbw  %%xmm2,%%xmm1\n"
   2426       "pmaddubsw  %%xmm7,%%xmm0\n"
   2427       "pmaddubsw  %%xmm7,%%xmm1\n"
   2428       "psrlw      $0x7,%%xmm0\n"
   2429       "psrlw      $0x7,%%xmm1\n"
   2430       "packuswb   %%xmm1,%%xmm0\n"
   2431       "movdqa     %%xmm0,(%0)\n"
   2432       "lea        0x10(%0),%0\n"
   2433       "sub        $0x10,%2\n"
   2434       "ja         1b\n"
   2435       "mov        -0x1(%0),%%al\n"
   2436       "mov        %%al,(%0)\n"
   2437       : "+r"(dst_ptr),     // %0
   2438         "+r"(src_ptr),     // %1
   2439         "+r"(dst_width),   // %2
   2440         "+r"(source_y_fraction)  // %3
   2441       : "r"(static_cast<intptr_t>(src_stride))  // %4
   2442       : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
   2443     );
   2444   }
   2446 }
   2447 #endif
   2448 #endif
   2449 
    2450 // CPU-agnostic row functions
   2451 static void ScaleRowDown2_C(const uint8* src_ptr, int,
   2452                             uint8* dst, int dst_width) {
   2453   for (int x = 0; x < dst_width; ++x) {
   2454     *dst++ = *src_ptr;
   2455     src_ptr += 2;
   2456   }
   2457 }
   2458 
   2459 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
   2460                                uint8* dst, int dst_width) {
   2461   for (int x = 0; x < dst_width; ++x) {
   2462     *dst++ = (src_ptr[0] + src_ptr[1] +
   2463               src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
   2464     src_ptr += 2;
   2465   }
   2466 }
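// Worked example (illustrative values, not from the source): averaging the
// 2x2 box {10, 11, 14, 16} gives (10 + 11 + 14 + 16 + 2) >> 2 = 13; the
// "+ 2" term rounds the average to nearest instead of truncating (which
// would give 12).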
   2467 
   2468 static void ScaleRowDown4_C(const uint8* src_ptr, int,
   2469                             uint8* dst, int dst_width) {
   2470   for (int x = 0; x < dst_width; ++x) {
   2471     *dst++ = *src_ptr;
   2472     src_ptr += 4;
   2473   }
   2474 }
   2475 
   2476 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
   2477                                uint8* dst, int dst_width) {
   2478   for (int x = 0; x < dst_width; ++x) {
   2479     *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
   2480               src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
   2481               src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
   2482               src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
   2483               src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
   2484               src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
   2485               src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
   2486               8) >> 4;
   2487     src_ptr += 4;
   2488   }
   2489 }
   2490 
    2491 // 640 output pixels is enough to allow 5120 input pixels at 1/8 scale down.
    2492 // Keeping the total buffer under 4096 bytes avoids a stack check, saving ~4% CPU.
   2493 static const int kMaxOutputWidth = 640;
   2494 static const int kMaxRow12 = kMaxOutputWidth * 2;
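// Sanity check on the constants above: 640 outputs * 8 inputs each = 5120
// input pixels, and the src_row scratch buffer in ScaleRowDown8Int_C below
// is kMaxRow12 * 2 = 2560 bytes, under the 4096-byte stack-check limit.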
   2495 
   2496 static void ScaleRowDown8_C(const uint8* src_ptr, int,
   2497                             uint8* dst, int dst_width) {
   2498   for (int x = 0; x < dst_width; ++x) {
   2499     *dst++ = *src_ptr;
   2500     src_ptr += 8;
   2501   }
   2502 }
   2503 
    2504 // Note: the calling code checks that dst_width is at most kMaxOutputWidth
    2505 // and, if it is larger, uses ScaleRowDown8_C instead.
   2506 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
   2507                                uint8* dst, int dst_width) {
   2508   ALIGN16(uint8 src_row[kMaxRow12 * 2]);
   2509   assert(dst_width <= kMaxOutputWidth);
   2510   ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
   2511   ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
   2512                      src_row + kMaxOutputWidth,
   2513                      dst_width * 2);
   2514   ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
   2515 }
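// The 1/8 box filter above is composed from existing kernels: two 1/4-scale
// passes reduce source rows 0-3 and 4-7 to two intermediate rows of 4x4 box
// averages, and a final 1/2-scale pass averages those, so each output pixel
// is (up to intermediate rounding) the mean of an 8x8 source box.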
   2516 
   2517 static void ScaleRowDown34_C(const uint8* src_ptr, int,
   2518                              uint8* dst, int dst_width) {
   2519   assert((dst_width % 3 == 0) && (dst_width > 0));
   2520   uint8* dend = dst + dst_width;
   2521   do {
   2522     dst[0] = src_ptr[0];
   2523     dst[1] = src_ptr[1];
   2524     dst[2] = src_ptr[3];
   2525     dst += 3;
   2526     src_ptr += 4;
   2527   } while (dst < dend);
   2528 }
   2529 
   2530 // Filter rows 0 and 1 together, 3 : 1
   2531 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
   2532                                    uint8* d, int dst_width) {
   2533   assert((dst_width % 3 == 0) && (dst_width > 0));
   2534   uint8* dend = d + dst_width;
   2535   const uint8* s = src_ptr;
   2536   const uint8* t = src_ptr + src_stride;
   2537   do {
   2538     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2539     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2540     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2541     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
   2542     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
   2543     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
   2544     d[0] = (a0 * 3 + b0 + 2) >> 2;
   2545     d[1] = (a1 * 3 + b1 + 2) >> 2;
   2546     d[2] = (a2 * 3 + b2 + 2) >> 2;
   2547     d += 3;
   2548     s += 4;
   2549     t += 4;
   2550   } while (d < dend);
   2551 }
   2552 
   2553 // Filter rows 1 and 2 together, 1 : 1
   2554 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
   2555                                    uint8* d, int dst_width) {
   2556   assert((dst_width % 3 == 0) && (dst_width > 0));
   2557   uint8* dend = d + dst_width;
   2558   const uint8* s = src_ptr;
   2559   const uint8* t = src_ptr + src_stride;
   2560   do {
   2561     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2562     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2563     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2564     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
   2565     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
   2566     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
   2567     d[0] = (a0 + b0 + 1) >> 1;
   2568     d[1] = (a1 + b1 + 1) >> 1;
   2569     d[2] = (a2 + b2 + 1) >> 1;
   2570     d += 3;
   2571     s += 4;
   2572     t += 4;
   2573   } while (d < dend);
   2574 }
   2575 
   2576 #if defined(HAS_SCALEFILTERROWS_SSE2)
   2577 // Filter row to 3/4
   2578 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
   2579                                 int dst_width) {
   2580   assert((dst_width % 3 == 0) && (dst_width > 0));
   2581   uint8* dend = dst_ptr + dst_width;
   2582   const uint8* s = src_ptr;
   2583   do {
   2584     dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2585     dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2586     dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2587     dst_ptr += 3;
   2588     s += 4;
   2589   } while (dst_ptr < dend);
   2590 }
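// The three taps above implement fixed 3/4 horizontal scaling: output 0
// blends source pixels 0 and 1 with weights 3/4 and 1/4, output 1 blends
// pixels 1 and 2 equally, and output 2 blends pixels 2 and 3 with weights
// 1/4 and 3/4. Worked example: s = {0, 4, 8, 12} produces {1, 6, 11}.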
   2591 #endif
   2592 
   2593 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
   2594                               int dst_width, int dx) {
   2595   int x = 0;
   2596   for (int j = 0; j < dst_width; ++j) {
   2597     int xi = x >> 16;
   2598     int xf1 = x & 0xffff;
   2599     int xf0 = 65536 - xf1;
   2600 
   2601     *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
   2602     x += dx;
   2603   }
   2604 }
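// dx above is a 16.16 fixed-point horizontal step. Illustrative example
// (assuming the caller sets dx = (src_width << 16) / dst_width, as the
// plane scalers below do): scaling 640 -> 480 gives dx = 0x15555 (~1.333),
// so outputs sample source positions 0, 1.33, 2.67, ... and blend the two
// neighboring pixels by the fractional part.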
   2605 
   2606 static const int kMaxInputWidth = 2560;
   2607 #if defined(HAS_SCALEFILTERROWS_SSE2)
   2608 #define HAS_SCALEROWDOWN34_SSE2
   2609 // Filter rows 0 and 1 together, 3 : 1
   2610 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
   2611                                       uint8* dst_ptr, int dst_width) {
   2612   assert((dst_width % 3 == 0) && (dst_width > 0));
   2613   ALIGN16(uint8 row[kMaxInputWidth]);
   2614   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3,
   2615                        256 / 4);
   2616   ScaleFilterCols34_C(dst_ptr, row, dst_width);
   2617 }
   2618 
   2619 // Filter rows 1 and 2 together, 1 : 1
   2620 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
   2621                                       uint8* dst_ptr, int dst_width) {
   2622   assert((dst_width % 3 == 0) && (dst_width > 0));
   2623   ALIGN16(uint8 row[kMaxInputWidth]);
   2624   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
   2625   ScaleFilterCols34_C(dst_ptr, row, dst_width);
   2626 }
   2627 #endif
   2628 
   2629 static void ScaleRowDown38_C(const uint8* src_ptr, int,
   2630                              uint8* dst, int dst_width) {
   2631   assert(dst_width % 3 == 0);
   2632   for (int x = 0; x < dst_width; x += 3) {
   2633     dst[0] = src_ptr[0];
   2634     dst[1] = src_ptr[3];
   2635     dst[2] = src_ptr[6];
   2636     dst += 3;
   2637     src_ptr += 8;
   2638   }
   2639 }
   2640 
   2641 // 8x3 -> 3x1
   2642 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
   2643                                    uint8* dst_ptr, int dst_width) {
   2644   assert((dst_width % 3 == 0) && (dst_width > 0));
    2645   for (int i = 0; i < dst_width; i += 3) {
   2646     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
   2647         src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
   2648         src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
   2649         src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
   2650         (65536 / 9) >> 16;
   2651     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
   2652         src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
   2653         src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
   2654         src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
   2655         (65536 / 9) >> 16;
   2656     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
   2657         src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
   2658         src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
   2659         (65536 / 6) >> 16;
   2660     src_ptr += 8;
   2661     dst_ptr += 3;
   2662   }
   2663 }
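// The multiply by (65536 / 9) followed by >> 16 above is a reciprocal
// multiply: it divides the nine-pixel sum by 9 with no integer division.
// Worked example: a sum of 900 gives 900 * 7281 >> 16 = 99, so the result
// may round down by one relative to exact division (900 / 9 = 100).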
   2664 
   2665 // 8x2 -> 3x1
   2666 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
   2667                                    uint8* dst_ptr, int dst_width) {
   2668   assert((dst_width % 3 == 0) && (dst_width > 0));
    2669   for (int i = 0; i < dst_width; i += 3) {
   2670     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
   2671         src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
   2672         src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
   2673     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
   2674         src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
   2675         src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
   2676     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
   2677         src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
   2678         (65536 / 4) >> 16;
   2679     src_ptr += 8;
   2680     dst_ptr += 3;
   2681   }
   2682 }
   2683 
   2684 // C version 8x2 -> 8x1
   2685 static void ScaleFilterRows_C(uint8* dst_ptr,
   2686                               const uint8* src_ptr, int src_stride,
   2687                               int dst_width, int source_y_fraction) {
   2688   assert(dst_width > 0);
   2689   int y1_fraction = source_y_fraction;
   2690   int y0_fraction = 256 - y1_fraction;
   2691   const uint8* src_ptr1 = src_ptr + src_stride;
   2692   uint8* end = dst_ptr + dst_width;
   2693   do {
   2694     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
   2695     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
   2696     dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
   2697     dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
   2698     dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
   2699     dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
   2700     dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
   2701     dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
   2702     src_ptr += 8;
   2703     src_ptr1 += 8;
   2704     dst_ptr += 8;
   2705   } while (dst_ptr < end);
   2706   dst_ptr[0] = dst_ptr[-1];
   2707 }
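// Worked example (illustrative): with source_y_fraction = 64, each output
// is (192 * row0 + 64 * row1) >> 8, e.g. pixels 100 and 200 blend to 125.
// The trailing dst_ptr[0] = dst_ptr[-1] duplicates the final pixel,
// matching the SSE2/SSSE3 versions, presumably so a following horizontal
// filter can safely read one pixel past dst_width.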
   2708 
   2709 void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
   2710                     uint16* dst_ptr, int src_width, int src_height) {
   2711   assert(src_width > 0);
   2712   assert(src_height > 0);
   2713   for (int x = 0; x < src_width; ++x) {
   2714     const uint8* s = src_ptr + x;
   2715     int sum = 0;
   2716     for (int y = 0; y < src_height; ++y) {
   2717       sum += s[0];
   2718       s += src_stride;
   2719     }
   2720     dst_ptr[x] = sum;
   2721   }
   2722 }
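// Note: the accumulators are uint16, so the summed box height is bounded:
// 255 * src_height must fit in 65535, i.e. up to 257 rows can be added per
// column before the sums could wrap.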
   2723 
   2724 /**
   2725  * Scale plane, 1/2
   2726  *
   2727  * This is an optimized version for scaling down a plane to 1/2 of
   2728  * its original size.
   2729  *
   2730  */
   2731 static void ScalePlaneDown2(int src_width, int src_height,
   2732                             int dst_width, int dst_height,
   2733                             int src_stride, int dst_stride,
   2734                             const uint8* src_ptr, uint8* dst_ptr,
   2735                             FilterMode filtering) {
   2736   assert(src_width % 2 == 0);
   2737   assert(src_height % 2 == 0);
   2738   void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
   2739                         uint8* dst_ptr, int dst_width);
   2740 
   2741 #if defined(HAS_SCALEROWDOWN2_NEON)
   2742   if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
   2743       (dst_width % 16 == 0) && (src_stride % 16 == 0) &&
   2744       (dst_stride % 16 == 0) &&
   2745       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
   2746     ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
   2747   } else
   2748 #endif
   2749 #if defined(HAS_SCALEROWDOWN2_SSE2)
   2750   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
   2751       (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
   2752       IS_ALIGNED(dst_ptr, 16)) {
   2753     ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
   2754   } else
   2755 #endif
   2756   {
   2757     ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
   2758   }
   2759 
   2760   for (int y = 0; y < dst_height; ++y) {
   2761     ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
   2762     src_ptr += (src_stride << 1);
   2763     dst_ptr += dst_stride;
   2764   }
   2765 }
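// Usage sketch (hypothetical values; this is a static helper reached via
// the public scale entry points rather than called directly): halving a
// 640x360 plane would be
//   ScalePlaneDown2(640, 360, 320, 180, 640, 320, src, dst, kFilterBox);
// one ScaleRowDown2 call per destination row, with src_ptr advancing two
// source rows (src_stride << 1) per iteration.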
   2766 
   2767 /**
   2768  * Scale plane, 1/4
   2769  *
   2770  * This is an optimized version for scaling down a plane to 1/4 of
   2771  * its original size.
   2772  */
   2773 static void ScalePlaneDown4(int src_width, int src_height,
   2774                             int dst_width, int dst_height,
   2775                             int src_stride, int dst_stride,
   2776                             const uint8* src_ptr, uint8* dst_ptr,
   2777                             FilterMode filtering) {
   2778   assert(src_width % 4 == 0);
   2779   assert(src_height % 4 == 0);
   2780   void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
   2781                         uint8* dst_ptr, int dst_width);
   2782 
   2783 #if defined(HAS_SCALEROWDOWN4_NEON)
   2784   if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
   2785       (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
   2786       IS_ALIGNED(src_ptr, 8)) {
   2787     ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
   2788   } else
   2789 #endif
   2790 #if defined(HAS_SCALEROWDOWN4_SSE2)
   2791   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
   2792       (dst_width % 8 == 0) && (src_stride % 16 == 0) &&
   2793       (dst_stride % 8 == 0) &&
   2794       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
   2795     ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
   2796   } else
   2797 #endif
   2798   {
   2799     ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
   2800   }
   2801 
   2802   for (int y = 0; y < dst_height; ++y) {
   2803     ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
   2804     src_ptr += (src_stride << 2);
   2805     dst_ptr += dst_stride;
   2806   }
   2807 }
   2808 
   2809 /**
   2810  * Scale plane, 1/8
   2811  *
   2812  * This is an optimized version for scaling down a plane to 1/8
   2813  * of its original size.
   2814  *
   2815  */
   2816 static void ScalePlaneDown8(int src_width, int src_height,
   2817                             int dst_width, int dst_height,
   2818                             int src_stride, int dst_stride,
   2819                             const uint8* src_ptr, uint8* dst_ptr,
   2820                             FilterMode filtering) {
   2821   assert(src_width % 8 == 0);
   2822   assert(src_height % 8 == 0);
   2823   void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
   2824                         uint8* dst_ptr, int dst_width);
   2825 #if defined(HAS_SCALEROWDOWN8_SSE2)
   2826   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
   2827       (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
   2828       (src_stride % 16 == 0) && (dst_stride % 16 == 0) &&
   2829       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
   2830     ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
   2831   } else
   2832 #endif
   2833   {
   2834     ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
   2835         ScaleRowDown8Int_C : ScaleRowDown8_C;
   2836   }
   2837   for (int y = 0; y < dst_height; ++y) {
   2838     ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
   2839     src_ptr += (src_stride << 3);
   2840     dst_ptr += dst_stride;
   2841   }
   2842 }
   2843 
   2844 /**
   2845  * Scale plane down, 3/4
   2846  *
   2847  * Provided by Frank Barchard (fbarchard (at) google.com)
   2848  *
   2849  */
   2850 static void ScalePlaneDown34(int src_width, int src_height,
   2851                              int dst_width, int dst_height,
   2852                              int src_stride, int dst_stride,
   2853                              const uint8* src_ptr, uint8* dst_ptr,
   2854                              FilterMode filtering) {
   2855   assert(dst_width % 3 == 0);
   2856   void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
   2857                            uint8* dst_ptr, int dst_width);
   2858   void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
   2859                            uint8* dst_ptr, int dst_width);
   2860 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   2861   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
   2862       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
   2863       (dst_stride % 8 == 0) &&
   2864       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
   2865     if (!filtering) {
   2866       ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
   2867       ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
   2868     } else {
   2869       ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
   2870       ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
   2871     }
   2872   } else
   2873 #endif
   2874 #if defined(HAS_SCALEROWDOWN34_SSE2)
   2875   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
   2876       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
   2877       (dst_stride % 8 == 0) &&
   2878       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
   2879       filtering) {
   2880     ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
   2881     ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
   2882   } else
   2883 #endif
   2884   {
   2885     if (!filtering) {
   2886       ScaleRowDown34_0 = ScaleRowDown34_C;
   2887       ScaleRowDown34_1 = ScaleRowDown34_C;
   2888     } else {
   2889       ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
   2890       ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
   2891     }
   2892   }
   2893   int src_row = 0;
   2894   for (int y = 0; y < dst_height; ++y) {
   2895     switch (src_row) {
   2896       case 0:
   2897         ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
   2898         break;
   2899 
   2900       case 1:
   2901         ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
   2902         break;
   2903 
   2904       case 2:
   2905         ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
   2906                          dst_ptr, dst_width);
   2907         break;
   2908     }
   2909     ++src_row;
   2910     src_ptr += src_stride;
   2911     dst_ptr += dst_stride;
   2912     if (src_row >= 3) {
   2913       src_ptr += src_stride;
   2914       src_row = 0;
   2915     }
   2916   }
   2917 }
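// The loop above walks a 3-output-row cycle over every 4 source rows:
// dst row 0 filters src rows 0 and 1 (3:1), dst row 1 filters src rows 1
// and 2 (1:1), and dst row 2 filters src rows 3 and 2 (3:1, via the
// negative stride), after which src_ptr skips one extra row to begin the
// next group of four.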
   2918 
   2919 /**
   2920  * Scale plane, 3/8
   2921  *
   2922  * This is an optimized version for scaling down a plane to 3/8
   2923  * of its original size.
   2924  *
   2925  * Reduces 16x3 to 6x1
   2926  */
   2927 static void ScalePlaneDown38(int src_width, int src_height,
   2928                              int dst_width, int dst_height,
   2929                              int src_stride, int dst_stride,
   2930                              const uint8* src_ptr, uint8* dst_ptr,
   2931                              FilterMode filtering) {
   2932   assert(dst_width % 3 == 0);
   2933   void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
   2934                            uint8* dst_ptr, int dst_width);
   2935   void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
   2936                            uint8* dst_ptr, int dst_width);
   2937 #if defined(HAS_SCALEROWDOWN38_SSSE3)
   2938   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
   2939       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
   2940       (dst_stride % 8 == 0) &&
   2941       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
   2942     if (!filtering) {
   2943       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
   2944       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
   2945     } else {
   2946       ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
   2947       ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
   2948     }
   2949   } else
   2950 #endif
   2951   {
   2952     if (!filtering) {
   2953       ScaleRowDown38_3 = ScaleRowDown38_C;
   2954       ScaleRowDown38_2 = ScaleRowDown38_C;
   2955     } else {
   2956       ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
   2957       ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
   2958     }
   2959   }
   2960   int src_row = 0;
   2961   for (int y = 0; y < dst_height; ++y) {
   2962     switch (src_row) {
   2963       case 0:
   2964       case 1:
   2965         ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
   2966         src_ptr += src_stride * 3;
   2967         ++src_row;
   2968         break;
   2969 
   2970       case 2:
   2971         ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
   2972         src_ptr += src_stride * 2;
   2973         src_row = 0;
   2974         break;
   2975     }
   2976     dst_ptr += dst_stride;
   2977   }
   2978 }
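// Net effect of the cycle above: every 8 source rows yield 3 destination
// rows (3 + 3 + 2), matching the 3/8 vertical ratio, while each row
// function reduces 8 source columns to 3 destination columns.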

inline static uint32 SumBox(int iboxwidth, int iboxheight,
                            int src_stride, const uint8* src_ptr) {
  assert(iboxwidth > 0);
  assert(iboxheight > 0);
  uint32 sum = 0u;
  for (int y = 0; y < iboxheight; ++y) {
    for (int x = 0; x < iboxwidth; ++x) {
      sum += src_ptr[x];
    }
    src_ptr += src_stride;
  }
  return sum;
}

static void ScalePlaneBoxRow(int dst_width, int boxheight,
                             int dx, int src_stride,
                             const uint8* src_ptr, uint8* dst_ptr) {
  int x = 0;
  for (int i = 0; i < dst_width; ++i) {
    int ix = x >> 16;
    x += dx;
    int boxwidth = (x >> 16) - ix;
    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
        (boxwidth * boxheight);
  }
}
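
// Worked example for SumBox: with a 4x1 box (boxwidth 4, boxheight 1), source
// bytes {1, 2, 3, 4} sum to 10, and ScalePlaneBoxRow divides by the box area
// (4 * 1) to emit 10 / 4 = 2 (integer division truncates).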

inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
  assert(iboxwidth > 0);
  uint32 sum = 0u;
  for (int x = 0; x < iboxwidth; ++x) {
    sum += src_ptr[x];
  }
  return sum;
}

static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
                            const uint16* src_ptr, uint8* dst_ptr) {
  int scaletbl[2];
  int minboxwidth = (dx >> 16);
  scaletbl[0] = 65536 / (minboxwidth * boxheight);
  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
  int* scaleptr = scaletbl - minboxwidth;
  int x = 0;
  for (int i = 0; i < dst_width; ++i) {
    int ix = x >> 16;
    x += dx;
    int boxwidth = (x >> 16) - ix;
    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
  }
}
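
// The biased pointer above is a lookup trick: because dx has a nonzero
// fractional part, boxwidth is always minboxwidth or minboxwidth + 1, so with
// scaleptr = scaletbl - minboxwidth, the expression scaleptr[boxwidth]
// selects scaletbl[0] or scaletbl[1] without a subtraction in the loop.
// A hypothetical unbiased equivalent of the loop body, for clarity only:
//
//   *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
//       scaletbl[boxwidth - minboxwidth] >> 16;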

static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
                            const uint16* src_ptr, uint8* dst_ptr) {
  int boxwidth = (dx >> 16);
  int scaleval = 65536 / (boxwidth * boxheight);
  int x = 0;
  for (int i = 0; i < dst_width; ++i) {
    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
    x += boxwidth;
  }
}

/**
 * Scale plane down to any dimensions, with interpolation
 * (boxfilter).
 *
 * Same method as SimpleScale: it outputs one destination pixel
 * at a time, using fixed point (16.16) to step through the
 * source, but averages a box of source pixels with simple
 * averaging instead of point sampling.
 */
static void ScalePlaneBox(int src_width, int src_height,
                          int dst_width, int dst_height,
                          int src_stride, int dst_stride,
                          const uint8* src_ptr, uint8* dst_ptr) {
  assert(dst_width > 0);
  assert(dst_height > 0);
  int dy = (src_height << 16) / dst_height;
  int dx = (src_width << 16) / dst_width;
  if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
      dst_height * 2 > src_height) {
    uint8* dst = dst_ptr;
    int y = 0;
    for (int j = 0; j < dst_height; ++j) {
      int iy = y >> 16;
      const uint8* const src = src_ptr + iy * src_stride;
      y += dy;
      if (y > (src_height << 16)) {
        y = (src_height << 16);
      }
      int boxheight = (y >> 16) - iy;
      ScalePlaneBoxRow(dst_width, boxheight,
                       dx, src_stride,
                       src, dst);
      dst += dst_stride;
    }
  } else {
    ALIGN16(uint16 row[kMaxInputWidth]);
    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
                         uint16* dst_ptr, int src_width, int src_height);
    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
                         const uint16* src_ptr, uint8* dst_ptr);
#if defined(HAS_SCALEADDROWS_SSE2)
    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
        (src_width % 16) == 0) {
      ScaleAddRows = ScaleAddRows_SSE2;
    } else
#endif
    {
      ScaleAddRows = ScaleAddRows_C;
    }
    if (dx & 0xffff) {
      ScaleAddCols = ScaleAddCols2_C;
    } else {
      ScaleAddCols = ScaleAddCols1_C;
    }

    int y = 0;
    for (int j = 0; j < dst_height; ++j) {
      int iy = y >> 16;
      const uint8* const src = src_ptr + iy * src_stride;
      y += dy;
      if (y > (src_height << 16)) {
        y = (src_height << 16);
      }
      int boxheight = (y >> 16) - iy;
      ScaleAddRows(src, src_stride, row, src_width, boxheight);
      ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
      dst_ptr += dst_stride;
    }
  }
}
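
// A minimal sketch (hypothetical helper, mirroring the loops above) of how
// the 16.16 stepping derives each box height: the integer parts of successive
// y values bound the source rows contributing to one output row.
inline static int BoxHeightAt(int y, int dy, int src_height) {
  int iy = y >> 16;             // first source row of this box
  int y_next = y + dy;
  if (y_next > (src_height << 16)) {
    y_next = src_height << 16;  // clamp the final box to the plane
  }
  return (y_next >> 16) - iy;   // number of source rows in this box
}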

/**
 * Scale plane to/from any dimensions, with interpolation.
 */
static void ScalePlaneBilinearSimple(int src_width, int src_height,
                                     int dst_width, int dst_height,
                                     int src_stride, int dst_stride,
                                     const uint8* src_ptr, uint8* dst_ptr) {
  uint8* dst = dst_ptr;
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
  int maxx = ((src_width - 1) << 16) - 1;
  int maxy = ((src_height - 1) << 16) - 1;
  int y = (dst_height < src_height) ? 32768 :
      (src_height << 16) / dst_height - 32768;
  for (int i = 0; i < dst_height; ++i) {
    int cy = (y < 0) ? 0 : y;
    int yi = cy >> 16;
    int yf = cy & 0xffff;
    const uint8* const src = src_ptr + yi * src_stride;
    int x = (dst_width < src_width) ? 32768 :
        (src_width << 16) / dst_width - 32768;
    for (int j = 0; j < dst_width; ++j) {
      int cx = (x < 0) ? 0 : x;
      int xi = cx >> 16;
      int xf = cx & 0xffff;
      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
      int r1 = (src[xi + src_stride] * (65536 - xf) +
          src[xi + src_stride + 1] * xf) >> 16;
      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
      x += dx;
      if (x > maxx)
        x = maxx;
    }
    dst += dst_stride - dst_width;
    y += dy;
    if (y > maxy)
      y = maxy;
  }
}
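
// The inner loop above is standard bilinear weighting in 16.16 fixed point.
// With horizontal fraction xf and vertical fraction yf:
//
//   r0  = (src[y    ][xi] * (65536 - xf) + src[y    ][xi + 1] * xf) >> 16
//   r1  = (src[y + 1][xi] * (65536 - xf) + src[y + 1][xi + 1] * xf) >> 16
//   out = (r0 * (65536 - yf) + r1 * yf) >> 16
//
// For example, a sample centered between four neighbors (xf = yf = 32768)
// averages all four with equal weight 1/4.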

/**
 * Scale plane to/from any dimensions, with bilinear
 * interpolation.
 */
static void ScalePlaneBilinear(int src_width, int src_height,
                               int dst_width, int dst_height,
                               int src_stride, int dst_stride,
                               const uint8* src_ptr, uint8* dst_ptr) {
  assert(dst_width > 0);
  assert(dst_height > 0);
  int dy = (src_height << 16) / dst_height;
  int dx = (src_width << 16) / dst_width;
  if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
                             src_stride, dst_stride, src_ptr, dst_ptr);
  } else {
    ALIGN16(uint8 row[kMaxInputWidth + 1]);
    void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
                            int src_stride,
                            int dst_width, int source_y_fraction);
    void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int dx);
#if defined(HAS_SCALEFILTERROWS_SSSE3)
    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
        (src_width % 16) == 0) {
      ScaleFilterRows = ScaleFilterRows_SSSE3;
    } else
#endif
#if defined(HAS_SCALEFILTERROWS_SSE2)
    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
        (src_width % 16) == 0) {
      ScaleFilterRows = ScaleFilterRows_SSE2;
    } else
#endif
    {
      ScaleFilterRows = ScaleFilterRows_C;
    }
    ScaleFilterCols = ScaleFilterCols_C;

    int y = 0;
    int maxy = ((src_height - 1) << 16) - 1;  // max is filter of last 2 rows.
    for (int j = 0; j < dst_height; ++j) {
      int iy = y >> 16;
      int fy = (y >> 8) & 255;
      const uint8* const src = src_ptr + iy * src_stride;
      ScaleFilterRows(row, src, src_stride, src_width, fy);
      ScaleFilterCols(dst_ptr, row, dst_width, dx);
      dst_ptr += dst_stride;
      y += dy;
      if (y > maxy) {
        y = maxy;
      }
    }
  }
}
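
// Design note: the filter above is separable. ScaleFilterRows first blends
// two adjacent source rows into the scratch 'row' using the 8-bit vertical
// fraction fy = (y >> 8) & 255, then ScaleFilterCols resamples that one row
// horizontally. For example, y = 0x18000 (source position 1.5) gives iy = 1
// and fy = 128, an equal blend of source rows 1 and 2.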

/**
 * Scale plane to/from any dimensions, without interpolation.
 * Fixed point math is used for performance: the upper 16 bits
 * of x and dx are the integer part of the source position and
 * the lower 16 bits are the fixed-point fractional part.
 */
static void ScalePlaneSimple(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr) {
  uint8* dst = dst_ptr;
  int dx = (src_width << 16) / dst_width;
  for (int y = 0; y < dst_height; ++y) {
    const uint8* const src = src_ptr + (y * src_height / dst_height) *
        src_stride;
    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
    int x = 0;
    for (int i = 0; i < dst_width; ++i) {
      *dst++ = src[x >> 16];
      x += dx;
    }
    dst += dst_stride - dst_width;
  }
}
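
// Worked example of the fixed-point stepping above: scaling a 640-pixel row
// down to 480 gives dx = (640 << 16) / 480 = 0x15555, so x >> 16 visits
// source columns 0, 1, 2, 3, 5, 6, 7, 9, ... -- one source pixel in four is
// dropped, with no filtering.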

/**
 * Scale plane to/from any dimensions.
 */
static void ScalePlaneAnySize(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr,
                              FilterMode filtering) {
  if (!filtering) {
    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src_ptr, dst_ptr);
  } else {
    // Fall back to the non-optimized bilinear version.
    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src_ptr, dst_ptr);
  }
}

/**
 * Scale plane down, any size
 *
 * This is an optimized version for scaling down a plane to any size.
 * The current implementation is ~10 times faster than the
 * reference implementation for, e.g., XGA->LowResPAL.
 */
static void ScalePlaneDown(int src_width, int src_height,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr,
                           FilterMode filtering) {
  if (!filtering) {
    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src_ptr, dst_ptr);
  } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
    // Between 1/2x and 1x, use bilinear. Note: the test must compare
    // dst_height * 2 against src_height; the reverse comparison is always
    // true when scaling down and would make the box filter unreachable.
    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src_ptr, dst_ptr);
  } else {
    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
                  src_stride, dst_stride, src_ptr, dst_ptr);
  }
}
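
// A minimal sketch (hypothetical predicate, not part of the library) of the
// choice above: bilinear filters only two source rows per output row, so it
// is used when the output is more than half the source height; at or below
// 1/2, the box filter averages every covered source pixel instead.
inline static bool UseBilinearForDown(FilterMode filtering,
                                      int src_height, int dst_height) {
  return filtering == kFilterBilinear || dst_height * 2 > src_height;
}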

/**
 * Copy plane, no scaling
 *
 * This simply copies the given plane without scaling.
 * The current implementation is ~115 times faster
 * than the reference implementation.
 */
static void CopyPlane(int src_width, int src_height,
                      int dst_width, int dst_height,
                      int src_stride, int dst_stride,
                      const uint8* src_ptr, uint8* dst_ptr) {
  if (src_stride == src_width && dst_stride == dst_width) {
    // All contiguous, so the whole plane can be copied with one memcpy.
    memcpy(dst_ptr, src_ptr, src_width * src_height);
  } else {
    // Not all contiguous; copy the scanlines individually.
    const uint8* src = src_ptr;
    uint8* dst = dst_ptr;
    for (int i = 0; i < src_height; ++i) {
      memcpy(dst, src, src_width);
      dst += dst_stride;
      src += src_stride;
    }
  }
}
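
// Example of why the stride check matters: a 16x8 plane stored with a stride
// of 32 has 16 bytes of padding after each row. A single 16 * 8 byte memcpy
// would copy padding bytes instead of the later rows, so the row loop copies
// only the 16 payload bytes of each of the 8 rows.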

static void ScalePlane(const uint8* src, int src_stride,
                       int src_width, int src_height,
                       uint8* dst, int dst_stride,
                       int dst_width, int dst_height,
                       FilterMode filtering, bool use_ref) {
  // Use specialized scalers to improve performance for common resolutions.
  // For example, all the 1/2 scalings will use ScalePlaneDown2().
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
              dst_stride, src, dst);
  } else if (dst_width <= src_width && dst_height <= src_height) {
    // Scale down.
    if (use_ref) {
      // For testing, allow the optimized versions to be disabled.
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == 3 * src_width &&
               4 * dst_height == 3 * src_height) {
      // Optimized, 3/4.
      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
      // Optimized, 1/2.
      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    // 3/8 rounded up for odd-sized chroma height.
    } else if (8 * dst_width == 3 * src_width &&
               dst_height == ((src_height * 3 + 7) / 8)) {
      // Optimized, 3/8.
      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
      // Optimized, 1/4.
      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
      // Optimized, 1/8.
      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else {
      // Arbitrary downsample.
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    }
  } else {
    // Arbitrary scale up and/or down.
    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
  }
}
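
// Worked example of the ratio matching above: 1280x720 -> 480x270 satisfies
// 8 * 480 == 3 * 1280 and 270 == (720 * 3 + 7) / 8, so it dispatches to
// ScalePlaneDown38, while 1280x720 -> 960x540 satisfies the 3/4 test and
// dispatches to ScalePlaneDown34.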

/**
 * Scale an I420 image, plane by plane.
 *
 * This function in turn calls a scaling function
 * suitable for handling the desired resolutions.
 */
int I420Scale(const uint8* src_y, int src_stride_y,
              const uint8* src_u, int src_stride_u,
              const uint8* src_v, int src_stride_v,
              int src_width, int src_height,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int dst_width, int dst_height,
              FilterMode filtering) {
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    int halfheight = (src_height + 1) >> 1;
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  int halfsrc_width = (src_width + 1) >> 1;
  int halfsrc_height = (src_height + 1) >> 1;
  int halfdst_width = (dst_width + 1) >> 1;
  int halfdst_height = (dst_height + 1) >> 1;

  ScalePlane(src_y, src_stride_y, src_width, src_height,
             dst_y, dst_stride_y, dst_width, dst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
             dst_u, dst_stride_u, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
             dst_v, dst_stride_v, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  return 0;
}
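
// Usage sketch (hypothetical buffers; not part of the library): scale an I420
// VGA image down to QVGA with box filtering, using packed planes whose
// strides equal their widths.
//
//   uint8 src_y[640 * 480], src_u[320 * 240], src_v[320 * 240];
//   uint8 dst_y[320 * 240], dst_u[160 * 120], dst_v[160 * 120];
//   int ret = I420Scale(src_y, 640, src_u, 320, src_v, 320, 640, 480,
//                       dst_y, 320, dst_u, 160, dst_v, 160, 320, 240,
//                       kFilterBox);
//   // ret is 0 on success, -1 if a pointer or dimension is invalid.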

int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
          uint8* dst_y, uint8* dst_u, uint8* dst_v,
          int dst_stride_y, int dst_stride_u, int dst_stride_v,
          int dst_width, int dst_height,
          bool interpolate) {
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    int halfheight = (src_height + 1) >> 1;
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  int halfsrc_width = (src_width + 1) >> 1;
  int halfsrc_height = (src_height + 1) >> 1;
  int halfdst_width = (dst_width + 1) >> 1;
  int halfdst_height = (dst_height + 1) >> 1;
  FilterMode filtering = interpolate ? kFilterBox : kFilterNone;

  ScalePlane(src_y, src_stride_y, src_width, src_height,
             dst_y, dst_stride_y, dst_width, dst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
             dst_u, dst_stride_u, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
             dst_v, dst_stride_v, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  return 0;
}

int Scale(const uint8* src, int src_width, int src_height,
          uint8* dst, int dst_width, int dst_height, int ooffset,
          bool interpolate) {
  if (!src || src_width <= 0 || src_height <= 0 ||
      !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
      ooffset >= dst_height) {
    return -1;
  }
  ooffset = ooffset & ~1;  // Chroma requires the offset to be a multiple of 2.
  int halfsrc_width = (src_width + 1) >> 1;
  int halfsrc_height = (src_height + 1) >> 1;
  int halfdst_width = (dst_width + 1) >> 1;
  int halfdst_height = (dst_height + 1) >> 1;
  int aheight = dst_height - ooffset * 2;  // Actual output height.
  const uint8* const iyptr = src;
  uint8* oyptr = dst + ooffset * dst_width;
  const uint8* const iuptr = src + src_width * src_height;
  uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
  const uint8* const ivptr = src + src_width * src_height +
                             halfsrc_width * halfsrc_height;
  uint8* ovptr = dst + dst_width * dst_height +
                 halfdst_width * halfdst_height +
                 (ooffset >> 1) * halfdst_width;
  return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
               src_width, src_height, oyptr, ouptr, ovptr, dst_width,
               halfdst_width, halfdst_width, dst_width, aheight, interpolate);
}
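
// The pointer arithmetic above assumes a fully packed I420 buffer: a Y plane
// of src_width * src_height bytes, followed by U and V planes of
// halfsrc_width * halfsrc_height bytes each. For a 640x480 source, Y starts
// at offset 0, U at 640 * 480 = 307200, and V at 307200 + 320 * 240 = 384000.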

}  // namespace libyuv