      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/scale.h"
     12 
     13 #include <assert.h>
     14 #include <string.h>
     15 #include <stdlib.h>  // For getenv()
     16 
     17 #include "libyuv/cpu_id.h"
     18 #include "libyuv/planar_functions.h"  // For CopyPlane
     19 #include "libyuv/row.h"
     20 
     21 #ifdef __cplusplus
     22 namespace libyuv {
     23 extern "C" {
     24 #endif
     25 
     26 // Bilinear SSE2 is disabled.
     27 #define SSE2_DISABLED 1
     28 
      29 // Note: Some SSE2 reference manuals:
      30 // cpuvol1.pdf, agner_instruction_tables.pdf, 253666.pdf, 253667.pdf
     31 
     32 // Set the following flag to true to revert to only
     33 // using the reference implementation ScalePlaneBox(), and
     34 // NOT the optimized versions. Useful for debugging and
     35 // when comparing the quality of the resulting YUV planes
     36 // as produced by the optimized and non-optimized versions.
     37 static bool use_reference_impl_ = false;
     38 
     39 LIBYUV_API
     40 void SetUseReferenceImpl(bool use) {
     41   use_reference_impl_ = use;
     42 }
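
// Usage sketch (illustrative only, not part of the original source): wrap a
// scale call so it runs against the reference ScalePlaneBox() path, e.g. to
// compare its output planes against the optimized code paths.
static void ScaleWithReferenceImplExample() {
  SetUseReferenceImpl(true);   // subsequent scales use the reference code
  // ... call ScalePlane()/I420Scale() here and inspect the output planes ...
  SetUseReferenceImpl(false);  // restore the optimized code paths
}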
     43 
     44 // ScaleRowDown2Int also used by planar functions
     45 
     46 /**
     47  * NEON downscalers with interpolation.
     48  *
     49  * Provided by Fritz Koenig
     50  *
     51  */
     52 
     53 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
     54 #define HAS_SCALEROWDOWN2_NEON
      55 // Note: not static, due to reuse in convert for 444 to 420.
     56 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
     57                         uint8* dst, int dst_width);
     58 
     59 void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     60                            uint8* dst, int dst_width);
     61 
     62 #define HAS_SCALEROWDOWN4_NEON
     63 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
     64                         uint8* dst_ptr, int dst_width);
     65 void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     66                            uint8* dst_ptr, int dst_width);
     67 
     68 #define HAS_SCALEROWDOWN34_NEON
      69 // Downscale from 4 to 3 pixels. Uses the NEON multilane read/write
      70 // to load every 4th pixel into 4 different registers.
     71 // Point samples 32 pixels to 24 pixels.
     72 void ScaleRowDown34_NEON(const uint8* src_ptr,
     73                          ptrdiff_t /* src_stride */,
     74                          uint8* dst_ptr, int dst_width);
     75 void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
     76                                ptrdiff_t src_stride,
     77                                uint8* dst_ptr, int dst_width);
     78 void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
     79                                ptrdiff_t src_stride,
     80                                uint8* dst_ptr, int dst_width);
     81 
     82 #define HAS_SCALEROWDOWN38_NEON
     83 // 32 -> 12
     84 void ScaleRowDown38_NEON(const uint8* src_ptr,
     85                          ptrdiff_t /* src_stride */,
     86                          uint8* dst_ptr, int dst_width);
     87 // 32x3 -> 12x1
     88 void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
     89                                ptrdiff_t src_stride,
     90                                uint8* dst_ptr, int dst_width);
     91 // 32x2 -> 12x1
     92 void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
     93                                ptrdiff_t src_stride,
     94                                uint8* dst_ptr, int dst_width);
     95 // 16x2 -> 16x1
     96 #define HAS_SCALEFILTERROWS_NEON
     97 void ScaleFilterRows_NEON(uint8* dst_ptr,
     98                           const uint8* src_ptr, ptrdiff_t src_stride,
     99                           int dst_width, int source_y_fraction);
    100 
    101 /**
    102  * SSE2 downscalers with interpolation.
    103  *
    104  * Provided by Frank Barchard (fbarchard (at) google.com)
    105  *
    106  */
    107 
    108 
    109 // Constants for SSSE3 code
    110 #elif !defined(YUV_DISABLE_ASM) && \
    111     (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
    112 
     113 // GCC 4.2 on OSX has a link error when static or const is passed to inline.
    114 // TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
    115 #ifdef __APPLE__
    116 #define CONST
    117 #else
    118 #define CONST static const
    119 #endif
    120 
    121 // Offsets for source bytes 0 to 9
    122 CONST uvec8 kShuf0 =
    123   { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
    124 
    125 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
    126 CONST uvec8 kShuf1 =
    127   { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
    128 
     129 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
    130 CONST uvec8 kShuf2 =
    131   { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
    132 
    133 // Offsets for source bytes 0 to 10
    134 CONST uvec8 kShuf01 =
    135   { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
    136 
     137 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
    138 CONST uvec8 kShuf11 =
    139   { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
    140 
     141 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
    142 CONST uvec8 kShuf21 =
    143   { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
    144 
    145 // Coefficients for source bytes 0 to 10
    146 CONST uvec8 kMadd01 =
    147   { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
    148 
    149 // Coefficients for source bytes 10 to 21
    150 CONST uvec8 kMadd11 =
    151   { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
    152 
    153 // Coefficients for source bytes 21 to 31
    154 CONST uvec8 kMadd21 =
    155   { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
    156 
     157 // Rounding constant added before the final >> 2 in the 3/4 filters.
    158 CONST vec16 kRound34 =
    159   { 2, 2, 2, 2, 2, 2, 2, 2 };
    160 
    161 CONST uvec8 kShuf38a =
    162   { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    163 
    164 CONST uvec8 kShuf38b =
    165   { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
    166 
    167 // Arrange words 0,3,6 into 0,1,2
    168 CONST uvec8 kShufAc =
    169   { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    170 
    171 // Arrange words 0,3,6 into 3,4,5
    172 CONST uvec8 kShufAc3 =
    173   { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
    174 
    175 // Scaling values for boxes of 3x3 and 2x3
    176 CONST uvec16 kScaleAc33 =
    177   { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
    178 
    179 // Arrange first value for pixels 0,1,2,3,4,5
    180 CONST uvec8 kShufAb0 =
    181   { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
    182 
    183 // Arrange second value for pixels 0,1,2,3,4,5
    184 CONST uvec8 kShufAb1 =
    185   { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
    186 
    187 // Arrange third value for pixels 0,1,2,3,4,5
    188 CONST uvec8 kShufAb2 =
    189   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
    190 
    191 // Scaling values for boxes of 3x2 and 2x2
    192 CONST uvec16 kScaleAb2 =
    193   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
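
// Illustrative sketch (not part of the original source): the kScaleAc33 and
// kScaleAb2 tables above exploit pmulhuw, which computes (a * b) >> 16, so
// multiplying a box sum by 65536 / N approximates dividing by N.
static uint8 ScaleBoxSumSketch(uint16 box_sum, uint16 scale_65536_over_n) {
  return static_cast<uint8>((static_cast<uint32>(box_sum) *
                             scale_65536_over_n) >> 16);
}
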
    194 #endif
    195 
    196 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
    197 
    198 #define HAS_SCALEROWDOWN2_SSE2
    199 // Reads 32 pixels, throws half away and writes 16 pixels.
    200 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
    201 __declspec(naked) __declspec(align(16))
    202 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    203                                uint8* dst_ptr, int dst_width) {
    204   __asm {
    205     mov        eax, [esp + 4]        // src_ptr
    206                                      // src_stride ignored
    207     mov        edx, [esp + 12]       // dst_ptr
    208     mov        ecx, [esp + 16]       // dst_width
    209     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    210     psrlw      xmm5, 8
    211 
    212     align      16
    213   wloop:
    214     movdqa     xmm0, [eax]
    215     movdqa     xmm1, [eax + 16]
    216     lea        eax,  [eax + 32]
    217     pand       xmm0, xmm5
    218     pand       xmm1, xmm5
    219     packuswb   xmm0, xmm1
    220     sub        ecx, 16
    221     movdqa     [edx], xmm0
    222     lea        edx, [edx + 16]
    223     jg         wloop
    224 
    225     ret
    226   }
    227 }
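
// Scalar sketch (illustrative, not the libyuv C fallback) of the point
// sampling above: keep one source pixel out of every horizontal pair.
static void ScaleRowDown2Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2];  // the pand mask keeps the even bytes
  }
}
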
    228 // Blends 32x2 rectangle to 16x1.
    229 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
    230 __declspec(naked) __declspec(align(16))
    231 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    232                            uint8* dst_ptr, int dst_width) {
    233   __asm {
    234     push       esi
    235     mov        eax, [esp + 4 + 4]    // src_ptr
    236     mov        esi, [esp + 4 + 8]    // src_stride
    237     mov        edx, [esp + 4 + 12]   // dst_ptr
    238     mov        ecx, [esp + 4 + 16]   // dst_width
    239     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    240     psrlw      xmm5, 8
    241 
    242     align      16
    243   wloop:
    244     movdqa     xmm0, [eax]
    245     movdqa     xmm1, [eax + 16]
    246     movdqa     xmm2, [eax + esi]
    247     movdqa     xmm3, [eax + esi + 16]
    248     lea        eax,  [eax + 32]
    249     pavgb      xmm0, xmm2            // average rows
    250     pavgb      xmm1, xmm3
    251 
    252     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    253     psrlw      xmm0, 8
    254     movdqa     xmm3, xmm1
    255     psrlw      xmm1, 8
    256     pand       xmm2, xmm5
    257     pand       xmm3, xmm5
    258     pavgw      xmm0, xmm2
    259     pavgw      xmm1, xmm3
    260     packuswb   xmm0, xmm1
    261 
    262     sub        ecx, 16
    263     movdqa     [edx], xmm0
    264     lea        edx, [edx + 16]
    265     jg         wloop
    266 
    267     pop        esi
    268     ret
    269   }
    270 }
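
// Scalar sketch (illustrative) of the 2x2 box filter approximated above: each
// output pixel is the rounded average of a 2x2 block of source pixels.
static void ScaleRowDown2IntSketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = static_cast<uint8>((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}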
    271 
    272 // Reads 32 pixels, throws half away and writes 16 pixels.
     273 // Alignment requirement: none. src_ptr and dst_ptr may be unaligned.
    274 __declspec(naked) __declspec(align(16))
    275 static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
    276                                          ptrdiff_t src_stride,
    277                                          uint8* dst_ptr, int dst_width) {
    278   __asm {
    279     mov        eax, [esp + 4]        // src_ptr
    280                                      // src_stride ignored
    281     mov        edx, [esp + 12]       // dst_ptr
    282     mov        ecx, [esp + 16]       // dst_width
    283     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    284     psrlw      xmm5, 8
    285 
    286     align      16
    287   wloop:
    288     movdqu     xmm0, [eax]
    289     movdqu     xmm1, [eax + 16]
    290     lea        eax,  [eax + 32]
    291     pand       xmm0, xmm5
    292     pand       xmm1, xmm5
    293     packuswb   xmm0, xmm1
    294     sub        ecx, 16
    295     movdqu     [edx], xmm0
    296     lea        edx, [edx + 16]
    297     jg         wloop
    298 
    299     ret
    300   }
    301 }
    302 // Blends 32x2 rectangle to 16x1.
     303 // Alignment requirement: none. src_ptr and dst_ptr may be unaligned.
    304 __declspec(naked) __declspec(align(16))
    305 static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
    306                                             ptrdiff_t src_stride,
    307                                             uint8* dst_ptr, int dst_width) {
    308   __asm {
    309     push       esi
    310     mov        eax, [esp + 4 + 4]    // src_ptr
    311     mov        esi, [esp + 4 + 8]    // src_stride
    312     mov        edx, [esp + 4 + 12]   // dst_ptr
    313     mov        ecx, [esp + 4 + 16]   // dst_width
    314     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    315     psrlw      xmm5, 8
    316 
    317     align      16
    318   wloop:
    319     movdqu     xmm0, [eax]
    320     movdqu     xmm1, [eax + 16]
    321     movdqu     xmm2, [eax + esi]
    322     movdqu     xmm3, [eax + esi + 16]
    323     lea        eax,  [eax + 32]
    324     pavgb      xmm0, xmm2            // average rows
    325     pavgb      xmm1, xmm3
    326 
    327     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    328     psrlw      xmm0, 8
    329     movdqa     xmm3, xmm1
    330     psrlw      xmm1, 8
    331     pand       xmm2, xmm5
    332     pand       xmm3, xmm5
    333     pavgw      xmm0, xmm2
    334     pavgw      xmm1, xmm3
    335     packuswb   xmm0, xmm1
    336 
    337     sub        ecx, 16
    338     movdqu     [edx], xmm0
    339     lea        edx, [edx + 16]
    340     jg         wloop
    341 
    342     pop        esi
    343     ret
    344   }
    345 }
    346 
    347 #define HAS_SCALEROWDOWN4_SSE2
    348 // Point samples 32 pixels to 8 pixels.
    349 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    350 __declspec(naked) __declspec(align(16))
    351 static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    352                                uint8* dst_ptr, int dst_width) {
    353   __asm {
    354     mov        eax, [esp + 4]        // src_ptr
    355                                      // src_stride ignored
    356     mov        edx, [esp + 12]       // dst_ptr
    357     mov        ecx, [esp + 16]       // dst_width
    358     pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
    359     psrld      xmm5, 24
    360 
    361     align      16
    362   wloop:
    363     movdqa     xmm0, [eax]
    364     movdqa     xmm1, [eax + 16]
    365     lea        eax,  [eax + 32]
    366     pand       xmm0, xmm5
    367     pand       xmm1, xmm5
    368     packuswb   xmm0, xmm1
    369     packuswb   xmm0, xmm0
    370     sub        ecx, 8
    371     movq       qword ptr [edx], xmm0
    372     lea        edx, [edx + 8]
    373     jg         wloop
    374 
    375     ret
    376   }
    377 }
    378 
    379 // Blends 32x4 rectangle to 8x1.
    380 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    381 __declspec(naked) __declspec(align(16))
    382 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    383                                   uint8* dst_ptr, int dst_width) {
    384   __asm {
    385     push       esi
    386     push       edi
    387     mov        eax, [esp + 8 + 4]    // src_ptr
    388     mov        esi, [esp + 8 + 8]    // src_stride
    389     mov        edx, [esp + 8 + 12]   // dst_ptr
    390     mov        ecx, [esp + 8 + 16]   // dst_width
    391     lea        edi, [esi + esi * 2]  // src_stride * 3
    392     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    393     psrlw      xmm7, 8
    394 
    395     align      16
    396   wloop:
    397     movdqa     xmm0, [eax]
    398     movdqa     xmm1, [eax + 16]
    399     movdqa     xmm2, [eax + esi]
    400     movdqa     xmm3, [eax + esi + 16]
    401     pavgb      xmm0, xmm2            // average rows
    402     pavgb      xmm1, xmm3
    403     movdqa     xmm2, [eax + esi * 2]
    404     movdqa     xmm3, [eax + esi * 2 + 16]
    405     movdqa     xmm4, [eax + edi]
    406     movdqa     xmm5, [eax + edi + 16]
    407     lea        eax, [eax + 32]
    408     pavgb      xmm2, xmm4
    409     pavgb      xmm3, xmm5
    410     pavgb      xmm0, xmm2
    411     pavgb      xmm1, xmm3
    412 
    413     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    414     psrlw      xmm0, 8
    415     movdqa     xmm3, xmm1
    416     psrlw      xmm1, 8
    417     pand       xmm2, xmm7
    418     pand       xmm3, xmm7
    419     pavgw      xmm0, xmm2
    420     pavgw      xmm1, xmm3
    421     packuswb   xmm0, xmm1
    422 
    423     movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    424     psrlw      xmm0, 8
    425     pand       xmm2, xmm7
    426     pavgw      xmm0, xmm2
    427     packuswb   xmm0, xmm0
    428 
    429     sub        ecx, 8
    430     movq       qword ptr [edx], xmm0
    431     lea        edx, [edx + 8]
    432     jg         wloop
    433 
    434     pop        edi
    435     pop        esi
    436     ret
    437   }
    438 }
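
// Scalar sketch (illustrative): pavgb computes (a + b + 1) >> 1, so the
// cascade above reduces 4 rows to 1 using rounded averages of averages, a
// close (not bit-exact) approximation of the true 4-row mean.
static uint8 RoundedAvgSketch(uint8 a, uint8 b) {
  return static_cast<uint8>((a + b + 1) >> 1);
}
static uint8 Average4RowsSketch(const uint8* p, ptrdiff_t stride) {
  uint8 r01 = RoundedAvgSketch(p[0], p[stride]);               // rows 0 and 1
  uint8 r23 = RoundedAvgSketch(p[stride * 2], p[stride * 3]);  // rows 2 and 3
  return RoundedAvgSketch(r01, r23);                           // rows 0..3
}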
    439 
    440 #define HAS_SCALEROWDOWN8_SSE2
    441 // Point samples 32 pixels to 4 pixels.
    442 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
    443 __declspec(naked) __declspec(align(16))
    444 static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    445                                uint8* dst_ptr, int dst_width) {
    446   __asm {
    447     mov        eax, [esp + 4]        // src_ptr
    448                                      // src_stride ignored
    449     mov        edx, [esp + 12]       // dst_ptr
    450     mov        ecx, [esp + 16]       // dst_width
     451     pcmpeqb    xmm5, xmm5            // generate mask isolating 1 byte of each 8 src bytes
    452     psrlq      xmm5, 56
    453 
    454     align      16
    455   wloop:
    456     movdqa     xmm0, [eax]
    457     movdqa     xmm1, [eax + 16]
    458     lea        eax,  [eax + 32]
    459     pand       xmm0, xmm5
    460     pand       xmm1, xmm5
    461     packuswb   xmm0, xmm1  // 32->16
    462     packuswb   xmm0, xmm0  // 16->8
    463     packuswb   xmm0, xmm0  // 8->4
    464     sub        ecx, 4
    465     movd       dword ptr [edx], xmm0
    466     lea        edx, [edx + 4]
    467     jg         wloop
    468 
    469     ret
    470   }
    471 }
    472 
    473 // Blends 32x8 rectangle to 4x1.
    474 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
    475 __declspec(naked) __declspec(align(16))
    476 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    477                                   uint8* dst_ptr, int dst_width) {
    478   __asm {
    479     push       esi
    480     push       edi
    481     push       ebp
    482     mov        eax, [esp + 12 + 4]   // src_ptr
    483     mov        esi, [esp + 12 + 8]   // src_stride
    484     mov        edx, [esp + 12 + 12]  // dst_ptr
    485     mov        ecx, [esp + 12 + 16]  // dst_width
    486     lea        edi, [esi + esi * 2]  // src_stride * 3
    487     pxor       xmm7, xmm7
    488 
    489     align      16
    490   wloop:
    491     movdqa     xmm0, [eax]           // average 8 rows to 1
    492     movdqa     xmm1, [eax + 16]
    493     movdqa     xmm2, [eax + esi]
    494     movdqa     xmm3, [eax + esi + 16]
    495     pavgb      xmm0, xmm2
    496     pavgb      xmm1, xmm3
    497     movdqa     xmm2, [eax + esi * 2]
    498     movdqa     xmm3, [eax + esi * 2 + 16]
    499     movdqa     xmm4, [eax + edi]
    500     movdqa     xmm5, [eax + edi + 16]
    501     lea        ebp, [eax + esi * 4]
    502     lea        eax, [eax + 32]
    503     pavgb      xmm2, xmm4
    504     pavgb      xmm3, xmm5
    505     pavgb      xmm0, xmm2
    506     pavgb      xmm1, xmm3
    507 
    508     movdqa     xmm2, [ebp]
    509     movdqa     xmm3, [ebp + 16]
    510     movdqa     xmm4, [ebp + esi]
    511     movdqa     xmm5, [ebp + esi + 16]
    512     pavgb      xmm2, xmm4
    513     pavgb      xmm3, xmm5
    514     movdqa     xmm4, [ebp + esi * 2]
    515     movdqa     xmm5, [ebp + esi * 2 + 16]
    516     movdqa     xmm6, [ebp + edi]
    517     pavgb      xmm4, xmm6
    518     movdqa     xmm6, [ebp + edi + 16]
    519     pavgb      xmm5, xmm6
    520     pavgb      xmm2, xmm4
    521     pavgb      xmm3, xmm5
    522     pavgb      xmm0, xmm2
    523     pavgb      xmm1, xmm3
    524 
    525     psadbw     xmm0, xmm7            // average 32 pixels to 4
    526     psadbw     xmm1, xmm7
    527     pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    528     pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    529     por        xmm0, xmm1            //      -> 3201
    530     psrlw      xmm0, 3
    531     packuswb   xmm0, xmm0
    532     packuswb   xmm0, xmm0
    533 
    534     sub        ecx, 4
    535     movd       dword ptr [edx], xmm0
    536     lea        edx, [edx + 4]
    537     jg         wloop
    538 
    539     pop        ebp
    540     pop        edi
    541     pop        esi
    542     ret
    543   }
    544 }
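
// Scalar sketch (illustrative): after the pavgb tree reduces 8 rows to 1,
// psadbw against a zero register sums each group of 8 bytes, and psrlw 3
// divides by 8, so each output pixel approximates an 8x8 block mean.
static uint8 AverageRowOf8Sketch(const uint8* row) {
  int sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += row[i];  // what psadbw computes against zero
  }
  return static_cast<uint8>(sum >> 3);
}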
    545 
    546 #define HAS_SCALEROWDOWN34_SSSE3
    547 // Point samples 32 pixels to 24 pixels.
    548 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
    549 // Then shuffled to do the scaling.
    550 
    551 // Note that movdqa+palign may be better than movdqu.
    552 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    553 __declspec(naked) __declspec(align(16))
    554 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
    555                                  uint8* dst_ptr, int dst_width) {
    556   __asm {
    557     mov        eax, [esp + 4]        // src_ptr
    558                                      // src_stride ignored
    559     mov        edx, [esp + 12]       // dst_ptr
    560     mov        ecx, [esp + 16]       // dst_width
    561     movdqa     xmm3, kShuf0
    562     movdqa     xmm4, kShuf1
    563     movdqa     xmm5, kShuf2
    564 
    565     align      16
    566   wloop:
    567     movdqa     xmm0, [eax]
    568     movdqa     xmm1, [eax + 16]
    569     lea        eax,  [eax + 32]
    570     movdqa     xmm2, xmm1
    571     palignr    xmm1, xmm0, 8
    572     pshufb     xmm0, xmm3
    573     pshufb     xmm1, xmm4
    574     pshufb     xmm2, xmm5
    575     movq       qword ptr [edx], xmm0
    576     movq       qword ptr [edx + 8], xmm1
    577     movq       qword ptr [edx + 16], xmm2
    578     lea        edx, [edx + 24]
    579     sub        ecx, 24
    580     jg         wloop
    581 
    582     ret
    583   }
    584 }
    585 
    586 // Blends 32x2 rectangle to 24x1
    587 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
    588 // Then shuffled to do the scaling.
    589 
    590 // Register usage:
    591 // xmm0 src_row 0
    592 // xmm1 src_row 1
    593 // xmm2 shuf 0
    594 // xmm3 shuf 1
    595 // xmm4 shuf 2
    596 // xmm5 madd 0
    597 // xmm6 madd 1
    598 // xmm7 kRound34
    599 
    600 // Note that movdqa+palign may be better than movdqu.
    601 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    602 __declspec(naked) __declspec(align(16))
    603 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
    604                                        ptrdiff_t src_stride,
    605                                        uint8* dst_ptr, int dst_width) {
    606   __asm {
    607     push       esi
    608     mov        eax, [esp + 4 + 4]    // src_ptr
    609     mov        esi, [esp + 4 + 8]    // src_stride
    610     mov        edx, [esp + 4 + 12]   // dst_ptr
    611     mov        ecx, [esp + 4 + 16]   // dst_width
    612     movdqa     xmm2, kShuf01
    613     movdqa     xmm3, kShuf11
    614     movdqa     xmm4, kShuf21
    615     movdqa     xmm5, kMadd01
    616     movdqa     xmm6, kMadd11
    617     movdqa     xmm7, kRound34
    618 
    619     align      16
    620   wloop:
    621     movdqa     xmm0, [eax]           // pixels 0..7
    622     movdqa     xmm1, [eax + esi]
    623     pavgb      xmm0, xmm1
    624     pshufb     xmm0, xmm2
    625     pmaddubsw  xmm0, xmm5
    626     paddsw     xmm0, xmm7
    627     psrlw      xmm0, 2
    628     packuswb   xmm0, xmm0
    629     movq       qword ptr [edx], xmm0
    630     movdqu     xmm0, [eax + 8]       // pixels 8..15
    631     movdqu     xmm1, [eax + esi + 8]
    632     pavgb      xmm0, xmm1
    633     pshufb     xmm0, xmm3
    634     pmaddubsw  xmm0, xmm6
    635     paddsw     xmm0, xmm7
    636     psrlw      xmm0, 2
    637     packuswb   xmm0, xmm0
    638     movq       qword ptr [edx + 8], xmm0
    639     movdqa     xmm0, [eax + 16]      // pixels 16..23
    640     movdqa     xmm1, [eax + esi + 16]
    641     lea        eax, [eax + 32]
    642     pavgb      xmm0, xmm1
    643     pshufb     xmm0, xmm4
    644     movdqa     xmm1, kMadd21
    645     pmaddubsw  xmm0, xmm1
    646     paddsw     xmm0, xmm7
    647     psrlw      xmm0, 2
    648     packuswb   xmm0, xmm0
    649     sub        ecx, 24
    650     movq       qword ptr [edx + 16], xmm0
    651     lea        edx, [edx + 24]
    652     jg         wloop
    653 
    654     pop        esi
    655     ret
    656   }
    657 }
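
// Scalar sketch (illustrative) of the 3/4 filter above: after the two rows
// are averaged, each group of 4 source pixels becomes 3 output pixels using
// the kMadd weights (3,1), (2,2) and (1,3), rounded by kRound34 (+2) and
// shifted right by 2.
static void ScaleRowDown34FilterSketch(const uint8* s, uint8* d) {
  d[0] = static_cast<uint8>((s[0] * 3 + s[1] * 1 + 2) >> 2);
  d[1] = static_cast<uint8>((s[1] * 2 + s[2] * 2 + 2) >> 2);
  d[2] = static_cast<uint8>((s[2] * 1 + s[3] * 3 + 2) >> 2);
}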
    658 
    659 // Note that movdqa+palign may be better than movdqu.
    660 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
    661 __declspec(naked) __declspec(align(16))
    662 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
    663                                        ptrdiff_t src_stride,
    664                                        uint8* dst_ptr, int dst_width) {
    665   __asm {
    666     push       esi
    667     mov        eax, [esp + 4 + 4]    // src_ptr
    668     mov        esi, [esp + 4 + 8]    // src_stride
    669     mov        edx, [esp + 4 + 12]   // dst_ptr
    670     mov        ecx, [esp + 4 + 16]   // dst_width
    671     movdqa     xmm2, kShuf01
    672     movdqa     xmm3, kShuf11
    673     movdqa     xmm4, kShuf21
    674     movdqa     xmm5, kMadd01
    675     movdqa     xmm6, kMadd11
    676     movdqa     xmm7, kRound34
    677 
    678     align      16
    679   wloop:
    680     movdqa     xmm0, [eax]           // pixels 0..7
    681     movdqa     xmm1, [eax + esi]
    682     pavgb      xmm1, xmm0
    683     pavgb      xmm0, xmm1
    684     pshufb     xmm0, xmm2
    685     pmaddubsw  xmm0, xmm5
    686     paddsw     xmm0, xmm7
    687     psrlw      xmm0, 2
    688     packuswb   xmm0, xmm0
    689     movq       qword ptr [edx], xmm0
    690     movdqu     xmm0, [eax + 8]       // pixels 8..15
    691     movdqu     xmm1, [eax + esi + 8]
    692     pavgb      xmm1, xmm0
    693     pavgb      xmm0, xmm1
    694     pshufb     xmm0, xmm3
    695     pmaddubsw  xmm0, xmm6
    696     paddsw     xmm0, xmm7
    697     psrlw      xmm0, 2
    698     packuswb   xmm0, xmm0
    699     movq       qword ptr [edx + 8], xmm0
    700     movdqa     xmm0, [eax + 16]      // pixels 16..23
    701     movdqa     xmm1, [eax + esi + 16]
    702     lea        eax, [eax + 32]
    703     pavgb      xmm1, xmm0
    704     pavgb      xmm0, xmm1
    705     pshufb     xmm0, xmm4
    706     movdqa     xmm1, kMadd21
    707     pmaddubsw  xmm0, xmm1
    708     paddsw     xmm0, xmm7
    709     psrlw      xmm0, 2
    710     packuswb   xmm0, xmm0
    711     sub        ecx, 24
    712     movq       qword ptr [edx + 16], xmm0
    713     lea        edx, [edx+24]
    714     jg         wloop
    715 
    716     pop        esi
    717     ret
    718   }
    719 }
    720 
    721 #define HAS_SCALEROWDOWN38_SSSE3
    722 // 3/8 point sampler
    723 
    724 // Scale 32 pixels to 12
    725 __declspec(naked) __declspec(align(16))
    726 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
    727                                  uint8* dst_ptr, int dst_width) {
    728   __asm {
    729     mov        eax, [esp + 4]        // src_ptr
    730                                      // src_stride ignored
    731     mov        edx, [esp + 12]       // dst_ptr
    732     mov        ecx, [esp + 16]       // dst_width
    733     movdqa     xmm4, kShuf38a
    734     movdqa     xmm5, kShuf38b
    735 
    736     align      16
    737   xloop:
    738     movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    739     movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    740     lea        eax, [eax + 32]
    741     pshufb     xmm0, xmm4
    742     pshufb     xmm1, xmm5
    743     paddusb    xmm0, xmm1
    744 
    745     sub        ecx, 12
    746     movq       qword ptr [edx], xmm0 // write 12 pixels
    747     movhlps    xmm1, xmm0
    748     movd       [edx + 8], xmm1
    749     lea        edx, [edx + 12]
    750     jg         xloop
    751 
    752     ret
    753   }
    754 }
    755 
    756 // Scale 16x3 pixels to 6x1 with interpolation
    757 __declspec(naked) __declspec(align(16))
    758 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
    759                                        ptrdiff_t src_stride,
    760                                        uint8* dst_ptr, int dst_width) {
    761   __asm {
    762     push       esi
    763     mov        eax, [esp + 4 + 4]    // src_ptr
    764     mov        esi, [esp + 4 + 8]    // src_stride
    765     mov        edx, [esp + 4 + 12]   // dst_ptr
    766     mov        ecx, [esp + 4 + 16]   // dst_width
    767     movdqa     xmm2, kShufAc
    768     movdqa     xmm3, kShufAc3
    769     movdqa     xmm4, kScaleAc33
    770     pxor       xmm5, xmm5
    771 
    772     align      16
    773   xloop:
    774     movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
    775     movdqa     xmm6, [eax + esi]
    776     movhlps    xmm1, xmm0
    777     movhlps    xmm7, xmm6
    778     punpcklbw  xmm0, xmm5
    779     punpcklbw  xmm1, xmm5
    780     punpcklbw  xmm6, xmm5
    781     punpcklbw  xmm7, xmm5
    782     paddusw    xmm0, xmm6
    783     paddusw    xmm1, xmm7
    784     movdqa     xmm6, [eax + esi * 2]
    785     lea        eax, [eax + 16]
    786     movhlps    xmm7, xmm6
    787     punpcklbw  xmm6, xmm5
    788     punpcklbw  xmm7, xmm5
    789     paddusw    xmm0, xmm6
    790     paddusw    xmm1, xmm7
    791 
    792     movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    793     psrldq     xmm0, 2
    794     paddusw    xmm6, xmm0
    795     psrldq     xmm0, 2
    796     paddusw    xmm6, xmm0
    797     pshufb     xmm6, xmm2
    798 
    799     movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    800     psrldq     xmm1, 2
    801     paddusw    xmm7, xmm1
    802     psrldq     xmm1, 2
    803     paddusw    xmm7, xmm1
    804     pshufb     xmm7, xmm3
    805     paddusw    xmm6, xmm7
    806 
    807     pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    808     packuswb   xmm6, xmm6
    809 
    810     sub        ecx, 6
    811     movd       [edx], xmm6           // write 6 pixels
    812     psrlq      xmm6, 16
    813     movd       [edx + 2], xmm6
    814     lea        edx, [edx + 6]
    815     jg         xloop
    816 
    817     pop        esi
    818     ret
    819   }
    820 }
    821 
    822 // Scale 16x2 pixels to 6x1 with interpolation
    823 __declspec(naked) __declspec(align(16))
    824 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
    825                                        ptrdiff_t src_stride,
    826                                        uint8* dst_ptr, int dst_width) {
    827   __asm {
    828     push       esi
    829     mov        eax, [esp + 4 + 4]    // src_ptr
    830     mov        esi, [esp + 4 + 8]    // src_stride
    831     mov        edx, [esp + 4 + 12]   // dst_ptr
    832     mov        ecx, [esp + 4 + 16]   // dst_width
    833     movdqa     xmm2, kShufAb0
    834     movdqa     xmm3, kShufAb1
    835     movdqa     xmm4, kShufAb2
    836     movdqa     xmm5, kScaleAb2
    837 
    838     align      16
    839   xloop:
    840     movdqa     xmm0, [eax]           // average 2 rows into xmm0
    841     pavgb      xmm0, [eax + esi]
    842     lea        eax, [eax + 16]
    843 
    844     movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    845     pshufb     xmm1, xmm2
    846     movdqa     xmm6, xmm0
    847     pshufb     xmm6, xmm3
    848     paddusw    xmm1, xmm6
    849     pshufb     xmm0, xmm4
    850     paddusw    xmm1, xmm0
    851 
    852     pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    853     packuswb   xmm1, xmm1
    854 
    855     sub        ecx, 6
    856     movd       [edx], xmm1           // write 6 pixels
    857     psrlq      xmm1, 16
    858     movd       [edx + 2], xmm1
    859     lea        edx, [edx + 6]
    860     jg         xloop
    861 
    862     pop        esi
    863     ret
    864   }
    865 }
    866 
    867 #define HAS_SCALEADDROWS_SSE2
    868 
    869 // Reads 16xN bytes and produces 16 shorts at a time.
    870 __declspec(naked) __declspec(align(16))
    871 static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    872                               uint16* dst_ptr, int src_width,
    873                               int src_height) {
    874   __asm {
    875     push       esi
    876     push       edi
    877     push       ebx
    878     push       ebp
    879     mov        esi, [esp + 16 + 4]   // src_ptr
    880     mov        edx, [esp + 16 + 8]   // src_stride
    881     mov        edi, [esp + 16 + 12]  // dst_ptr
     882     mov        ecx, [esp + 16 + 16]  // src_width
     883     mov        ebx, [esp + 16 + 20]  // src_height
    884     pxor       xmm4, xmm4
    885     dec        ebx
    886 
    887     align      16
    888   xloop:
    889     // first row
    890     movdqa     xmm0, [esi]
    891     lea        eax, [esi + edx]
    892     movdqa     xmm1, xmm0
    893     punpcklbw  xmm0, xmm4
    894     punpckhbw  xmm1, xmm4
    895     lea        esi, [esi + 16]
    896     mov        ebp, ebx
    897     test       ebp, ebp
    898     je         ydone
    899 
    900     // sum remaining rows
    901     align      16
    902   yloop:
    903     movdqa     xmm2, [eax]       // read 16 pixels
    904     lea        eax, [eax + edx]  // advance to next row
    905     movdqa     xmm3, xmm2
    906     punpcklbw  xmm2, xmm4
    907     punpckhbw  xmm3, xmm4
    908     paddusw    xmm0, xmm2        // sum 16 words
    909     paddusw    xmm1, xmm3
    910     sub        ebp, 1
    911     jg         yloop
    912   ydone:
    913     movdqa     [edi], xmm0
    914     movdqa     [edi + 16], xmm1
    915     lea        edi, [edi + 32]
    916 
    917     sub        ecx, 16
    918     jg         xloop
    919 
    920     pop        ebp
    921     pop        ebx
    922     pop        edi
    923     pop        esi
    924     ret
    925   }
    926 }
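
// Scalar sketch (illustrative): sum each column of a src_width x src_height
// block into a 16-bit accumulator row; the box scaler divides the sums later.
// Note the SSE2 version above saturates the sums via paddusw.
static void ScaleAddRowsSketch(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint16* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    int sum = 0;
    const uint8* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = static_cast<uint16>(sum);
  }
}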
    927 
    928 #ifndef SSE2_DISABLED
    929 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
    930 // Normal formula for bilinear interpolation is:
     931 //   source_y_fraction * row1 + (1 - source_y_fraction) * row0
     932 // The SSE2 version uses a single multiply of the difference, as in the
     933 //   scalar sketch after this block: source_y_fraction * (row1 - row0) + row0
    934 #define HAS_SCALEFILTERROWS_SSE2_DISABLED
    935 __declspec(naked) __declspec(align(16))
    936 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
    937                                  ptrdiff_t src_stride, int dst_width,
    938                                  int source_y_fraction) {
    939   __asm {
    940     push       esi
    941     push       edi
    942     mov        edi, [esp + 8 + 4]   // dst_ptr
    943     mov        esi, [esp + 8 + 8]   // src_ptr
    944     mov        edx, [esp + 8 + 12]  // src_stride
    945     mov        ecx, [esp + 8 + 16]  // dst_width
    946     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    947     sub        edi, esi
    948     cmp        eax, 0
    949     je         xloop1
    950     cmp        eax, 128
    951     je         xloop2
    952 
    953     movd       xmm5, eax            // xmm5 = y fraction
    954     punpcklbw  xmm5, xmm5
    955     punpcklwd  xmm5, xmm5
    956     pshufd     xmm5, xmm5, 0
    957     pxor       xmm4, xmm4
    958 
    959     align      16
    960   xloop:
    961     movdqa     xmm0, [esi]  // row0
    962     movdqa     xmm2, [esi + edx]  // row1
    963     movdqa     xmm1, xmm0
    964     movdqa     xmm3, xmm2
    965     punpcklbw  xmm2, xmm4
    966     punpckhbw  xmm3, xmm4
    967     punpcklbw  xmm0, xmm4
    968     punpckhbw  xmm1, xmm4
    969     psubw      xmm2, xmm0  // row1 - row0
    970     psubw      xmm3, xmm1
    971     pmulhw     xmm2, xmm5  // scale diff
    972     pmulhw     xmm3, xmm5
    973     paddw      xmm0, xmm2  // sum rows
    974     paddw      xmm1, xmm3
    975     packuswb   xmm0, xmm1
    976     sub        ecx, 16
    977     movdqa     [esi + edi], xmm0
    978     lea        esi, [esi + 16]
    979     jg         xloop
    980 
    981     punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
    982     pshufhw    xmm0, xmm0, 0xff
    983     punpckhqdq xmm0, xmm0
    984     movdqa     [esi + edi], xmm0
    985     pop        edi
    986     pop        esi
    987     ret
    988 
    989     align      16
    990   xloop1:
    991     movdqa     xmm0, [esi]
    992     sub        ecx, 16
    993     movdqa     [esi + edi], xmm0
    994     lea        esi, [esi + 16]
    995     jg         xloop1
    996 
    997     punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
    998     pshufhw    xmm0, xmm0, 0xff
    999     punpckhqdq xmm0, xmm0
   1000     movdqa     [esi + edi], xmm0
   1001     pop        edi
   1002     pop        esi
   1003     ret
   1004 
   1005     align      16
   1006   xloop2:
   1007     movdqa     xmm0, [esi]
   1008     pavgb      xmm0, [esi + edx]
   1009     sub        ecx, 16
   1010     movdqa     [esi + edi], xmm0
   1011     lea        esi, [esi + 16]
   1012     jg         xloop2
   1013 
   1014     punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
   1015     pshufhw    xmm0, xmm0, 0xff
   1016     punpckhqdq xmm0, xmm0
   1017     movdqa     [esi + edi], xmm0
   1018     pop        edi
   1019     pop        esi
   1020     ret
   1021   }
   1022 }
   1023 #endif  // SSE2_DISABLED
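
// Scalar sketch (illustrative) of the bilinear row filter described above:
// blend two rows as row0 + fraction * (row1 - row0), where source_y_fraction
// in 0..255 is treated as a fixed-point fraction over 256.
static void ScaleFilterRowsSketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  const uint8* row0 = src_ptr;
  const uint8* row1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    int p0 = row0[x];
    int p1 = row1[x];
    dst_ptr[x] = static_cast<uint8>(p0 + ((source_y_fraction * (p1 - p0)) >> 8));
  }
}
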
   1024 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
   1025 #define HAS_SCALEFILTERROWS_SSSE3
   1026 __declspec(naked) __declspec(align(16))
   1027 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   1028                                   ptrdiff_t src_stride, int dst_width,
   1029                                   int source_y_fraction) {
   1030   __asm {
   1031     push       esi
   1032     push       edi
   1033     mov        edi, [esp + 8 + 4]   // dst_ptr
   1034     mov        esi, [esp + 8 + 8]   // src_ptr
   1035     mov        edx, [esp + 8 + 12]  // src_stride
   1036     mov        ecx, [esp + 8 + 16]  // dst_width
   1037     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
   1038     sub        edi, esi
   1039     shr        eax, 1
   1040     cmp        eax, 0
   1041     je         xloop1
   1042     cmp        eax, 64
   1043     je         xloop2
   1044     movd       xmm0, eax  // high fraction 0..127
   1045     neg        eax
   1046     add        eax, 128
   1047     movd       xmm5, eax  // low fraction 128..1
   1048     punpcklbw  xmm5, xmm0
   1049     punpcklwd  xmm5, xmm5
   1050     pshufd     xmm5, xmm5, 0
   1051 
   1052     align      16
   1053   xloop:
   1054     movdqa     xmm0, [esi]
   1055     movdqa     xmm2, [esi + edx]
   1056     movdqa     xmm1, xmm0
   1057     punpcklbw  xmm0, xmm2
   1058     punpckhbw  xmm1, xmm2
   1059     pmaddubsw  xmm0, xmm5
   1060     pmaddubsw  xmm1, xmm5
   1061     psrlw      xmm0, 7
   1062     psrlw      xmm1, 7
   1063     packuswb   xmm0, xmm1
   1064     sub        ecx, 16
   1065     movdqa     [esi + edi], xmm0
   1066     lea        esi, [esi + 16]
   1067     jg         xloop
   1068 
   1069     punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
   1070     pshufhw    xmm0, xmm0, 0xff
   1071     punpckhqdq xmm0, xmm0
   1072     movdqa     [esi + edi], xmm0
   1073 
   1074     pop        edi
   1075     pop        esi
   1076     ret
   1077 
   1078     align      16
   1079   xloop1:
   1080     movdqa     xmm0, [esi]
   1081     sub        ecx, 16
   1082     movdqa     [esi + edi], xmm0
   1083     lea        esi, [esi + 16]
   1084     jg         xloop1
   1085 
   1086     punpckhbw  xmm0, xmm0
   1087     pshufhw    xmm0, xmm0, 0xff
   1088     punpckhqdq xmm0, xmm0
   1089     movdqa     [esi + edi], xmm0
   1090     pop        edi
   1091     pop        esi
   1092     ret
   1093 
   1094     align      16
   1095   xloop2:
   1096     movdqa     xmm0, [esi]
   1097     pavgb      xmm0, [esi + edx]
   1098     sub        ecx, 16
   1099     movdqa     [esi + edi], xmm0
   1100     lea        esi, [esi + 16]
   1101     jg         xloop2
   1102 
   1103     punpckhbw  xmm0, xmm0
   1104     pshufhw    xmm0, xmm0, 0xff
   1105     punpckhqdq xmm0, xmm0
   1106     movdqa     [esi + edi], xmm0
   1107     pop        edi
   1108     pop        esi
   1109     ret
   1110   }
   1111 }
   1112 
   1113 #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
   1114 
   1115 // GCC versions of row functions are verbatim conversions from Visual C.
   1116 // Generated using gcc disassembly on Visual C object file:
   1117 // objdump -D yuvscaler.obj >yuvscaler.txt
   1118 #define HAS_SCALEROWDOWN2_SSE2
   1119 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1120                                uint8* dst_ptr, int dst_width) {
   1121   asm volatile (
   1122     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1123     "psrlw     $0x8,%%xmm5                     \n"
   1124     ".p2align  4                               \n"
   1125   "1:                                          \n"
   1126     "movdqa    (%0),%%xmm0                     \n"
   1127     "movdqa    0x10(%0),%%xmm1                 \n"
   1128     "lea       0x20(%0),%0                     \n"
   1129     "pand      %%xmm5,%%xmm0                   \n"
   1130     "pand      %%xmm5,%%xmm1                   \n"
   1131     "packuswb  %%xmm1,%%xmm0                   \n"
   1132     "movdqa    %%xmm0,(%1)                     \n"
   1133     "lea       0x10(%1),%1                     \n"
   1134     "sub       $0x10,%2                        \n"
   1135     "jg        1b                              \n"
   1136   : "+r"(src_ptr),    // %0
   1137     "+r"(dst_ptr),    // %1
   1138     "+r"(dst_width)   // %2
   1139   :
   1140   : "memory", "cc"
   1141 #if defined(__SSE2__)
   1142     , "xmm0", "xmm1", "xmm5"
   1143 #endif
   1144   );
   1145 }
   1146 
   1147 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1148                            uint8* dst_ptr, int dst_width) {
   1149   asm volatile (
   1150     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1151     "psrlw     $0x8,%%xmm5                     \n"
   1152     ".p2align  4                               \n"
   1153   "1:                                          \n"
   1154     "movdqa    (%0),%%xmm0                     \n"
   1155     "movdqa    0x10(%0),%%xmm1                 \n"
   1156     "movdqa    (%0,%3,1),%%xmm2                \n"
   1157     "movdqa    0x10(%0,%3,1),%%xmm3            \n"
   1158     "lea       0x20(%0),%0                     \n"
   1159     "pavgb     %%xmm2,%%xmm0                   \n"
   1160     "pavgb     %%xmm3,%%xmm1                   \n"
   1161     "movdqa    %%xmm0,%%xmm2                   \n"
   1162     "psrlw     $0x8,%%xmm0                     \n"
   1163     "movdqa    %%xmm1,%%xmm3                   \n"
   1164     "psrlw     $0x8,%%xmm1                     \n"
   1165     "pand      %%xmm5,%%xmm2                   \n"
   1166     "pand      %%xmm5,%%xmm3                   \n"
   1167     "pavgw     %%xmm2,%%xmm0                   \n"
   1168     "pavgw     %%xmm3,%%xmm1                   \n"
   1169     "packuswb  %%xmm1,%%xmm0                   \n"
   1170     "movdqa    %%xmm0,(%1)                     \n"
   1171     "lea       0x10(%1),%1                     \n"
   1172     "sub       $0x10,%2                        \n"
   1173     "jg        1b                              \n"
   1174   : "+r"(src_ptr),    // %0
   1175     "+r"(dst_ptr),    // %1
   1176     "+r"(dst_width)   // %2
   1177   : "r"(static_cast<intptr_t>(src_stride))   // %3
   1178   : "memory", "cc"
   1179 #if defined(__SSE2__)
   1180     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   1181 #endif
   1182   );
   1183 }
   1184 static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
   1185                                          ptrdiff_t src_stride,
   1186                                          uint8* dst_ptr, int dst_width) {
   1187   asm volatile (
   1188     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1189     "psrlw     $0x8,%%xmm5                     \n"
   1190     ".p2align  4                               \n"
   1191   "1:                                          \n"
   1192     "movdqu    (%0),%%xmm0                     \n"
   1193     "movdqu    0x10(%0),%%xmm1                 \n"
   1194     "lea       0x20(%0),%0                     \n"
   1195     "pand      %%xmm5,%%xmm0                   \n"
   1196     "pand      %%xmm5,%%xmm1                   \n"
   1197     "packuswb  %%xmm1,%%xmm0                   \n"
   1198     "movdqu    %%xmm0,(%1)                     \n"
   1199     "lea       0x10(%1),%1                     \n"
   1200     "sub       $0x10,%2                        \n"
   1201     "jg        1b                              \n"
   1202   : "+r"(src_ptr),    // %0
   1203     "+r"(dst_ptr),    // %1
   1204     "+r"(dst_width)   // %2
   1205   :
   1206   : "memory", "cc"
   1207 #if defined(__SSE2__)
   1208     , "xmm0", "xmm1", "xmm5"
   1209 #endif
   1210   );
   1211 }
   1212 
   1213 static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
   1214                                             ptrdiff_t src_stride,
   1215                                             uint8* dst_ptr, int dst_width) {
   1216   asm volatile (
   1217     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1218     "psrlw     $0x8,%%xmm5                     \n"
   1219     ".p2align  4                               \n"
   1220   "1:                                          \n"
   1221     "movdqu    (%0),%%xmm0                     \n"
   1222     "movdqu    0x10(%0),%%xmm1                 \n"
   1223     "movdqu    (%0,%3,1),%%xmm2                \n"
   1224     "movdqu    0x10(%0,%3,1),%%xmm3            \n"
   1225     "lea       0x20(%0),%0                     \n"
   1226     "pavgb     %%xmm2,%%xmm0                   \n"
   1227     "pavgb     %%xmm3,%%xmm1                   \n"
   1228     "movdqa    %%xmm0,%%xmm2                   \n"
   1229     "psrlw     $0x8,%%xmm0                     \n"
   1230     "movdqa    %%xmm1,%%xmm3                   \n"
   1231     "psrlw     $0x8,%%xmm1                     \n"
   1232     "pand      %%xmm5,%%xmm2                   \n"
   1233     "pand      %%xmm5,%%xmm3                   \n"
   1234     "pavgw     %%xmm2,%%xmm0                   \n"
   1235     "pavgw     %%xmm3,%%xmm1                   \n"
   1236     "packuswb  %%xmm1,%%xmm0                   \n"
   1237     "movdqu    %%xmm0,(%1)                     \n"
   1238     "lea       0x10(%1),%1                     \n"
   1239     "sub       $0x10,%2                        \n"
   1240     "jg        1b                              \n"
   1241   : "+r"(src_ptr),    // %0
   1242     "+r"(dst_ptr),    // %1
   1243     "+r"(dst_width)   // %2
   1244   : "r"(static_cast<intptr_t>(src_stride))   // %3
   1245   : "memory", "cc"
   1246 #if defined(__SSE2__)
   1247     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   1248 #endif
   1249   );
   1250 }
   1251 
   1252 #define HAS_SCALEROWDOWN4_SSE2
   1253 static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1254                                uint8* dst_ptr, int dst_width) {
   1255   asm volatile (
   1256     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1257     "psrld     $0x18,%%xmm5                    \n"
   1258     ".p2align  4                               \n"
   1259   "1:                                          \n"
   1260     "movdqa    (%0),%%xmm0                     \n"
   1261     "movdqa    0x10(%0),%%xmm1                 \n"
   1262     "lea       0x20(%0),%0                     \n"
   1263     "pand      %%xmm5,%%xmm0                   \n"
   1264     "pand      %%xmm5,%%xmm1                   \n"
   1265     "packuswb  %%xmm1,%%xmm0                   \n"
   1266     "packuswb  %%xmm0,%%xmm0                   \n"
   1267     "movq      %%xmm0,(%1)                     \n"
   1268     "lea       0x8(%1),%1                      \n"
   1269     "sub       $0x8,%2                         \n"
   1270     "jg        1b                              \n"
   1271   : "+r"(src_ptr),    // %0
   1272     "+r"(dst_ptr),    // %1
   1273     "+r"(dst_width)   // %2
   1274   :
   1275   : "memory", "cc"
   1276 #if defined(__SSE2__)
   1277     , "xmm0", "xmm1", "xmm5"
   1278 #endif
   1279   );
   1280 }
   1281 
   1282 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1283                                   uint8* dst_ptr, int dst_width) {
   1284   intptr_t stridex3 = 0;
   1285   asm volatile (
   1286     "pcmpeqb   %%xmm7,%%xmm7                   \n"
   1287     "psrlw     $0x8,%%xmm7                     \n"
   1288     "lea       (%4,%4,2),%3                    \n"
   1289     ".p2align  4                               \n"
   1290   "1:                                          \n"
   1291     "movdqa    (%0),%%xmm0                     \n"
   1292     "movdqa    0x10(%0),%%xmm1                 \n"
   1293     "movdqa    (%0,%4,1),%%xmm2                \n"
   1294     "movdqa    0x10(%0,%4,1),%%xmm3            \n"
   1295     "pavgb     %%xmm2,%%xmm0                   \n"
   1296     "pavgb     %%xmm3,%%xmm1                   \n"
   1297     "movdqa    (%0,%4,2),%%xmm2                \n"
   1298     "movdqa    0x10(%0,%4,2),%%xmm3            \n"
   1299     "movdqa    (%0,%3,1),%%xmm4                \n"
   1300     "movdqa    0x10(%0,%3,1),%%xmm5            \n"
   1301     "lea       0x20(%0),%0                     \n"
   1302     "pavgb     %%xmm4,%%xmm2                   \n"
   1303     "pavgb     %%xmm2,%%xmm0                   \n"
   1304     "pavgb     %%xmm5,%%xmm3                   \n"
   1305     "pavgb     %%xmm3,%%xmm1                   \n"
   1306     "movdqa    %%xmm0,%%xmm2                   \n"
   1307     "psrlw     $0x8,%%xmm0                     \n"
   1308     "movdqa    %%xmm1,%%xmm3                   \n"
   1309     "psrlw     $0x8,%%xmm1                     \n"
   1310     "pand      %%xmm7,%%xmm2                   \n"
   1311     "pand      %%xmm7,%%xmm3                   \n"
   1312     "pavgw     %%xmm2,%%xmm0                   \n"
   1313     "pavgw     %%xmm3,%%xmm1                   \n"
   1314     "packuswb  %%xmm1,%%xmm0                   \n"
   1315     "movdqa    %%xmm0,%%xmm2                   \n"
   1316     "psrlw     $0x8,%%xmm0                     \n"
   1317     "pand      %%xmm7,%%xmm2                   \n"
   1318     "pavgw     %%xmm2,%%xmm0                   \n"
   1319     "packuswb  %%xmm0,%%xmm0                   \n"
   1320     "movq      %%xmm0,(%1)                     \n"
   1321     "lea       0x8(%1),%1                      \n"
   1322     "sub       $0x8,%2                         \n"
   1323     "jg        1b                              \n"
   1324   : "+r"(src_ptr),     // %0
   1325     "+r"(dst_ptr),     // %1
   1326     "+r"(dst_width),   // %2
   1327     "+r"(stridex3)     // %3
   1328   : "r"(static_cast<intptr_t>(src_stride))    // %4
   1329   : "memory", "cc"
   1330 #if defined(__SSE2__)
   1331     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
   1332 #endif
   1333   );
   1334 }
   1335 
   1336 #define HAS_SCALEROWDOWN8_SSE2
   1337 static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1338                                uint8* dst_ptr, int dst_width) {
   1339   asm volatile (
   1340     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   1341     "psrlq     $0x38,%%xmm5                    \n"
   1342     ".p2align  4                               \n"
   1343   "1:                                          \n"
   1344     "movdqa    (%0),%%xmm0                     \n"
   1345     "movdqa    0x10(%0),%%xmm1                 \n"
   1346     "lea       0x20(%0),%0                     \n"
   1347     "pand      %%xmm5,%%xmm0                   \n"
   1348     "pand      %%xmm5,%%xmm1                   \n"
   1349     "packuswb  %%xmm1,%%xmm0                   \n"
   1350     "packuswb  %%xmm0,%%xmm0                   \n"
   1351     "packuswb  %%xmm0,%%xmm0                   \n"
   1352     "movd      %%xmm0,(%1)                     \n"
   1353     "lea       0x4(%1),%1                      \n"
   1354     "sub       $0x4,%2                         \n"
   1355     "jg        1b                              \n"
   1356   : "+r"(src_ptr),    // %0
   1357     "+r"(dst_ptr),    // %1
   1358     "+r"(dst_width)   // %2
   1359   :
   1360   : "memory", "cc"
   1361 #if defined(__SSE2__)
   1362     , "xmm0", "xmm1", "xmm5"
   1363 #endif
   1364   );
   1365 }
   1366 
   1367 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1368                                   uint8* dst_ptr, int dst_width) {
   1369   intptr_t stridex3 = 0;
   1370   intptr_t row4 = 0;
   1371   asm volatile (
   1372     "lea       (%5,%5,2),%3                    \n"
   1373     "pxor      %%xmm7,%%xmm7                   \n"
   1374     ".p2align  4                               \n"
   1375   "1:                                          \n"
   1376     "movdqa    (%0),%%xmm0                     \n"
   1377     "movdqa    0x10(%0),%%xmm1                 \n"
   1378     "movdqa    (%0,%5,1),%%xmm2                \n"
   1379     "movdqa    0x10(%0,%5,1),%%xmm3            \n"
   1380     "pavgb     %%xmm2,%%xmm0                   \n"
   1381     "pavgb     %%xmm3,%%xmm1                   \n"
   1382     "movdqa    (%0,%5,2),%%xmm2                \n"
   1383     "movdqa    0x10(%0,%5,2),%%xmm3            \n"
   1384     "movdqa    (%0,%3,1),%%xmm4                \n"
   1385     "movdqa    0x10(%0,%3,1),%%xmm5            \n"
   1386     "lea       (%0,%5,4),%4                    \n"
   1387     "lea       0x20(%0),%0                     \n"
   1388     "pavgb     %%xmm4,%%xmm2                   \n"
   1389     "pavgb     %%xmm5,%%xmm3                   \n"
   1390     "pavgb     %%xmm2,%%xmm0                   \n"
   1391     "pavgb     %%xmm3,%%xmm1                   \n"
   1392     "movdqa    0x0(%4),%%xmm2                  \n"
   1393     "movdqa    0x10(%4),%%xmm3                 \n"
   1394     "movdqa    0x0(%4,%5,1),%%xmm4             \n"
   1395     "movdqa    0x10(%4,%5,1),%%xmm5            \n"
   1396     "pavgb     %%xmm4,%%xmm2                   \n"
   1397     "pavgb     %%xmm5,%%xmm3                   \n"
   1398     "movdqa    0x0(%4,%5,2),%%xmm4             \n"
   1399     "movdqa    0x10(%4,%5,2),%%xmm5            \n"
   1400     "movdqa    0x0(%4,%3,1),%%xmm6             \n"
   1401     "pavgb     %%xmm6,%%xmm4                   \n"
   1402     "movdqa    0x10(%4,%3,1),%%xmm6            \n"
   1403     "pavgb     %%xmm6,%%xmm5                   \n"
   1404     "pavgb     %%xmm4,%%xmm2                   \n"
   1405     "pavgb     %%xmm5,%%xmm3                   \n"
   1406     "pavgb     %%xmm2,%%xmm0                   \n"
   1407     "pavgb     %%xmm3,%%xmm1                   \n"
   1408     "psadbw    %%xmm7,%%xmm0                   \n"
   1409     "psadbw    %%xmm7,%%xmm1                   \n"
   1410     "pshufd    $0xd8,%%xmm0,%%xmm0             \n"
   1411     "pshufd    $0x8d,%%xmm1,%%xmm1             \n"
   1412     "por       %%xmm1,%%xmm0                   \n"
   1413     "psrlw     $0x3,%%xmm0                     \n"
   1414     "packuswb  %%xmm0,%%xmm0                   \n"
   1415     "packuswb  %%xmm0,%%xmm0                   \n"
   1416     "movd      %%xmm0,(%1)                     \n"
   1417     "lea       0x4(%1),%1                      \n"
   1418     "sub       $0x4,%2                         \n"
   1419     "jg        1b                              \n"
   1420   : "+r"(src_ptr),     // %0
   1421     "+r"(dst_ptr),     // %1
   1422     "+rm"(dst_width),  // %2
   1423     "+r"(stridex3),    // %3
   1424     "+r"(row4)         // %4
   1425   : "r"(static_cast<intptr_t>(src_stride))  // %5
   1426   : "memory", "cc"
   1427 #if defined(__SSE2__)
   1428     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   1429 #endif
   1430   );
   1431 }
   1432 
   1433 #define HAS_SCALEROWDOWN34_SSSE3
   1434 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   1435                                  uint8* dst_ptr, int dst_width) {
   1436   asm volatile (
   1437     "movdqa    %0,%%xmm3                       \n"
   1438     "movdqa    %1,%%xmm4                       \n"
   1439     "movdqa    %2,%%xmm5                       \n"
   1440   :
   1441   : "m"(kShuf0),  // %0
   1442     "m"(kShuf1),  // %1
   1443     "m"(kShuf2)   // %2
   1444   );
   1445   asm volatile (
   1446     ".p2align  4                               \n"
   1447   "1:                                          \n"
   1448     "movdqa    (%0),%%xmm0                     \n"
   1449     "movdqa    0x10(%0),%%xmm2                 \n"
   1450     "lea       0x20(%0),%0                     \n"
   1451     "movdqa    %%xmm2,%%xmm1                   \n"
   1452     "palignr   $0x8,%%xmm0,%%xmm1              \n"
   1453     "pshufb    %%xmm3,%%xmm0                   \n"
   1454     "pshufb    %%xmm4,%%xmm1                   \n"
   1455     "pshufb    %%xmm5,%%xmm2                   \n"
   1456     "movq      %%xmm0,(%1)                     \n"
   1457     "movq      %%xmm1,0x8(%1)                  \n"
   1458     "movq      %%xmm2,0x10(%1)                 \n"
   1459     "lea       0x18(%1),%1                     \n"
   1460     "sub       $0x18,%2                        \n"
   1461     "jg        1b                              \n"
   1462   : "+r"(src_ptr),   // %0
   1463     "+r"(dst_ptr),   // %1
   1464     "+r"(dst_width)  // %2
   1465   :
   1466   : "memory", "cc"
   1467 #if defined(__SSE2__)
   1468     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1469 #endif
   1470   );
   1471 }
   1472 
   1473 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
   1474                                        ptrdiff_t src_stride,
   1475                                        uint8* dst_ptr, int dst_width) {
   1476   asm volatile (
   1477     "movdqa    %0,%%xmm2                       \n"  // kShuf01
   1478     "movdqa    %1,%%xmm3                       \n"  // kShuf11
   1479     "movdqa    %2,%%xmm4                       \n"  // kShuf21
   1480   :
   1481   : "m"(kShuf01),  // %0
   1482     "m"(kShuf11),  // %1
   1483     "m"(kShuf21)   // %2
   1484   );
   1485   asm volatile (
   1486     "movdqa    %0,%%xmm5                       \n"  // kMadd01
   1487     "movdqa    %1,%%xmm0                       \n"  // kMadd11
   1488     "movdqa    %2,%%xmm1                       \n"  // kRound34
   1489   :
   1490   : "m"(kMadd01),  // %0
   1491     "m"(kMadd11),  // %1
   1492     "m"(kRound34)  // %2
   1493   );
   1494   asm volatile (
   1495     ".p2align  4                               \n"
   1496   "1:                                          \n"
   1497     "movdqa    (%0),%%xmm6                     \n"
   1498     "movdqa    (%0,%3),%%xmm7                  \n"
   1499     "pavgb     %%xmm7,%%xmm6                   \n"
   1500     "pshufb    %%xmm2,%%xmm6                   \n"
   1501     "pmaddubsw %%xmm5,%%xmm6                   \n"
   1502     "paddsw    %%xmm1,%%xmm6                   \n"
   1503     "psrlw     $0x2,%%xmm6                     \n"
   1504     "packuswb  %%xmm6,%%xmm6                   \n"
   1505     "movq      %%xmm6,(%1)                     \n"
   1506     "movdqu    0x8(%0),%%xmm6                  \n"
   1507     "movdqu    0x8(%0,%3),%%xmm7               \n"
   1508     "pavgb     %%xmm7,%%xmm6                   \n"
   1509     "pshufb    %%xmm3,%%xmm6                   \n"
   1510     "pmaddubsw %%xmm0,%%xmm6                   \n"
   1511     "paddsw    %%xmm1,%%xmm6                   \n"
   1512     "psrlw     $0x2,%%xmm6                     \n"
   1513     "packuswb  %%xmm6,%%xmm6                   \n"
   1514     "movq      %%xmm6,0x8(%1)                  \n"
   1515     "movdqa    0x10(%0),%%xmm6                 \n"
   1516     "movdqa    0x10(%0,%3),%%xmm7              \n"
   1517     "lea       0x20(%0),%0                     \n"
   1518     "pavgb     %%xmm7,%%xmm6                   \n"
   1519     "pshufb    %%xmm4,%%xmm6                   \n"
   1520     "pmaddubsw %4,%%xmm6                       \n"
   1521     "paddsw    %%xmm1,%%xmm6                   \n"
   1522     "psrlw     $0x2,%%xmm6                     \n"
   1523     "packuswb  %%xmm6,%%xmm6                   \n"
   1524     "movq      %%xmm6,0x10(%1)                 \n"
   1525     "lea       0x18(%1),%1                     \n"
   1526     "sub       $0x18,%2                        \n"
   1527     "jg        1b                              \n"
   1528   : "+r"(src_ptr),   // %0
   1529     "+r"(dst_ptr),   // %1
   1530     "+r"(dst_width)  // %2
   1531   : "r"(static_cast<intptr_t>(src_stride)),  // %3
   1532     "m"(kMadd21)     // %4
   1533   : "memory", "cc"
   1534 #if defined(__SSE2__)
   1535     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   1536 #endif
   1537   );
   1538 }
   1539 
   1540 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
   1541                                        ptrdiff_t src_stride,
   1542                                        uint8* dst_ptr, int dst_width) {
   1543   asm volatile (
   1544     "movdqa    %0,%%xmm2                       \n"  // kShuf01
   1545     "movdqa    %1,%%xmm3                       \n"  // kShuf11
   1546     "movdqa    %2,%%xmm4                       \n"  // kShuf21
   1547   :
   1548   : "m"(kShuf01),  // %0
   1549     "m"(kShuf11),  // %1
   1550     "m"(kShuf21)   // %2
   1551   );
   1552   asm volatile (
   1553     "movdqa    %0,%%xmm5                       \n"  // kMadd01
   1554     "movdqa    %1,%%xmm0                       \n"  // kMadd11
   1555     "movdqa    %2,%%xmm1                       \n"  // kRound34
   1556   :
   1557   : "m"(kMadd01),  // %0
   1558     "m"(kMadd11),  // %1
   1559     "m"(kRound34)  // %2
   1560   );
   1561 
   1562   asm volatile (
   1563     ".p2align  4                               \n"
   1564   "1:                                          \n"
   1565     "movdqa    (%0),%%xmm6                     \n"
   1566     "movdqa    (%0,%3,1),%%xmm7                \n"
   1567     "pavgb     %%xmm6,%%xmm7                   \n"
   1568     "pavgb     %%xmm7,%%xmm6                   \n"
   1569     "pshufb    %%xmm2,%%xmm6                   \n"
   1570     "pmaddubsw %%xmm5,%%xmm6                   \n"
   1571     "paddsw    %%xmm1,%%xmm6                   \n"
   1572     "psrlw     $0x2,%%xmm6                     \n"
   1573     "packuswb  %%xmm6,%%xmm6                   \n"
   1574     "movq      %%xmm6,(%1)                     \n"
   1575     "movdqu    0x8(%0),%%xmm6                  \n"
   1576     "movdqu    0x8(%0,%3,1),%%xmm7             \n"
   1577     "pavgb     %%xmm6,%%xmm7                   \n"
   1578     "pavgb     %%xmm7,%%xmm6                   \n"
   1579     "pshufb    %%xmm3,%%xmm6                   \n"
   1580     "pmaddubsw %%xmm0,%%xmm6                   \n"
   1581     "paddsw    %%xmm1,%%xmm6                   \n"
   1582     "psrlw     $0x2,%%xmm6                     \n"
   1583     "packuswb  %%xmm6,%%xmm6                   \n"
   1584     "movq      %%xmm6,0x8(%1)                  \n"
   1585     "movdqa    0x10(%0),%%xmm6                 \n"
   1586     "movdqa    0x10(%0,%3,1),%%xmm7            \n"
   1587     "lea       0x20(%0),%0                     \n"
   1588     "pavgb     %%xmm6,%%xmm7                   \n"
   1589     "pavgb     %%xmm7,%%xmm6                   \n"
   1590     "pshufb    %%xmm4,%%xmm6                   \n"
   1591     "pmaddubsw %4,%%xmm6                       \n"
   1592     "paddsw    %%xmm1,%%xmm6                   \n"
   1593     "psrlw     $0x2,%%xmm6                     \n"
   1594     "packuswb  %%xmm6,%%xmm6                   \n"
   1595     "movq      %%xmm6,0x10(%1)                 \n"
   1596     "lea       0x18(%1),%1                     \n"
   1597     "sub       $0x18,%2                        \n"
   1598     "jg        1b                              \n"
   1599     : "+r"(src_ptr),   // %0
   1600       "+r"(dst_ptr),   // %1
   1601       "+r"(dst_width)  // %2
   1602     : "r"(static_cast<intptr_t>(src_stride)),  // %3
   1603       "m"(kMadd21)     // %4
   1604     : "memory", "cc"
   1605 #if defined(__SSE2__)
   1606     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   1607 #endif
   1608   );
   1609 }
   1610 
   1611 #define HAS_SCALEROWDOWN38_SSSE3
   1612 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   1613                                  uint8* dst_ptr, int dst_width) {
   1614   asm volatile (
   1615     "movdqa    %3,%%xmm4                       \n"
   1616     "movdqa    %4,%%xmm5                       \n"
   1617     ".p2align  4                               \n"
   1618   "1:                                          \n"
   1619     "movdqa    (%0),%%xmm0                     \n"
   1620     "movdqa    0x10(%0),%%xmm1                 \n"
   1621     "lea       0x20(%0),%0                     \n"
   1622     "pshufb    %%xmm4,%%xmm0                   \n"
   1623     "pshufb    %%xmm5,%%xmm1                   \n"
   1624     "paddusb   %%xmm1,%%xmm0                   \n"
   1625     "movq      %%xmm0,(%1)                     \n"
   1626     "movhlps   %%xmm0,%%xmm1                   \n"
   1627     "movd      %%xmm1,0x8(%1)                  \n"
   1628     "lea       0xc(%1),%1                      \n"
   1629     "sub       $0xc,%2                         \n"
   1630     "jg        1b                              \n"
   1631   : "+r"(src_ptr),   // %0
   1632     "+r"(dst_ptr),   // %1
   1633     "+r"(dst_width)  // %2
   1634   : "m"(kShuf38a),   // %3
   1635     "m"(kShuf38b)    // %4
   1636   : "memory", "cc"
   1637 #if defined(__SSE2__)
    1638     , "xmm0", "xmm1", "xmm4", "xmm5"
   1639 #endif
   1640   );
   1641 }
   1642 
   1643 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
   1644                                        ptrdiff_t src_stride,
   1645                                        uint8* dst_ptr, int dst_width) {
   1646   asm volatile (
   1647     "movdqa    %0,%%xmm2                       \n"
   1648     "movdqa    %1,%%xmm3                       \n"
   1649     "movdqa    %2,%%xmm4                       \n"
   1650     "movdqa    %3,%%xmm5                       \n"
   1651   :
   1652   : "m"(kShufAb0),   // %0
   1653     "m"(kShufAb1),   // %1
   1654     "m"(kShufAb2),   // %2
   1655     "m"(kScaleAb2)   // %3
   1656   );
   1657   asm volatile (
   1658     ".p2align  4                               \n"
   1659   "1:                                          \n"
   1660     "movdqa    (%0),%%xmm0                     \n"
   1661     "pavgb     (%0,%3,1),%%xmm0                \n"
   1662     "lea       0x10(%0),%0                     \n"
   1663     "movdqa    %%xmm0,%%xmm1                   \n"
   1664     "pshufb    %%xmm2,%%xmm1                   \n"
   1665     "movdqa    %%xmm0,%%xmm6                   \n"
   1666     "pshufb    %%xmm3,%%xmm6                   \n"
   1667     "paddusw   %%xmm6,%%xmm1                   \n"
   1668     "pshufb    %%xmm4,%%xmm0                   \n"
   1669     "paddusw   %%xmm0,%%xmm1                   \n"
   1670     "pmulhuw   %%xmm5,%%xmm1                   \n"
   1671     "packuswb  %%xmm1,%%xmm1                   \n"
   1672     "sub       $0x6,%2                         \n"
   1673     "movd      %%xmm1,(%1)                     \n"
   1674     "psrlq     $0x10,%%xmm1                    \n"
   1675     "movd      %%xmm1,0x2(%1)                  \n"
   1676     "lea       0x6(%1),%1                      \n"
   1677     "jg        1b                              \n"
   1678   : "+r"(src_ptr),     // %0
   1679     "+r"(dst_ptr),     // %1
   1680     "+r"(dst_width)    // %2
   1681   : "r"(static_cast<intptr_t>(src_stride))  // %3
   1682   : "memory", "cc"
   1683 #if defined(__SSE2__)
   1684     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   1685 #endif
   1686   );
   1687 }
   1688 
   1689 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
   1690                                        ptrdiff_t src_stride,
   1691                                        uint8* dst_ptr, int dst_width) {
   1692   asm volatile (
   1693     "movdqa    %0,%%xmm2                       \n"
   1694     "movdqa    %1,%%xmm3                       \n"
   1695     "movdqa    %2,%%xmm4                       \n"
   1696     "pxor      %%xmm5,%%xmm5                   \n"
   1697   :
   1698   : "m"(kShufAc),    // %0
   1699     "m"(kShufAc3),   // %1
   1700     "m"(kScaleAc33)  // %2
   1701   );
   1702   asm volatile (
   1703     ".p2align  4                               \n"
   1704   "1:                                          \n"
   1705     "movdqa    (%0),%%xmm0                     \n"
   1706     "movdqa    (%0,%3,1),%%xmm6                \n"
   1707     "movhlps   %%xmm0,%%xmm1                   \n"
   1708     "movhlps   %%xmm6,%%xmm7                   \n"
   1709     "punpcklbw %%xmm5,%%xmm0                   \n"
   1710     "punpcklbw %%xmm5,%%xmm1                   \n"
   1711     "punpcklbw %%xmm5,%%xmm6                   \n"
   1712     "punpcklbw %%xmm5,%%xmm7                   \n"
   1713     "paddusw   %%xmm6,%%xmm0                   \n"
   1714     "paddusw   %%xmm7,%%xmm1                   \n"
   1715     "movdqa    (%0,%3,2),%%xmm6                \n"
   1716     "lea       0x10(%0),%0                     \n"
   1717     "movhlps   %%xmm6,%%xmm7                   \n"
   1718     "punpcklbw %%xmm5,%%xmm6                   \n"
   1719     "punpcklbw %%xmm5,%%xmm7                   \n"
   1720     "paddusw   %%xmm6,%%xmm0                   \n"
   1721     "paddusw   %%xmm7,%%xmm1                   \n"
   1722     "movdqa    %%xmm0,%%xmm6                   \n"
   1723     "psrldq    $0x2,%%xmm0                     \n"
   1724     "paddusw   %%xmm0,%%xmm6                   \n"
   1725     "psrldq    $0x2,%%xmm0                     \n"
   1726     "paddusw   %%xmm0,%%xmm6                   \n"
   1727     "pshufb    %%xmm2,%%xmm6                   \n"
   1728     "movdqa    %%xmm1,%%xmm7                   \n"
   1729     "psrldq    $0x2,%%xmm1                     \n"
   1730     "paddusw   %%xmm1,%%xmm7                   \n"
   1731     "psrldq    $0x2,%%xmm1                     \n"
   1732     "paddusw   %%xmm1,%%xmm7                   \n"
   1733     "pshufb    %%xmm3,%%xmm7                   \n"
   1734     "paddusw   %%xmm7,%%xmm6                   \n"
   1735     "pmulhuw   %%xmm4,%%xmm6                   \n"
   1736     "packuswb  %%xmm6,%%xmm6                   \n"
   1737     "sub       $0x6,%2                         \n"
   1738     "movd      %%xmm6,(%1)                     \n"
   1739     "psrlq     $0x10,%%xmm6                    \n"
   1740     "movd      %%xmm6,0x2(%1)                  \n"
   1741     "lea       0x6(%1),%1                      \n"
   1742     "jg        1b                              \n"
   1743   : "+r"(src_ptr),    // %0
   1744     "+r"(dst_ptr),    // %1
   1745     "+r"(dst_width)   // %2
   1746   : "r"(static_cast<intptr_t>(src_stride))   // %3
   1747   : "memory", "cc"
   1748 #if defined(__SSE2__)
   1749     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   1750 #endif
   1751   );
   1752 }
   1753 
   1754 #define HAS_SCALEADDROWS_SSE2
   1755 static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1756                               uint16* dst_ptr, int src_width, int src_height) {
   1757   int tmp_height = 0;
   1758   intptr_t tmp_src = 0;
   1759   asm volatile (
   1760     "pxor      %%xmm4,%%xmm4                   \n"
   1761     "sub       $0x1,%5                         \n"
   1762     ".p2align  4                               \n"
   1763   "1:                                          \n"
   1764     "movdqa    (%0),%%xmm0                     \n"
   1765     "mov       %0,%3                           \n"
   1766     "add       %6,%0                           \n"
   1767     "movdqa    %%xmm0,%%xmm1                   \n"
   1768     "punpcklbw %%xmm4,%%xmm0                   \n"
   1769     "punpckhbw %%xmm4,%%xmm1                   \n"
   1770     "mov       %5,%2                           \n"
   1771     "test      %2,%2                           \n"
   1772     "je        3f                              \n"
   1773   "2:                                          \n"
   1774     "movdqa    (%0),%%xmm2                     \n"
   1775     "add       %6,%0                           \n"
   1776     "movdqa    %%xmm2,%%xmm3                   \n"
   1777     "punpcklbw %%xmm4,%%xmm2                   \n"
   1778     "punpckhbw %%xmm4,%%xmm3                   \n"
   1779     "paddusw   %%xmm2,%%xmm0                   \n"
   1780     "paddusw   %%xmm3,%%xmm1                   \n"
   1781     "sub       $0x1,%2                         \n"
   1782     "jg        2b                              \n"
   1783   "3:                                          \n"
   1784     "movdqa    %%xmm0,(%1)                     \n"
   1785     "movdqa    %%xmm1,0x10(%1)                 \n"
   1786     "lea       0x10(%3),%0                     \n"
   1787     "lea       0x20(%1),%1                     \n"
   1788     "sub       $0x10,%4                        \n"
   1789     "jg        1b                              \n"
   1790   : "+r"(src_ptr),     // %0
   1791     "+r"(dst_ptr),     // %1
   1792     "+r"(tmp_height),  // %2
   1793     "+r"(tmp_src),     // %3
   1794     "+r"(src_width),   // %4
   1795     "+rm"(src_height)  // %5
   1796   : "rm"(static_cast<intptr_t>(src_stride))  // %6
   1797   : "memory", "cc"
   1798 #if defined(__SSE2__)
   1799     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
   1800 #endif
   1801   );
   1802 }
   1803 
   1804 #ifndef SSE2_DISABLED
   1805 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
   1806 #define HAS_SCALEFILTERROWS_SSE2_DISABLED
   1807 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
   1808                                  const uint8* src_ptr, ptrdiff_t src_stride,
   1809                                  int dst_width, int source_y_fraction) {
   1810   asm volatile (
   1811     "sub       %1,%0                           \n"
   1812     "cmp       $0x0,%3                         \n"
   1813     "je        2f                              \n"
   1814     "cmp       $0x80,%3                        \n"
   1815     "je        3f                              \n"
   1816     "movd      %3,%%xmm5                       \n"
   1817     "punpcklbw %%xmm5,%%xmm5                   \n"
   1818     "punpcklwd %%xmm5,%%xmm5                   \n"
   1819     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
   1820     "pxor      %%xmm4,%%xmm4                   \n"
   1821     ".p2align  4                               \n"
   1822   "1:                                          \n"
   1823     "movdqa    (%1),%%xmm0                     \n"
   1824     "movdqa    (%1,%4,1),%%xmm2                \n"
   1825     "movdqa    %%xmm0,%%xmm1                   \n"
   1826     "movdqa    %%xmm2,%%xmm3                   \n"
   1827     "punpcklbw %%xmm4,%%xmm2                   \n"
   1828     "punpckhbw %%xmm4,%%xmm3                   \n"
   1829     "punpcklbw %%xmm4,%%xmm0                   \n"
   1830     "punpckhbw %%xmm4,%%xmm1                   \n"
   1831     "psubw     %%xmm0,%%xmm2                   \n"
   1832     "psubw     %%xmm1,%%xmm3                   \n"
   1833     "pmulhw    %%xmm5,%%xmm2                   \n"
   1834     "pmulhw    %%xmm5,%%xmm3                   \n"
   1835     "paddw     %%xmm2,%%xmm0                   \n"
   1836     "paddw     %%xmm3,%%xmm1                   \n"
   1837     "packuswb  %%xmm1,%%xmm0                   \n"
   1838     "sub       $0x10,%2                        \n"
   1839     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1840     "lea       0x10(%1),%1                     \n"
   1841     "jg        1b                              \n"
   1842     "jmp       4f                              \n"
   1843     ".p2align  4                               \n"
   1844   "2:                                          \n"
   1845     "movdqa    (%1),%%xmm0                     \n"
   1846     "sub       $0x10,%2                        \n"
   1847     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1848     "lea       0x10(%1),%1                     \n"
   1849     "jg        2b                              \n"
   1850     "jmp       4f                              \n"
   1851     ".p2align  4                               \n"
   1852   "3:                                          \n"
   1853     "movdqa    (%1),%%xmm0                     \n"
   1854     "pavgb     (%1,%4,1),%%xmm0                \n"
   1855     "sub       $0x10,%2                        \n"
   1856     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1857     "lea       0x10(%1),%1                     \n"
   1858     "jg        3b                              \n"
   1859     ".p2align  4                               \n"
   1860   "4:                                          \n"
   1861     "punpckhbw %%xmm0,%%xmm0                   \n"
   1862     "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
   1863     "punpckhqdq %%xmm0,%%xmm0                  \n"
   1864     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1865   : "+r"(dst_ptr),    // %0
   1866     "+r"(src_ptr),    // %1
   1867     "+r"(dst_width),  // %2
   1868     "+r"(source_y_fraction)  // %3
   1869   : "r"(static_cast<intptr_t>(src_stride))  // %4
   1870   : "memory", "cc"
   1871 #if defined(__SSE2__)
   1872     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   1873 #endif
   1874   );
   1875 }
   1876 #endif  // SSE2_DISABLED
   1877 
   1878 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
   1879 #define HAS_SCALEFILTERROWS_SSSE3
   1880 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
   1881                                   const uint8* src_ptr, ptrdiff_t src_stride,
   1882                                   int dst_width, int source_y_fraction) {
   1883   asm volatile (
   1884     "sub       %1,%0                           \n"
   1885     "shr       %3                              \n"
   1886     "cmp       $0x0,%3                         \n"
   1887     "je        2f                              \n"
   1888     "cmp       $0x40,%3                        \n"
   1889     "je        3f                              \n"
   1890     "movd      %3,%%xmm0                       \n"
   1891     "neg       %3                              \n"
   1892     "add       $0x80,%3                        \n"
   1893     "movd      %3,%%xmm5                       \n"
   1894     "punpcklbw %%xmm0,%%xmm5                   \n"
   1895     "punpcklwd %%xmm5,%%xmm5                   \n"
   1896     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
   1897     ".p2align  4                               \n"
   1898   "1:                                          \n"
   1899     "movdqa    (%1),%%xmm0                     \n"
   1900     "movdqa    (%1,%4,1),%%xmm2                \n"
   1901     "movdqa    %%xmm0,%%xmm1                   \n"
   1902     "punpcklbw %%xmm2,%%xmm0                   \n"
   1903     "punpckhbw %%xmm2,%%xmm1                   \n"
   1904     "pmaddubsw %%xmm5,%%xmm0                   \n"
   1905     "pmaddubsw %%xmm5,%%xmm1                   \n"
   1906     "psrlw     $0x7,%%xmm0                     \n"
   1907     "psrlw     $0x7,%%xmm1                     \n"
   1908     "packuswb  %%xmm1,%%xmm0                   \n"
   1909     "sub       $0x10,%2                        \n"
   1910     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1911     "lea       0x10(%1),%1                     \n"
   1912     "jg        1b                              \n"
   1913     "jmp       4f                              \n"
   1914     ".p2align  4                               \n"
   1915   "2:                                          \n"
   1916     "movdqa    (%1),%%xmm0                     \n"
   1917     "sub       $0x10,%2                        \n"
   1918     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1919     "lea       0x10(%1),%1                     \n"
   1920     "jg        2b                              \n"
   1921     "jmp       4f                              \n"
   1922     ".p2align  4                               \n"
   1923   "3:                                          \n"
   1924     "movdqa    (%1),%%xmm0                     \n"
   1925     "pavgb     (%1,%4,1),%%xmm0                \n"
   1926     "sub       $0x10,%2                        \n"
   1927     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1928     "lea       0x10(%1),%1                     \n"
   1929     "jg        3b                              \n"
   1930     ".p2align  4                               \n"
   1931   "4:                                          \n"
   1932     "punpckhbw %%xmm0,%%xmm0                   \n"
   1933     "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
   1934     "punpckhqdq %%xmm0,%%xmm0                  \n"
   1935     "movdqa    %%xmm0,(%1,%0,1)                \n"
   1936   : "+r"(dst_ptr),    // %0
   1937     "+r"(src_ptr),    // %1
   1938     "+r"(dst_width),  // %2
   1939     "+r"(source_y_fraction)  // %3
   1940   : "r"(static_cast<intptr_t>(src_stride))  // %4
   1941   : "memory", "cc"
   1942 #if defined(__SSE2__)
   1943     , "xmm0", "xmm1", "xmm2", "xmm5"
   1944 #endif
   1945   );
   1946 }
   1947 #endif  // defined(__x86_64__) || defined(__i386__)
   1948 
    1949 // CPU-agnostic row functions
   1950 static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
   1951                             uint8* dst, int dst_width) {
   1952   uint8* dend = dst + dst_width - 1;
   1953   do {
   1954     dst[0] = src_ptr[0];
   1955     dst[1] = src_ptr[2];
   1956     dst += 2;
   1957     src_ptr += 4;
   1958   } while (dst < dend);
   1959   if (dst_width & 1) {
   1960     dst[0] = src_ptr[0];
   1961   }
   1962 }
   1963 
   1964 void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
   1965                         uint8* dst, int dst_width) {
   1966   const uint8* s = src_ptr;
   1967   const uint8* t = src_ptr + src_stride;
   1968   uint8* dend = dst + dst_width - 1;
   1969   do {
   1970     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
   1971     dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
   1972     dst += 2;
   1973     s += 4;
   1974     t += 4;
   1975   } while (dst < dend);
   1976   if (dst_width & 1) {
   1977     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
   1978   }
   1979 }
   1980 
   1981 static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
   1982                             uint8* dst, int dst_width) {
   1983   uint8* dend = dst + dst_width - 1;
   1984   do {
   1985     dst[0] = src_ptr[0];
   1986     dst[1] = src_ptr[4];
   1987     dst += 2;
   1988     src_ptr += 8;
   1989   } while (dst < dend);
   1990   if (dst_width & 1) {
   1991     dst[0] = src_ptr[0];
   1992   }
   1993 }
   1994 
   1995 static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
   1996                                uint8* dst, int dst_width) {
   1997   intptr_t stride = src_stride;
   1998   uint8* dend = dst + dst_width - 1;
   1999   do {
   2000     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
   2001              src_ptr[stride + 0] + src_ptr[stride + 1] +
   2002              src_ptr[stride + 2] + src_ptr[stride + 3] +
   2003              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
   2004              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
   2005              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
   2006              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
   2007              8) >> 4;
   2008     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
   2009              src_ptr[stride + 4] + src_ptr[stride + 5] +
   2010              src_ptr[stride + 6] + src_ptr[stride + 7] +
   2011              src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
   2012              src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
   2013              src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
   2014              src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
   2015              8) >> 4;
   2016     dst += 2;
   2017     src_ptr += 8;
   2018   } while (dst < dend);
   2019   if (dst_width & 1) {
   2020     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
   2021              src_ptr[stride + 0] + src_ptr[stride + 1] +
   2022              src_ptr[stride + 2] + src_ptr[stride + 3] +
   2023              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
   2024              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
   2025              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
   2026              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
   2027              8) >> 4;
   2028   }
   2029 }
   2030 
   2031 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
   2032 // Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
   2033 static const int kMaxOutputWidth = 640;
   2034 static const int kMaxRow12 = kMaxOutputWidth * 2;
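         // Rough arithmetic behind these limits (annotation, not library code):
         // 640 output pixels * 8 = 5120 input pixels at 1/8 scale down, and the
         // widest temporary buffer below, src_row[kMaxRow12 * 2] in
         // ScaleRowDown8Int_C, is 640 * 2 * 2 = 2560 bytes, comfortably under
         // the 4096-byte threshold mentioned above.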
   2035 
   2036 static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
   2037                             uint8* dst, int dst_width) {
   2038   uint8* dend = dst + dst_width - 1;
   2039   do {
   2040     dst[0] = src_ptr[0];
   2041     dst[1] = src_ptr[8];
   2042     dst += 2;
   2043     src_ptr += 16;
   2044   } while (dst < dend);
   2045   if (dst_width & 1) {
   2046     dst[0] = src_ptr[0];
   2047   }
   2048 }
   2049 
    2050 // Note: calling code checks that dst_width is at most kMaxOutputWidth and,
    2051 // if not, uses ScaleRowDown8_C instead.
   2052 static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
   2053                                uint8* dst, int dst_width) {
   2054   SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
   2055   assert(dst_width <= kMaxOutputWidth);
   2056   ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
   2057   ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
   2058                      src_row + kMaxOutputWidth,
   2059                      dst_width * 2);
   2060   ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
   2061 }
   2062 
   2063 static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
   2064                              uint8* dst, int dst_width) {
   2065   assert((dst_width % 3 == 0) && (dst_width > 0));
   2066   uint8* dend = dst + dst_width;
   2067   do {
   2068     dst[0] = src_ptr[0];
   2069     dst[1] = src_ptr[1];
   2070     dst[2] = src_ptr[3];
   2071     dst += 3;
   2072     src_ptr += 4;
   2073   } while (dst < dend);
   2074 }
   2075 
   2076 // Filter rows 0 and 1 together, 3 : 1
   2077 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
   2078                                    uint8* d, int dst_width) {
   2079   assert((dst_width % 3 == 0) && (dst_width > 0));
   2080   const uint8* s = src_ptr;
   2081   const uint8* t = src_ptr + src_stride;
   2082   uint8* dend = d + dst_width;
   2083   do {
   2084     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2085     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2086     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2087     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
   2088     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
   2089     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
   2090     d[0] = (a0 * 3 + b0 + 2) >> 2;
   2091     d[1] = (a1 * 3 + b1 + 2) >> 2;
   2092     d[2] = (a2 * 3 + b2 + 2) >> 2;
   2093     d += 3;
   2094     s += 4;
   2095     t += 4;
   2096   } while (d < dend);
   2097 }
   2098 
   2099 // Filter rows 1 and 2 together, 1 : 1
   2100 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
   2101                                    uint8* d, int dst_width) {
   2102   assert((dst_width % 3 == 0) && (dst_width > 0));
   2103   const uint8* s = src_ptr;
   2104   const uint8* t = src_ptr + src_stride;
   2105   uint8* dend = d + dst_width;
   2106   do {
   2107     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2108     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2109     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2110     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
   2111     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
   2112     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
   2113     d[0] = (a0 + b0 + 1) >> 1;
   2114     d[1] = (a1 + b1 + 1) >> 1;
   2115     d[2] = (a2 + b2 + 1) >> 1;
   2116     d += 3;
   2117     s += 4;
   2118     t += 4;
   2119   } while (d < dend);
   2120 }
   2121 
   2122 // (1-f)a + fb can be replaced with a + f(b-a)
   2123 #define BLENDER(a, b, f) (static_cast<int>(a) + \
   2124     ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
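         // Worked example of the identity above (annotation only): with a = 100,
         // b = 200 and f = 0x4000 (0.25 in 16.16 fixed point), BLENDER(a, b, f) is
         // 100 + ((0x4000 * 100) >> 16) = 100 + 25 = 125, the same as
         // (1 - 0.25) * 100 + 0.25 * 200 = 125.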
   2125 
   2126 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
   2127                               int dst_width, int x, int dx) {
   2128   for (int j = 0; j < dst_width - 1; j += 2) {
   2129     int xi = x >> 16;
   2130     int a = src_ptr[xi];
   2131     int b = src_ptr[xi + 1];
   2132     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
   2133     x += dx;
   2134     xi = x >> 16;
   2135     a = src_ptr[xi];
   2136     b = src_ptr[xi + 1];
   2137     dst_ptr[1] = BLENDER(a, b, x & 0xffff);
   2138     x += dx;
   2139     dst_ptr += 2;
   2140   }
   2141   if (dst_width & 1) {
   2142     int xi = x >> 16;
   2143     int a = src_ptr[xi];
   2144     int b = src_ptr[xi + 1];
   2145     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
   2146   }
   2147 }
   2148 
   2149 static const int kMaxInputWidth = 2560;
   2150 
   2151 #if defined(HAS_SCALEFILTERROWS_SSE2)
   2152 // Filter row to 3/4
   2153 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
   2154                                 int dst_width) {
   2155   assert((dst_width % 3 == 0) && (dst_width > 0));
   2156   const uint8* s = src_ptr;
   2157   uint8* dend = dst_ptr + dst_width;
   2158   do {
   2159     dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
   2160     dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
   2161     dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
   2162     dst_ptr += 3;
   2163     s += 4;
   2164   } while (dst_ptr < dend);
   2165 }
   2166 
   2167 #define HAS_SCALEROWDOWN34_SSE2_DISABLED
   2168 // Filter rows 0 and 1 together, 3 : 1
   2169 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
   2170                                       ptrdiff_t src_stride,
   2171                                       uint8* dst_ptr, int dst_width) {
   2172   assert((dst_width % 3 == 0) && (dst_width > 0));
   2173   SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
   2174   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
   2175   ScaleFilterCols34_C(dst_ptr, row, dst_width);
   2176 }
   2177 
   2178 // Filter rows 1 and 2 together, 1 : 1
   2179 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
   2180                                       ptrdiff_t src_stride,
   2181                                       uint8* dst_ptr, int dst_width) {
   2182   assert((dst_width % 3 == 0) && (dst_width > 0));
   2183   SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
   2184   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
   2185   ScaleFilterCols34_C(dst_ptr, row, dst_width);
   2186 }
   2187 #endif
   2188 
   2189 static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
   2190                              uint8* dst, int dst_width) {
   2191   assert(dst_width % 3 == 0);
   2192   for (int x = 0; x < dst_width; x += 3) {
   2193     dst[0] = src_ptr[0];
   2194     dst[1] = src_ptr[3];
   2195     dst[2] = src_ptr[6];
   2196     dst += 3;
   2197     src_ptr += 8;
   2198   }
   2199 }
   2200 
   2201 // 8x3 -> 3x1
   2202 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
   2203                                    ptrdiff_t src_stride,
   2204                                    uint8* dst_ptr, int dst_width) {
   2205   assert((dst_width % 3 == 0) && (dst_width > 0));
   2206   intptr_t stride = src_stride;
   2207   for (int i = 0; i < dst_width; i += 3) {
   2208     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
   2209         src_ptr[stride + 0] + src_ptr[stride + 1] +
   2210         src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
   2211         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
   2212         (65536 / 9) >> 16;
   2213     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
   2214         src_ptr[stride + 3] + src_ptr[stride + 4] +
   2215         src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
   2216         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
   2217         (65536 / 9) >> 16;
   2218     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
   2219         src_ptr[stride + 6] + src_ptr[stride + 7] +
   2220         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
   2221         (65536 / 6) >> 16;
   2222     src_ptr += 8;
   2223     dst_ptr += 3;
   2224   }
   2225 }
   2226 
   2227 // 8x2 -> 3x1
   2228 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
   2229                                    uint8* dst_ptr, int dst_width) {
   2230   assert((dst_width % 3 == 0) && (dst_width > 0));
   2231   intptr_t stride = src_stride;
   2232   for (int i = 0; i < dst_width; i += 3) {
   2233     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
   2234         src_ptr[stride + 0] + src_ptr[stride + 1] +
   2235         src_ptr[stride + 2]) * (65536 / 6) >> 16;
   2236     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
   2237         src_ptr[stride + 3] + src_ptr[stride + 4] +
   2238         src_ptr[stride + 5]) * (65536 / 6) >> 16;
   2239     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
   2240         src_ptr[stride + 6] + src_ptr[stride + 7]) *
   2241         (65536 / 4) >> 16;
   2242     src_ptr += 8;
   2243     dst_ptr += 3;
   2244   }
   2245 }
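         // Annotation on the fixed-point division used by the two functions above:
         // multiplying a sum by (65536 / N) and shifting right by 16 approximates
         // division by N without an integer divide. For example, a full 3x3 box of
         // 255s sums to 2295, and 2295 * (65536 / 9) >> 16 = 2295 * 7281 >> 16 = 254,
         // one below the exact 255 because 65536 / 9 truncates.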
   2246 
   2247 // C version 8x2 -> 8x1
   2248 static void ScaleFilterRows_C(uint8* dst_ptr,
   2249                               const uint8* src_ptr, ptrdiff_t src_stride,
   2250                               int dst_width, int source_y_fraction) {
   2251   assert(dst_width > 0);
   2252   int y1_fraction = source_y_fraction;
   2253   int y0_fraction = 256 - y1_fraction;
   2254   const uint8* src_ptr1 = src_ptr + src_stride;
   2255   uint8* end = dst_ptr + dst_width;
   2256   do {
   2257     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
   2258     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
   2259     dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
   2260     dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
   2261     dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
   2262     dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
   2263     dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
   2264     dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
   2265     src_ptr += 8;
   2266     src_ptr1 += 8;
   2267     dst_ptr += 8;
   2268   } while (dst_ptr < end);
   2269   dst_ptr[0] = dst_ptr[-1];
   2270 }
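         // Example of the vertical blend above (annotation only): if
         // source_y_fraction is 64, then y1_fraction = 64 and y0_fraction = 192,
         // so each output byte is (192 * row0 + 64 * row1) >> 8, a 3/4 : 1/4
         // weighting of the two source rows.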
   2271 
   2272 void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
   2273                     uint16* dst_ptr, int src_width, int src_height) {
   2274   assert(src_width > 0);
   2275   assert(src_height > 0);
   2276   for (int x = 0; x < src_width; ++x) {
   2277     const uint8* s = src_ptr + x;
   2278     int sum = 0;
   2279     for (int y = 0; y < src_height; ++y) {
   2280       sum += s[0];
   2281       s += src_stride;
   2282     }
   2283     dst_ptr[x] = sum;
   2284   }
   2285 }
   2286 
   2287 /**
   2288  * Scale plane, 1/2
   2289  *
   2290  * This is an optimized version for scaling down a plane to 1/2 of
   2291  * its original size.
   2292  *
   2293  */
   2294 static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
   2295                             int dst_width, int dst_height,
   2296                             int src_stride, int dst_stride,
   2297                             const uint8* src_ptr, uint8* dst_ptr,
   2298                             FilterMode filtering) {
   2299   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
   2300                         uint8* dst_ptr, int dst_width) =
   2301       filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
   2302 #if defined(HAS_SCALEROWDOWN2_NEON)
   2303   if (TestCpuFlag(kCpuHasNEON) &&
   2304       IS_ALIGNED(dst_width, 16)) {
   2305     ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
   2306   }
   2307 #elif defined(HAS_SCALEROWDOWN2_SSE2)
   2308   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
   2309     ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
   2310         ScaleRowDown2_Unaligned_SSE2;
   2311     if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
   2312         IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
   2313       ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
   2314     }
   2315   }
   2316 #endif
   2317 
   2318   // TODO(fbarchard): Loop through source height to allow odd height.
   2319   for (int y = 0; y < dst_height; ++y) {
   2320     ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
   2321     src_ptr += (src_stride << 1);
   2322     dst_ptr += dst_stride;
   2323   }
   2324 }
   2325 
   2326 /**
   2327  * Scale plane, 1/4
   2328  *
   2329  * This is an optimized version for scaling down a plane to 1/4 of
   2330  * its original size.
   2331  */
   2332 static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
   2333                             int dst_width, int dst_height,
   2334                             int src_stride, int dst_stride,
   2335                             const uint8* src_ptr, uint8* dst_ptr,
   2336                             FilterMode filtering) {
   2337   void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
   2338                         uint8* dst_ptr, int dst_width) =
   2339       filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
   2340 #if defined(HAS_SCALEROWDOWN4_NEON)
   2341   if (TestCpuFlag(kCpuHasNEON) &&
   2342       IS_ALIGNED(dst_width, 4)) {
   2343     ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
   2344   }
   2345 #elif defined(HAS_SCALEROWDOWN4_SSE2)
   2346   if (TestCpuFlag(kCpuHasSSE2) &&
   2347       IS_ALIGNED(dst_width, 8) &&
   2348       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
   2349     ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
   2350   }
   2351 #endif
   2352 
   2353   for (int y = 0; y < dst_height; ++y) {
   2354     ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
   2355     src_ptr += (src_stride << 2);
   2356     dst_ptr += dst_stride;
   2357   }
   2358 }
   2359 
   2360 /**
   2361  * Scale plane, 1/8
   2362  *
   2363  * This is an optimized version for scaling down a plane to 1/8
   2364  * of its original size.
   2365  *
   2366  */
   2367 static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
   2368                             int dst_width, int dst_height,
   2369                             int src_stride, int dst_stride,
   2370                             const uint8* src_ptr, uint8* dst_ptr,
   2371                             FilterMode filtering) {
   2372   void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
   2373                         uint8* dst_ptr, int dst_width) =
   2374       filtering && (dst_width <= kMaxOutputWidth) ?
   2375       ScaleRowDown8Int_C : ScaleRowDown8_C;
   2376 #if defined(HAS_SCALEROWDOWN8_SSE2)
   2377   if (TestCpuFlag(kCpuHasSSE2) &&
   2378       IS_ALIGNED(dst_width, 4) &&
   2379       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
   2380     ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
   2381   }
   2382 #endif
   2383 
   2384   for (int y = 0; y < dst_height; ++y) {
   2385     ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
   2386     src_ptr += (src_stride << 3);
   2387     dst_ptr += dst_stride;
   2388   }
   2389 }
   2390 
   2391 /**
   2392  * Scale plane down, 3/4
   2393  *
   2394  * Provided by Frank Barchard (fbarchard (at) google.com)
   2395  *
   2396  */
   2397 static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
   2398                              int dst_width, int dst_height,
   2399                              int src_stride, int dst_stride,
   2400                              const uint8* src_ptr, uint8* dst_ptr,
   2401                              FilterMode filtering) {
   2402   assert(dst_width % 3 == 0);
   2403   void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
   2404                            uint8* dst_ptr, int dst_width);
   2405   void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
   2406                            uint8* dst_ptr, int dst_width);
   2407   if (!filtering) {
   2408     ScaleRowDown34_0 = ScaleRowDown34_C;
   2409     ScaleRowDown34_1 = ScaleRowDown34_C;
   2410   } else {
   2411     ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
   2412     ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
   2413   }
   2414 #if defined(HAS_SCALEROWDOWN34_NEON)
   2415   if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
   2416     if (!filtering) {
   2417       ScaleRowDown34_0 = ScaleRowDown34_NEON;
   2418       ScaleRowDown34_1 = ScaleRowDown34_NEON;
   2419     } else {
   2420       ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
   2421       ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
   2422     }
   2423   }
   2424 #endif
   2425 #if defined(HAS_SCALEROWDOWN34_SSE2)
   2426   if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
   2427       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
   2428     ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
   2429     ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
   2430   }
   2431 #endif
   2432 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   2433   if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
   2434       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
   2435     if (!filtering) {
   2436       ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
   2437       ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
   2438     } else {
   2439       ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
   2440       ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
   2441     }
   2442   }
   2443 #endif
   2444 
   2445   for (int y = 0; y < dst_height - 2; y += 3) {
   2446     ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
   2447     src_ptr += src_stride;
   2448     dst_ptr += dst_stride;
   2449     ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
   2450     src_ptr += src_stride;
   2451     dst_ptr += dst_stride;
   2452     ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
   2453                      dst_ptr, dst_width);
   2454     src_ptr += src_stride * 2;
   2455     dst_ptr += dst_stride;
   2456   }
   2457 
   2458   // Remainder 1 or 2 rows with last row vertically unfiltered
   2459   if ((dst_height % 3) == 2) {
   2460     ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
   2461     src_ptr += src_stride;
   2462     dst_ptr += dst_stride;
   2463     ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
   2464   } else if ((dst_height % 3) == 1) {
   2465     ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
   2466   }
   2467 }
   2468 
   2469 /**
   2470  * Scale plane, 3/8
   2471  *
   2472  * This is an optimized version for scaling down a plane to 3/8
   2473  * of its original size.
   2474  *
    2475  * Uses a box filter arranged like this:
   2476  * aaabbbcc -> abc
   2477  * aaabbbcc    def
   2478  * aaabbbcc    ghi
   2479  * dddeeeff
   2480  * dddeeeff
   2481  * dddeeeff
   2482  * ggghhhii
   2483  * ggghhhii
   2484  * Boxes are 3x3, 2x3, 3x2 and 2x2
   2485  */
   2486 static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
   2487                              int dst_width, int dst_height,
   2488                              int src_stride, int dst_stride,
   2489                              const uint8* src_ptr, uint8* dst_ptr,
   2490                              FilterMode filtering) {
   2491   assert(dst_width % 3 == 0);
   2492   void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
   2493                            uint8* dst_ptr, int dst_width);
   2494   void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
   2495                            uint8* dst_ptr, int dst_width);
   2496   if (!filtering) {
   2497     ScaleRowDown38_3 = ScaleRowDown38_C;
   2498     ScaleRowDown38_2 = ScaleRowDown38_C;
   2499   } else {
   2500     ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
   2501     ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
   2502   }
   2503 #if defined(HAS_SCALEROWDOWN38_NEON)
   2504   if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
   2505     if (!filtering) {
   2506       ScaleRowDown38_3 = ScaleRowDown38_NEON;
   2507       ScaleRowDown38_2 = ScaleRowDown38_NEON;
   2508     } else {
   2509       ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
   2510       ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
   2511     }
   2512   }
   2513 #elif defined(HAS_SCALEROWDOWN38_SSSE3)
   2514   if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
   2515       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
   2516     if (!filtering) {
   2517       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
   2518       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
   2519     } else {
   2520       ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
   2521       ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
   2522     }
   2523   }
   2524 #endif
   2525 
   2526   for (int y = 0; y < dst_height - 2; y += 3) {
   2527     ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
   2528     src_ptr += src_stride * 3;
   2529     dst_ptr += dst_stride;
   2530     ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
   2531     src_ptr += src_stride * 3;
   2532     dst_ptr += dst_stride;
   2533     ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
   2534     src_ptr += src_stride * 2;
   2535     dst_ptr += dst_stride;
   2536   }
   2537 
   2538   // Remainder 1 or 2 rows with last row vertically unfiltered
   2539   if ((dst_height % 3) == 2) {
   2540     ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
   2541     src_ptr += src_stride * 3;
   2542     dst_ptr += dst_stride;
   2543     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
   2544   } else if ((dst_height % 3) == 1) {
   2545     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
   2546   }
   2547 }
   2548 
   2549 static __inline uint32 SumBox(int iboxwidth, int iboxheight,
   2550                               ptrdiff_t src_stride, const uint8* src_ptr) {
   2551   assert(iboxwidth > 0);
   2552   assert(iboxheight > 0);
   2553   uint32 sum = 0u;
   2554   for (int y = 0; y < iboxheight; ++y) {
   2555     for (int x = 0; x < iboxwidth; ++x) {
   2556       sum += src_ptr[x];
   2557     }
   2558     src_ptr += src_stride;
   2559   }
   2560   return sum;
   2561 }
   2562 
   2563 static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
   2564                                int x, int dx, ptrdiff_t src_stride,
   2565                                const uint8* src_ptr, uint8* dst_ptr) {
   2566   for (int i = 0; i < dst_width; ++i) {
   2567     int ix = x >> 16;
   2568     x += dx;
   2569     int boxwidth = (x >> 16) - ix;
   2570     *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
   2571         (boxwidth * boxheight);
   2572   }
   2573 }
   2574 
   2575 static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
   2576   assert(iboxwidth > 0);
   2577   uint32 sum = 0u;
   2578   for (int x = 0; x < iboxwidth; ++x) {
   2579     sum += src_ptr[x];
   2580   }
   2581   return sum;
   2582 }
   2583 
   2584 static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
   2585                             const uint16* src_ptr, uint8* dst_ptr) {
   2586   int scaletbl[2];
   2587   int minboxwidth = (dx >> 16);
   2588   scaletbl[0] = 65536 / (minboxwidth * boxheight);
   2589   scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
   2590   int *scaleptr = scaletbl - minboxwidth;
   2591   for (int i = 0; i < dst_width; ++i) {
   2592     int ix = x >> 16;
   2593     x += dx;
   2594     int boxwidth = (x >> 16) - ix;
   2595     *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
   2596   }
   2597 }
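         // Annotation on ScaleAddCols2_C above: when dx has a fractional part the
         // box width alternates between minboxwidth and minboxwidth + 1, e.g.
         // dx = 0x28000 (2.5) gives boxes of 2 and 3 pixels. The offset pointer
         // scaleptr = scaletbl - minboxwidth simply makes scaleptr[boxwidth] pick
         // scaletbl[0] for the narrow box and scaletbl[1] for the wide one.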
   2598 
   2599 static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
   2600                             const uint16* src_ptr, uint8* dst_ptr) {
   2601   int boxwidth = (dx >> 16);
   2602   int scaleval = 65536 / (boxwidth * boxheight);
   2603   for (int i = 0; i < dst_width; ++i) {
   2604     *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
   2605     x += boxwidth;
   2606   }
   2607 }
   2608 
   2609 /**
    2610  * Scale plane down to any dimensions, with interpolation
    2611  * (box filter).
    2612  *
    2613  * Same method as SimpleScale, which is fixed point: one destination
    2614  * pixel is output at a time, using fixed point (16.16) to step
    2615  * through the source, sampling a box of pixels with simple
    2616  * averaging.
   2617  */
   2618 static void ScalePlaneBox(int src_width, int src_height,
   2619                           int dst_width, int dst_height,
   2620                           int src_stride, int dst_stride,
   2621                           const uint8* src_ptr, uint8* dst_ptr) {
   2622   assert(dst_width > 0);
   2623   assert(dst_height > 0);
   2624   int dx = (src_width << 16) / dst_width;
   2625   int dy = (src_height << 16) / dst_height;
   2626   int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
   2627   int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
   2628   int maxy = (src_height << 16);
   2629   if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
   2630       dst_height * 2 > src_height) {
   2631     uint8* dst = dst_ptr;
   2632     for (int j = 0; j < dst_height; ++j) {
   2633       int iy = y >> 16;
   2634       const uint8* src = src_ptr + iy * src_stride;
   2635       y += dy;
   2636       if (y > maxy) {
   2637         y = maxy;
   2638       }
   2639       int boxheight = (y >> 16) - iy;
   2640       ScalePlaneBoxRow_C(dst_width, boxheight,
   2641                          x, dx, src_stride,
   2642                          src, dst);
   2643       dst += dst_stride;
   2644     }
   2645   } else {
   2646     SIMD_ALIGNED(uint16 row[kMaxInputWidth]);
   2647     void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
   2648                          uint16* dst_ptr, int src_width, int src_height) =
   2649         ScaleAddRows_C;
   2650     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
   2651                          const uint16* src_ptr, uint8* dst_ptr);
   2652     if (dx & 0xffff) {
   2653       ScaleAddCols = ScaleAddCols2_C;
   2654     } else {
   2655       ScaleAddCols = ScaleAddCols1_C;
   2656     }
   2657 #if defined(HAS_SCALEADDROWS_SSE2)
   2658     if (TestCpuFlag(kCpuHasSSE2) &&
   2659         IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
   2660       ScaleAddRows = ScaleAddRows_SSE2;
   2661     }
   2662 #endif
   2663 
   2664     for (int j = 0; j < dst_height; ++j) {
   2665       int iy = y >> 16;
   2666       const uint8* src = src_ptr + iy * src_stride;
   2667       y += dy;
   2668       if (y > maxy) {
   2669         y = maxy;
   2670       }
   2671       int boxheight = (y >> 16) - iy;
   2672       ScaleAddRows(src, src_stride, row, src_width, boxheight);
   2673       ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
   2674       dst_ptr += dst_stride;
   2675     }
   2676   }
   2677 }
   2678 
   2679 /**
   2680  * Scale plane to/from any dimensions, with interpolation.
   2681  */
   2682 static void ScalePlaneBilinearSimple(int src_width, int src_height,
   2683                                      int dst_width, int dst_height,
   2684                                      int src_stride, int dst_stride,
   2685                                      const uint8* src_ptr, uint8* dst_ptr) {
   2686   int dx = (src_width << 16) / dst_width;
   2687   int dy = (src_height << 16) / dst_height;
   2688   int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
   2689   int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
   2690   int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   2691   for (int i = 0; i < dst_height; ++i) {
   2692     int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
   2693     int yi = y >> 16;
   2694     int yf = y & 0xffff;
   2695     const uint8* src0 = src_ptr + yi * src_stride;
   2696     const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
   2697     uint8* dst = dst_ptr;
   2698     for (int j = 0; j < dst_width; ++j) {
   2699       int xi = x >> 16;
   2700       int xf = x & 0xffff;
   2701       int x1 = (xi < src_width - 1) ? xi + 1 : xi;
   2702       int a = src0[xi];
   2703       int b = src0[x1];
   2704       int r0 = BLENDER(a, b, xf);
   2705       a = src1[xi];
   2706       b = src1[x1];
   2707       int r1 = BLENDER(a, b, xf);
   2708       *dst++ = BLENDER(r0, r1, yf);
   2709       x += dx;
   2710       if (x > maxx)
   2711         x = maxx;
   2712     }
   2713     dst_ptr += dst_stride;
   2714     y += dy;
   2715     if (y > maxy)
   2716       y = maxy;
   2717   }
   2718 }
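        // Example (assuming BLENDER(a, b, f), defined earlier in this file,
        // computes a + ((b - a) * f >> 16)): with a = 100, b = 200 and
        // xf = 0x4000 (0.25), the horizontal pass yields 125; the vertical
        // pass then blends the two row results the same way using yf.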
   2719 
   2720 /**
   2721  * Scale plane to/from any dimensions, with bilinear
   2722  * interpolation.
   2723  */
   2724 void ScalePlaneBilinear(int src_width, int src_height,
   2725                         int dst_width, int dst_height,
   2726                         int src_stride, int dst_stride,
   2727                         const uint8* src_ptr, uint8* dst_ptr) {
   2728   assert(dst_width > 0);
   2729   assert(dst_height > 0);
   2730   if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
   2731     ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
   2732                              src_stride, dst_stride, src_ptr, dst_ptr);
   2733 
   2734   } else {
   2735     SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]);
   2736     void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
   2737                             ptrdiff_t src_stride,
   2738                             int dst_width, int source_y_fraction) =
   2739         ScaleFilterRows_C;
   2740 #if defined(HAS_SCALEFILTERROWS_NEON)
   2741     if (TestCpuFlag(kCpuHasNEON)) {
   2742       ScaleFilterRows = ScaleFilterRows_NEON;
   2743     }
   2744 #endif
   2745 #if defined(HAS_SCALEFILTERROWS_SSE2)
   2746     if (TestCpuFlag(kCpuHasSSE2) &&
   2747         IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
   2748       ScaleFilterRows = ScaleFilterRows_SSE2;
   2749     }
   2750 #endif
   2751 #if defined(HAS_SCALEFILTERROWS_SSSE3)
   2752     if (TestCpuFlag(kCpuHasSSSE3) &&
   2753         IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
   2754       ScaleFilterRows = ScaleFilterRows_SSSE3;
   2755     }
   2756 #endif
   2757 
   2758     int dx = (src_width << 16) / dst_width;
   2759     int dy = (src_height << 16) / dst_height;
   2760     int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
   2761     int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
   2762     int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   2763     for (int j = 0; j < dst_height; ++j) {
   2764       int yi = y >> 16;
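              // Keep the top 8 bits of the 16-bit fraction: the row filters
              // take an 8-bit source_y_fraction.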
   2765       int yf = (y >> 8) & 255;
   2766       const uint8* src = src_ptr + yi * src_stride;
   2767       ScaleFilterRows(row, src, src_stride, src_width, yf);
   2768       ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
   2769       dst_ptr += dst_stride;
   2770       y += dy;
   2771       if (y > maxy) {
   2772         y = maxy;
   2773       }
   2774     }
   2775   }
   2776 }
   2777 
   2778 /**
   2779  * Scale plane to/from any dimensions, without interpolation.
   2780  * Fixed point math is used for performance: the upper 16 bits
   2781  * of x and dx are the integer part of the source position and
   2782  * the lower 16 bits are the fractional part.
   2783  */
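        // Example: scaling a 4-pixel row to 3 pixels gives
        // dx = (4 << 16) / 3 = 0x15555 (~1.33).  Since dx >= 65536, x starts
        // at (dx >> 1) - 32768 = 0x2AAA (~0.17), so the sampled source
        // indices (x >> 16) are 0, 1 and 2.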
   2784 static void ScalePlaneSimple(int src_width, int src_height,
   2785                              int dst_width, int dst_height,
   2786                              int src_stride, int dst_stride,
   2787                              const uint8* src_ptr, uint8* dst_ptr) {
   2788   int dx = (src_width << 16) / dst_width;
   2789   int dy = (src_height << 16) / dst_height;
   2790   int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
   2791   for (int j = 0; j < dst_height; ++j) {
   2792     int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
   2793     int yi = y >> 16;
   2794     const uint8* src = src_ptr + yi * src_stride;
   2795     uint8* dst = dst_ptr;
   2796     for (int i = 0; i < dst_width; ++i) {
   2797       *dst++ = src[x >> 16];
   2798       x += dx;
   2799     }
   2800     dst_ptr += dst_stride;
   2801     y += dy;
   2802   }
   2803 }
   2804 
   2805 /**
   2806  * Scale plane to/from any dimensions.
   2807  */
   2808 static void ScalePlaneAnySize(int src_width, int src_height,
   2809                               int dst_width, int dst_height,
   2810                               int src_stride, int dst_stride,
   2811                               const uint8* src_ptr, uint8* dst_ptr,
   2812                               FilterMode filtering) {
   2813   if (!filtering) {
   2814     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
   2815                      src_stride, dst_stride, src_ptr, dst_ptr);
   2816   } else {
   2817     // Fall back to the bilinear scaler, which handles arbitrary sizes.
   2818     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
   2819                        src_stride, dst_stride, src_ptr, dst_ptr);
   2820   }
   2821 }
   2822 
   2823 /**
   2824  * Scale plane down to any size.
   2825  *
   2826  * This is an optimized version for scaling down a plane to any size.
   2827  * The current implementation is roughly 10x faster than the
   2828  * reference implementation for e.g. XGA -> LowResPAL.
   2829  *
   2830  */
   2831 static void ScalePlaneDown(int src_width, int src_height,
   2832                            int dst_width, int dst_height,
   2833                            int src_stride, int dst_stride,
   2834                            const uint8* src_ptr, uint8* dst_ptr,
   2835                            FilterMode filtering) {
   2836   if (!filtering) {
   2837     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
   2838                      src_stride, dst_stride, src_ptr, dst_ptr);
   2839   } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
   2840     // Between 1/2x and 1x, use bilinear.
   2841     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
   2842                        src_stride, dst_stride, src_ptr, dst_ptr);
   2843   } else {
   2844     ScalePlaneBox(src_width, src_height, dst_width, dst_height,
   2845                   src_stride, dst_stride, src_ptr, dst_ptr);
   2846   }
   2847 }
   2848 
   2849 // Scale a plane.
   2850 // This function in turn calls a scaling function suitable for the
   2851 // requested source and destination resolutions.
   2852 
   2853 LIBYUV_API
   2854 void ScalePlane(const uint8* src, int src_stride,
   2855                 int src_width, int src_height,
   2856                 uint8* dst, int dst_stride,
   2857                 int dst_width, int dst_height,
   2858                 FilterMode filtering) {
   2859 #ifdef CPU_X86
   2860   // Environment variable override of the filter mode, for testing.
   2861   char *filter_override = getenv("LIBYUV_FILTER");
   2862   if (filter_override) {
   2863     filtering = (FilterMode)atoi(filter_override);  // NOLINT
   2864   }
   2865 #endif
   2866   // Use specialized scales to improve performance for common resolutions.
   2867   // For example, all the 1/2 scalings will use ScalePlaneDown2().
   2868   if (dst_width == src_width && dst_height == src_height) {
   2869     // Straight copy.
   2870     CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
   2871   } else if (dst_width <= src_width && dst_height <= src_height) {
   2872     // Scale down.
   2873     if (use_reference_impl_) {
   2874       // For testing, allow the optimized versions to be disabled.
   2875       ScalePlaneDown(src_width, src_height, dst_width, dst_height,
   2876                      src_stride, dst_stride, src, dst, filtering);
   2877     } else if (4 * dst_width == 3 * src_width &&
   2878                4 * dst_height == 3 * src_height) {
   2879       // optimized, 3/4
   2880       ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
   2881                        src_stride, dst_stride, src, dst, filtering);
   2882     } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
   2883       // optimized, 1/2
   2884       ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
   2885                       src_stride, dst_stride, src, dst, filtering);
   2886     // 3/8 rounded up for odd-sized chroma height.
   2887     } else if (8 * dst_width == 3 * src_width &&
   2888                dst_height == ((src_height * 3 + 7) / 8)) {
   2889       // optimized, 3/8
   2890       ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
   2891                        src_stride, dst_stride, src, dst, filtering);
   2892     } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
   2893                filtering != kFilterBilinear) {
   2894       // optimized, 1/4
   2895       ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
   2896                       src_stride, dst_stride, src, dst, filtering);
   2897     } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
   2898                filtering != kFilterBilinear) {
   2899       // optimized, 1/8
   2900       ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
   2901                       src_stride, dst_stride, src, dst, filtering);
   2902     } else {
   2903       // Arbitrary downsample
   2904       ScalePlaneDown(src_width, src_height, dst_width, dst_height,
   2905                      src_stride, dst_stride, src, dst, filtering);
   2906     }
   2907   } else {
   2908     // Arbitrary scale up and/or down.
   2909     ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
   2910                       src_stride, dst_stride, src, dst, filtering);
   2911   }
   2912 }
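
        // Illustrative usage sketch (not part of the library; the function
        // name and buffer sizes below are hypothetical): downscale a single
        // 640x480 plane to 320x240 with bilinear filtering.
        #if 0
        static void ExampleScalePlaneUsage() {
          static uint8 src_plane[640 * 480];  // hypothetical source buffer
          static uint8 dst_plane[320 * 240];  // hypothetical destination buffer
          ScalePlane(src_plane, 640, 640, 480,
                     dst_plane, 320, 320, 240,
                     kFilterBilinear);
        }
        #endif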
   2913 
   2914 // Scale an I420 image.
   2915 // This function in turn calls a scaling function for each plane.
   2916 
   2917 #define UNDER_ALLOCATED_HACK 1
   2918 
   2919 LIBYUV_API
   2920 int I420Scale(const uint8* src_y, int src_stride_y,
   2921               const uint8* src_u, int src_stride_u,
   2922               const uint8* src_v, int src_stride_v,
   2923               int src_width, int src_height,
   2924               uint8* dst_y, int dst_stride_y,
   2925               uint8* dst_u, int dst_stride_u,
   2926               uint8* dst_v, int dst_stride_v,
   2927               int dst_width, int dst_height,
   2928               FilterMode filtering) {
   2929   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
   2930       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
   2931     return -1;
   2932   }
   2933   // Negative height means invert the image.
   2934   if (src_height < 0) {
   2935     src_height = -src_height;
   2936     int halfheight = (src_height + 1) >> 1;
   2937     src_y = src_y + (src_height - 1) * src_stride_y;
   2938     src_u = src_u + (halfheight - 1) * src_stride_u;
   2939     src_v = src_v + (halfheight - 1) * src_stride_v;
   2940     src_stride_y = -src_stride_y;
   2941     src_stride_u = -src_stride_u;
   2942     src_stride_v = -src_stride_v;
   2943   }
   2944   int src_halfwidth = (src_width + 1) >> 1;
   2945   int src_halfheight = (src_height + 1) >> 1;
   2946   int dst_halfwidth = (dst_width + 1) >> 1;
   2947   int dst_halfheight = (dst_height + 1) >> 1;
   2948 
   2949 #ifdef UNDER_ALLOCATED_HACK
   2950   // If caller passed width / 2 for stride, adjust halfwidth to match.
   2951   if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
   2952     src_halfwidth = src_width >> 1;
   2953   }
   2954   if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
   2955     dst_halfwidth = dst_width >> 1;
   2956   }
   2957   // If caller used height / 2 when computing src_v, it will point into what
   2958   // should be the src_u plane. Detect this and reduce halfheight to match.
   2959   int uv_src_plane_size = src_halfwidth * src_halfheight;
   2960   if ((src_height & 1) &&
   2961       (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
   2962     src_halfheight = src_height >> 1;
   2963   }
   2964   int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
   2965   if ((dst_height & 1) &&
   2966       (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
   2967     dst_halfheight = dst_height >> 1;
   2968   }
   2969 #endif
   2970 
   2971   ScalePlane(src_y, src_stride_y, src_width, src_height,
   2972              dst_y, dst_stride_y, dst_width, dst_height,
   2973              filtering);
   2974   ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
   2975              dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
   2976              filtering);
   2977   ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
   2978              dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
   2979              filtering);
   2980   return 0;
   2981 }
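
        // Illustrative usage sketch (not part of the library; the function
        // name and dimensions are hypothetical): scale a contiguous 640x480
        // I420 buffer (Y plane, then U, then V) down to 320x240.
        #if 0
        static int ExampleI420ScaleUsage(const uint8* src_i420, uint8* dst_i420) {
          const uint8* src_y = src_i420;
          const uint8* src_u = src_y + 640 * 480;
          const uint8* src_v = src_u + 320 * 240;
          uint8* dst_y = dst_i420;
          uint8* dst_u = dst_y + 320 * 240;
          uint8* dst_v = dst_u + 160 * 120;
          return I420Scale(src_y, 640, src_u, 320, src_v, 320,
                           640, 480,
                           dst_y, 320, dst_u, 160, dst_v, 160,
                           320, 240,
                           kFilterBox);
        }
        #endif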
   2982 
   2983 // Deprecated API.
   2984 LIBYUV_API
   2985 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
   2986           int src_stride_y, int src_stride_u, int src_stride_v,
   2987           int src_width, int src_height,
   2988           uint8* dst_y, uint8* dst_u, uint8* dst_v,
   2989           int dst_stride_y, int dst_stride_u, int dst_stride_v,
   2990           int dst_width, int dst_height,
   2991           bool interpolate) {
   2992   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
   2993       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
   2994     return -1;
   2995   }
   2996   // Negative height means invert the image.
   2997   if (src_height < 0) {
   2998     src_height = -src_height;
   2999     int halfheight = (src_height + 1) >> 1;
   3000     src_y = src_y + (src_height - 1) * src_stride_y;
   3001     src_u = src_u + (halfheight - 1) * src_stride_u;
   3002     src_v = src_v + (halfheight - 1) * src_stride_v;
   3003     src_stride_y = -src_stride_y;
   3004     src_stride_u = -src_stride_u;
   3005     src_stride_v = -src_stride_v;
   3006   }
   3007   int src_halfwidth = (src_width + 1) >> 1;
   3008   int src_halfheight = (src_height + 1) >> 1;
   3009   int dst_halfwidth = (dst_width + 1) >> 1;
   3010   int dst_halfheight = (dst_height + 1) >> 1;
   3011   FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
   3012 
   3013 #ifdef UNDER_ALLOCATED_HACK
   3014   // If caller passed width / 2 for stride, adjust halfwidth to match.
   3015   if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
   3016     src_halfwidth = src_width >> 1;
   3017   }
   3018   if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
   3019     dst_halfwidth = dst_width >> 1;
   3020   }
   3021   // If caller used height / 2 when computing src_v, it will point into what
   3022   // should be the src_u plane. Detect this and reduce halfheight to match.
   3023   int uv_src_plane_size = src_halfwidth * src_halfheight;
   3024   if ((src_height & 1) &&
   3025       (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
   3026     src_halfheight = src_height >> 1;
   3027   }
   3028   int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
   3029   if ((dst_height & 1) &&
   3030       (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
   3031     dst_halfheight = dst_height >> 1;
   3032   }
   3033 #endif
   3034 
   3035   ScalePlane(src_y, src_stride_y, src_width, src_height,
   3036              dst_y, dst_stride_y, dst_width, dst_height,
   3037              filtering);
   3038   ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
   3039              dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
   3040              filtering);
   3041   ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
   3042              dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
   3043              filtering);
   3044   return 0;
   3045 }
   3046 
   3047 // Deprecated API.
   3048 LIBYUV_API
   3049 int ScaleOffset(const uint8* src, int src_width, int src_height,
   3050                 uint8* dst, int dst_width, int dst_height, int dst_yoffset,
   3051                 bool interpolate) {
   3052   if (!src || src_width <= 0 || src_height <= 0 ||
   3053       !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
   3054       dst_yoffset >= dst_height) {
   3055     return -1;
   3056   }
   3057   dst_yoffset = dst_yoffset & ~1;  // Chroma requires the offset to be a multiple of 2.
   3058   int src_halfwidth = (src_width + 1) >> 1;
   3059   int src_halfheight = (src_height + 1) >> 1;
   3060   int dst_halfwidth = (dst_width + 1) >> 1;
   3061   int dst_halfheight = (dst_height + 1) >> 1;
   3062   int aheight = dst_height - dst_yoffset * 2;  // actual output height
   3063   const uint8* src_y = src;
   3064   const uint8* src_u = src + src_width * src_height;
   3065   const uint8* src_v = src + src_width * src_height +
   3066                              src_halfwidth * src_halfheight;
   3067   uint8* dst_y = dst + dst_yoffset * dst_width;
   3068   uint8* dst_u = dst + dst_width * dst_height +
   3069                  (dst_yoffset >> 1) * dst_halfwidth;
   3070   uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
   3071                  (dst_yoffset >> 1) * dst_halfwidth;
   3072   return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
   3073                src_width, src_height, dst_y, dst_u, dst_v, dst_width,
   3074                dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
   3075 }
   3076 
   3077 #ifdef __cplusplus
   3078 }  // extern "C"
   3079 }  // namespace libyuv
   3080 #endif
   3081