Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/scale.h"
     12 
     13 #include <assert.h>
     14 #include <string.h>
     15 
     16 #include "libyuv/cpu_id.h"
     17 #include "libyuv/planar_functions.h"  // For CopyPlane
     18 #include "libyuv/row.h"
     19 #include "libyuv/scale_row.h"
     20 
     21 #ifdef __cplusplus
     22 namespace libyuv {
     23 extern "C" {
     24 #endif
     25 
     26 static __inline int Abs(int v) {
     27   return v >= 0 ? v : -v;
     28 }
     29 
     30 #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
     31 
     32 // Scale plane, 1/2
     33 // This is an optimized version for scaling down a plane to 1/2 of
     34 // its original size.
     35 
     36 static void ScalePlaneDown2(int src_width,
     37                             int src_height,
     38                             int dst_width,
     39                             int dst_height,
     40                             int src_stride,
     41                             int dst_stride,
     42                             const uint8* src_ptr,
     43                             uint8* dst_ptr,
     44                             enum FilterMode filtering) {
     45   int y;
     46   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
     47                         uint8* dst_ptr, int dst_width) =
     48       filtering == kFilterNone
     49           ? ScaleRowDown2_C
     50           : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
     51                                         : ScaleRowDown2Box_C);
     52   int row_stride = src_stride << 1;
     53   (void)src_width;
     54   (void)src_height;
     55   if (!filtering) {
     56     src_ptr += src_stride;  // Point to odd rows.
     57     src_stride = 0;
     58   }
     59 
     60 #if defined(HAS_SCALEROWDOWN2_NEON)
     61   if (TestCpuFlag(kCpuHasNEON)) {
     62     ScaleRowDown2 =
     63         filtering == kFilterNone
     64             ? ScaleRowDown2_Any_NEON
     65             : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
     66                                           : ScaleRowDown2Box_Any_NEON);
     67     if (IS_ALIGNED(dst_width, 16)) {
     68       ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
     69                                                : (filtering == kFilterLinear
     70                                                       ? ScaleRowDown2Linear_NEON
     71                                                       : ScaleRowDown2Box_NEON);
     72     }
     73   }
     74 #endif
     75 #if defined(HAS_SCALEROWDOWN2_SSSE3)
     76   if (TestCpuFlag(kCpuHasSSSE3)) {
     77     ScaleRowDown2 =
     78         filtering == kFilterNone
     79             ? ScaleRowDown2_Any_SSSE3
     80             : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
     81                                           : ScaleRowDown2Box_Any_SSSE3);
     82     if (IS_ALIGNED(dst_width, 16)) {
     83       ScaleRowDown2 =
     84           filtering == kFilterNone
     85               ? ScaleRowDown2_SSSE3
     86               : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
     87                                             : ScaleRowDown2Box_SSSE3);
     88     }
     89   }
     90 #endif
     91 #if defined(HAS_SCALEROWDOWN2_AVX2)
     92   if (TestCpuFlag(kCpuHasAVX2)) {
     93     ScaleRowDown2 =
     94         filtering == kFilterNone
     95             ? ScaleRowDown2_Any_AVX2
     96             : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
     97                                           : ScaleRowDown2Box_Any_AVX2);
     98     if (IS_ALIGNED(dst_width, 32)) {
     99       ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
    100                                                : (filtering == kFilterLinear
    101                                                       ? ScaleRowDown2Linear_AVX2
    102                                                       : ScaleRowDown2Box_AVX2);
    103     }
    104   }
    105 #endif
    106 #if defined(HAS_SCALEROWDOWN2_DSPR2)
    107   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
    108       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
    109       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    110     ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
    111   }
    112 #endif
    113 #if defined(HAS_SCALEROWDOWN2_MSA)
    114   if (TestCpuFlag(kCpuHasMSA)) {
    115     ScaleRowDown2 =
    116         filtering == kFilterNone
    117             ? ScaleRowDown2_Any_MSA
    118             : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
    119                                           : ScaleRowDown2Box_Any_MSA);
    120     if (IS_ALIGNED(dst_width, 32)) {
    121       ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
    122                                                : (filtering == kFilterLinear
    123                                                       ? ScaleRowDown2Linear_MSA
    124                                                       : ScaleRowDown2Box_MSA);
    125     }
    126   }
    127 #endif
    128 
    129   if (filtering == kFilterLinear) {
    130     src_stride = 0;
    131   }
    132   // TODO(fbarchard): Loop through source height to allow odd height.
    133   for (y = 0; y < dst_height; ++y) {
    134     ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
    135     src_ptr += row_stride;
    136     dst_ptr += dst_stride;
    137   }
    138 }
    139 
    140 static void ScalePlaneDown2_16(int src_width,
    141                                int src_height,
    142                                int dst_width,
    143                                int dst_height,
    144                                int src_stride,
    145                                int dst_stride,
    146                                const uint16* src_ptr,
    147                                uint16* dst_ptr,
    148                                enum FilterMode filtering) {
    149   int y;
    150   void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
    151                         uint16* dst_ptr, int dst_width) =
    152       filtering == kFilterNone
    153           ? ScaleRowDown2_16_C
    154           : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
    155                                         : ScaleRowDown2Box_16_C);
    156   int row_stride = src_stride << 1;
    157   (void)src_width;
    158   (void)src_height;
    159   if (!filtering) {
    160     src_ptr += src_stride;  // Point to odd rows.
    161     src_stride = 0;
    162   }
    163 
    164 #if defined(HAS_SCALEROWDOWN2_16_NEON)
    165   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
    166     ScaleRowDown2 =
    167         filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
    168   }
    169 #endif
    170 #if defined(HAS_SCALEROWDOWN2_16_SSE2)
    171   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
    172     ScaleRowDown2 =
    173         filtering == kFilterNone
    174             ? ScaleRowDown2_16_SSE2
    175             : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
    176                                           : ScaleRowDown2Box_16_SSE2);
    177   }
    178 #endif
    179 #if defined(HAS_SCALEROWDOWN2_16_DSPR2)
    180   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
    181       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
    182       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    183     ScaleRowDown2 =
    184         filtering ? ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
    185   }
    186 #endif
    187 
    188   if (filtering == kFilterLinear) {
    189     src_stride = 0;
    190   }
    191   // TODO(fbarchard): Loop through source height to allow odd height.
    192   for (y = 0; y < dst_height; ++y) {
    193     ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
    194     src_ptr += row_stride;
    195     dst_ptr += dst_stride;
    196   }
    197 }
    198 
    199 // Scale plane, 1/4
    200 // This is an optimized version for scaling down a plane to 1/4 of
    201 // its original size.
    202 
    203 static void ScalePlaneDown4(int src_width,
    204                             int src_height,
    205                             int dst_width,
    206                             int dst_height,
    207                             int src_stride,
    208                             int dst_stride,
    209                             const uint8* src_ptr,
    210                             uint8* dst_ptr,
    211                             enum FilterMode filtering) {
    212   int y;
    213   void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
    214                         uint8* dst_ptr, int dst_width) =
    215       filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
    216   int row_stride = src_stride << 2;
    217   (void)src_width;
    218   (void)src_height;
    219   if (!filtering) {
    220     src_ptr += src_stride * 2;  // Point to row 2.
    221     src_stride = 0;
    222   }
    223 #if defined(HAS_SCALEROWDOWN4_NEON)
    224   if (TestCpuFlag(kCpuHasNEON)) {
    225     ScaleRowDown4 =
    226         filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
    227     if (IS_ALIGNED(dst_width, 8)) {
    228       ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
    229     }
    230   }
    231 #endif
    232 #if defined(HAS_SCALEROWDOWN4_SSSE3)
    233   if (TestCpuFlag(kCpuHasSSSE3)) {
    234     ScaleRowDown4 =
    235         filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
    236     if (IS_ALIGNED(dst_width, 8)) {
    237       ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
    238     }
    239   }
    240 #endif
    241 #if defined(HAS_SCALEROWDOWN4_AVX2)
    242   if (TestCpuFlag(kCpuHasAVX2)) {
    243     ScaleRowDown4 =
    244         filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
    245     if (IS_ALIGNED(dst_width, 16)) {
    246       ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
    247     }
    248   }
    249 #endif
    250 #if defined(HAS_SCALEROWDOWN4_DSPR2)
    251   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
    252       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
    253       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    254     ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
    255   }
    256 #endif
    257 #if defined(HAS_SCALEROWDOWN4_MSA)
    258   if (TestCpuFlag(kCpuHasMSA)) {
    259     ScaleRowDown4 =
    260         filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
    261     if (IS_ALIGNED(dst_width, 16)) {
    262       ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
    263     }
    264   }
    265 #endif
    266 
    267   if (filtering == kFilterLinear) {
    268     src_stride = 0;
    269   }
    270   for (y = 0; y < dst_height; ++y) {
    271     ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
    272     src_ptr += row_stride;
    273     dst_ptr += dst_stride;
    274   }
    275 }
    276 
    277 static void ScalePlaneDown4_16(int src_width,
    278                                int src_height,
    279                                int dst_width,
    280                                int dst_height,
    281                                int src_stride,
    282                                int dst_stride,
    283                                const uint16* src_ptr,
    284                                uint16* dst_ptr,
    285                                enum FilterMode filtering) {
    286   int y;
    287   void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
    288                         uint16* dst_ptr, int dst_width) =
    289       filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
    290   int row_stride = src_stride << 2;
    291   (void)src_width;
    292   (void)src_height;
    293   if (!filtering) {
    294     src_ptr += src_stride * 2;  // Point to row 2.
    295     src_stride = 0;
    296   }
    297 #if defined(HAS_SCALEROWDOWN4_16_NEON)
    298   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
    299     ScaleRowDown4 =
    300         filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
    301   }
    302 #endif
    303 #if defined(HAS_SCALEROWDOWN4_16_SSE2)
    304   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
    305     ScaleRowDown4 =
    306         filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
    307   }
    308 #endif
    309 #if defined(HAS_SCALEROWDOWN4_16_DSPR2)
    310   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
    311       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
    312       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    313     ScaleRowDown4 =
    314         filtering ? ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
    315   }
    316 #endif
    317 
    318   if (filtering == kFilterLinear) {
    319     src_stride = 0;
    320   }
    321   for (y = 0; y < dst_height; ++y) {
    322     ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
    323     src_ptr += row_stride;
    324     dst_ptr += dst_stride;
    325   }
    326 }
    327 
    328 // Scale plane down, 3/4
    329 static void ScalePlaneDown34(int src_width,
    330                              int src_height,
    331                              int dst_width,
    332                              int dst_height,
    333                              int src_stride,
    334                              int dst_stride,
    335                              const uint8* src_ptr,
    336                              uint8* dst_ptr,
    337                              enum FilterMode filtering) {
    338   int y;
    339   void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
    340                            uint8* dst_ptr, int dst_width);
    341   void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
    342                            uint8* dst_ptr, int dst_width);
    343   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
    344   (void)src_width;
    345   (void)src_height;
    346   assert(dst_width % 3 == 0);
    347   if (!filtering) {
    348     ScaleRowDown34_0 = ScaleRowDown34_C;
    349     ScaleRowDown34_1 = ScaleRowDown34_C;
    350   } else {
    351     ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
    352     ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
    353   }
    354 #if defined(HAS_SCALEROWDOWN34_NEON)
    355   if (TestCpuFlag(kCpuHasNEON)) {
    356     if (!filtering) {
    357       ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
    358       ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
    359     } else {
    360       ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
    361       ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
    362     }
    363     if (dst_width % 24 == 0) {
    364       if (!filtering) {
    365         ScaleRowDown34_0 = ScaleRowDown34_NEON;
    366         ScaleRowDown34_1 = ScaleRowDown34_NEON;
    367       } else {
    368         ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
    369         ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
    370       }
    371     }
    372   }
    373 #endif
    374 #if defined(HAS_SCALEROWDOWN34_SSSE3)
    375   if (TestCpuFlag(kCpuHasSSSE3)) {
    376     if (!filtering) {
    377       ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
    378       ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
    379     } else {
    380       ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
    381       ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
    382     }
    383     if (dst_width % 24 == 0) {
    384       if (!filtering) {
    385         ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
    386         ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
    387       } else {
    388         ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
    389         ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
    390       }
    391     }
    392   }
    393 #endif
    394 #if defined(HAS_SCALEROWDOWN34_DSPR2)
    395   if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
    396       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
    397       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    398     if (!filtering) {
    399       ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
    400       ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
    401     } else {
    402       ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
    403       ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
    404     }
    405   }
    406 #endif
    407 
    408   for (y = 0; y < dst_height - 2; y += 3) {
    409     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
    410     src_ptr += src_stride;
    411     dst_ptr += dst_stride;
    412     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
    413     src_ptr += src_stride;
    414     dst_ptr += dst_stride;
    415     ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
    416     src_ptr += src_stride * 2;
    417     dst_ptr += dst_stride;
    418   }
    419 
    420   // Remainder 1 or 2 rows with last row vertically unfiltered
    421   if ((dst_height % 3) == 2) {
    422     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
    423     src_ptr += src_stride;
    424     dst_ptr += dst_stride;
    425     ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
    426   } else if ((dst_height % 3) == 1) {
    427     ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
    428   }
    429 }
    430 
    431 static void ScalePlaneDown34_16(int src_width,
    432                                 int src_height,
    433                                 int dst_width,
    434                                 int dst_height,
    435                                 int src_stride,
    436                                 int dst_stride,
    437                                 const uint16* src_ptr,
    438                                 uint16* dst_ptr,
    439                                 enum FilterMode filtering) {
    440   int y;
    441   void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
    442                            uint16* dst_ptr, int dst_width);
    443   void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
    444                            uint16* dst_ptr, int dst_width);
    445   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
    446   (void)src_width;
    447   (void)src_height;
    448   assert(dst_width % 3 == 0);
    449   if (!filtering) {
    450     ScaleRowDown34_0 = ScaleRowDown34_16_C;
    451     ScaleRowDown34_1 = ScaleRowDown34_16_C;
    452   } else {
    453     ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
    454     ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
    455   }
    456 #if defined(HAS_SCALEROWDOWN34_16_NEON)
    457   if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
    458     if (!filtering) {
    459       ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
    460       ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
    461     } else {
    462       ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
    463       ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
    464     }
    465   }
    466 #endif
    467 #if defined(HAS_SCALEROWDOWN34_16_SSSE3)
    468   if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
    469     if (!filtering) {
    470       ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
    471       ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
    472     } else {
    473       ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
    474       ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
    475     }
    476   }
    477 #endif
    478 #if defined(HAS_SCALEROWDOWN34_16_DSPR2)
    479   if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
    480       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
    481       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    482     if (!filtering) {
    483       ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
    484       ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
    485     } else {
    486       ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
    487       ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
    488     }
    489   }
    490 #endif
    491 
    492   for (y = 0; y < dst_height - 2; y += 3) {
    493     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
    494     src_ptr += src_stride;
    495     dst_ptr += dst_stride;
    496     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
    497     src_ptr += src_stride;
    498     dst_ptr += dst_stride;
    499     ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
    500     src_ptr += src_stride * 2;
    501     dst_ptr += dst_stride;
    502   }
    503 
    504   // Remainder 1 or 2 rows with last row vertically unfiltered
    505   if ((dst_height % 3) == 2) {
    506     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
    507     src_ptr += src_stride;
    508     dst_ptr += dst_stride;
    509     ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
    510   } else if ((dst_height % 3) == 1) {
    511     ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
    512   }
    513 }
    514 
    515 // Scale plane, 3/8
    516 // This is an optimized version for scaling down a plane to 3/8
    517 // of its original size.
    518 //
    519 // Uses box filter arranges like this
    520 // aaabbbcc -> abc
    521 // aaabbbcc    def
    522 // aaabbbcc    ghi
    523 // dddeeeff
    524 // dddeeeff
    525 // dddeeeff
    526 // ggghhhii
    527 // ggghhhii
    528 // Boxes are 3x3, 2x3, 3x2 and 2x2
    529 
    530 static void ScalePlaneDown38(int src_width,
    531                              int src_height,
    532                              int dst_width,
    533                              int dst_height,
    534                              int src_stride,
    535                              int dst_stride,
    536                              const uint8* src_ptr,
    537                              uint8* dst_ptr,
    538                              enum FilterMode filtering) {
    539   int y;
    540   void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
    541                            uint8* dst_ptr, int dst_width);
    542   void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
    543                            uint8* dst_ptr, int dst_width);
    544   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
    545   assert(dst_width % 3 == 0);
    546   (void)src_width;
    547   (void)src_height;
    548   if (!filtering) {
    549     ScaleRowDown38_3 = ScaleRowDown38_C;
    550     ScaleRowDown38_2 = ScaleRowDown38_C;
    551   } else {
    552     ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
    553     ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
    554   }
    555 
    556 #if defined(HAS_SCALEROWDOWN38_NEON)
    557   if (TestCpuFlag(kCpuHasNEON)) {
    558     if (!filtering) {
    559       ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
    560       ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
    561     } else {
    562       ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
    563       ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
    564     }
    565     if (dst_width % 12 == 0) {
    566       if (!filtering) {
    567         ScaleRowDown38_3 = ScaleRowDown38_NEON;
    568         ScaleRowDown38_2 = ScaleRowDown38_NEON;
    569       } else {
    570         ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
    571         ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
    572       }
    573     }
    574   }
    575 #endif
    576 #if defined(HAS_SCALEROWDOWN38_SSSE3)
    577   if (TestCpuFlag(kCpuHasSSSE3)) {
    578     if (!filtering) {
    579       ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
    580       ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
    581     } else {
    582       ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
    583       ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
    584     }
    585     if (dst_width % 12 == 0 && !filtering) {
    586       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
    587       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
    588     }
    589     if (dst_width % 6 == 0 && filtering) {
    590       ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
    591       ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
    592     }
    593   }
    594 #endif
    595 #if defined(HAS_SCALEROWDOWN38_DSPR2)
    596   if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
    597       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
    598       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    599     if (!filtering) {
    600       ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
    601       ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
    602     } else {
    603       ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
    604       ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
    605     }
    606   }
    607 #endif
    608 #if defined(HAS_SCALEROWDOWN38_MSA)
    609   if (TestCpuFlag(kCpuHasMSA)) {
    610     if (!filtering) {
    611       ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
    612       ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
    613     } else {
    614       ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
    615       ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
    616     }
    617     if (dst_width % 12 == 0) {
    618       if (!filtering) {
    619         ScaleRowDown38_3 = ScaleRowDown38_MSA;
    620         ScaleRowDown38_2 = ScaleRowDown38_MSA;
    621       } else {
    622         ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
    623         ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
    624       }
    625     }
    626   }
    627 #endif
    628 
    629   for (y = 0; y < dst_height - 2; y += 3) {
    630     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
    631     src_ptr += src_stride * 3;
    632     dst_ptr += dst_stride;
    633     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
    634     src_ptr += src_stride * 3;
    635     dst_ptr += dst_stride;
    636     ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
    637     src_ptr += src_stride * 2;
    638     dst_ptr += dst_stride;
    639   }
    640 
    641   // Remainder 1 or 2 rows with last row vertically unfiltered
    642   if ((dst_height % 3) == 2) {
    643     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
    644     src_ptr += src_stride * 3;
    645     dst_ptr += dst_stride;
    646     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
    647   } else if ((dst_height % 3) == 1) {
    648     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
    649   }
    650 }
    651 
    652 static void ScalePlaneDown38_16(int src_width,
    653                                 int src_height,
    654                                 int dst_width,
    655                                 int dst_height,
    656                                 int src_stride,
    657                                 int dst_stride,
    658                                 const uint16* src_ptr,
    659                                 uint16* dst_ptr,
    660                                 enum FilterMode filtering) {
    661   int y;
    662   void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
    663                            uint16* dst_ptr, int dst_width);
    664   void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
    665                            uint16* dst_ptr, int dst_width);
    666   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
    667   (void)src_width;
    668   (void)src_height;
    669   assert(dst_width % 3 == 0);
    670   if (!filtering) {
    671     ScaleRowDown38_3 = ScaleRowDown38_16_C;
    672     ScaleRowDown38_2 = ScaleRowDown38_16_C;
    673   } else {
    674     ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
    675     ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
    676   }
    677 #if defined(HAS_SCALEROWDOWN38_16_NEON)
    678   if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
    679     if (!filtering) {
    680       ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
    681       ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
    682     } else {
    683       ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
    684       ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
    685     }
    686   }
    687 #endif
    688 #if defined(HAS_SCALEROWDOWN38_16_SSSE3)
    689   if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
    690     if (!filtering) {
    691       ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
    692       ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
    693     } else {
    694       ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
    695       ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
    696     }
    697   }
    698 #endif
    699 #if defined(HAS_SCALEROWDOWN38_16_DSPR2)
    700   if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
    701       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
    702       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
    703     if (!filtering) {
    704       ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
    705       ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
    706     } else {
    707       ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
    708       ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
    709     }
    710   }
    711 #endif
    712 
    713   for (y = 0; y < dst_height - 2; y += 3) {
    714     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
    715     src_ptr += src_stride * 3;
    716     dst_ptr += dst_stride;
    717     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
    718     src_ptr += src_stride * 3;
    719     dst_ptr += dst_stride;
    720     ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
    721     src_ptr += src_stride * 2;
    722     dst_ptr += dst_stride;
    723   }
    724 
    725   // Remainder 1 or 2 rows with last row vertically unfiltered
    726   if ((dst_height % 3) == 2) {
    727     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
    728     src_ptr += src_stride * 3;
    729     dst_ptr += dst_stride;
    730     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
    731   } else if ((dst_height % 3) == 1) {
    732     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
    733   }
    734 }
    735 
    736 #define MIN1(x) ((x) < 1 ? 1 : (x))
    737 
    738 static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
    739   uint32 sum = 0u;
    740   int x;
    741   assert(iboxwidth > 0);
    742   for (x = 0; x < iboxwidth; ++x) {
    743     sum += src_ptr[x];
    744   }
    745   return sum;
    746 }
    747 
    748 static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
    749   uint32 sum = 0u;
    750   int x;
    751   assert(iboxwidth > 0);
    752   for (x = 0; x < iboxwidth; ++x) {
    753     sum += src_ptr[x];
    754   }
    755   return sum;
    756 }
    757 
    758 static void ScaleAddCols2_C(int dst_width,
    759                             int boxheight,
    760                             int x,
    761                             int dx,
    762                             const uint16* src_ptr,
    763                             uint8* dst_ptr) {
    764   int i;
    765   int scaletbl[2];
    766   int minboxwidth = dx >> 16;
    767   int boxwidth;
    768   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
    769   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
    770   for (i = 0; i < dst_width; ++i) {
    771     int ix = x >> 16;
    772     x += dx;
    773     boxwidth = MIN1((x >> 16) - ix);
    774     *dst_ptr++ =
    775         SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
    776         16;
    777   }
    778 }
    779 
    780 static void ScaleAddCols2_16_C(int dst_width,
    781                                int boxheight,
    782                                int x,
    783                                int dx,
    784                                const uint32* src_ptr,
    785                                uint16* dst_ptr) {
    786   int i;
    787   int scaletbl[2];
    788   int minboxwidth = dx >> 16;
    789   int boxwidth;
    790   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
    791   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
    792   for (i = 0; i < dst_width; ++i) {
    793     int ix = x >> 16;
    794     x += dx;
    795     boxwidth = MIN1((x >> 16) - ix);
    796     *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
    797                      scaletbl[boxwidth - minboxwidth] >>
    798                  16;
    799   }
    800 }
    801 
    802 static void ScaleAddCols0_C(int dst_width,
    803                             int boxheight,
    804                             int x,
    805                             int,
    806                             const uint16* src_ptr,
    807                             uint8* dst_ptr) {
    808   int scaleval = 65536 / boxheight;
    809   int i;
    810   src_ptr += (x >> 16);
    811   for (i = 0; i < dst_width; ++i) {
    812     *dst_ptr++ = src_ptr[i] * scaleval >> 16;
    813   }
    814 }
    815 
    816 static void ScaleAddCols1_C(int dst_width,
    817                             int boxheight,
    818                             int x,
    819                             int dx,
    820                             const uint16* src_ptr,
    821                             uint8* dst_ptr) {
    822   int boxwidth = MIN1(dx >> 16);
    823   int scaleval = 65536 / (boxwidth * boxheight);
    824   int i;
    825   x >>= 16;
    826   for (i = 0; i < dst_width; ++i) {
    827     *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
    828     x += boxwidth;
    829   }
    830 }
    831 
    832 static void ScaleAddCols1_16_C(int dst_width,
    833                                int boxheight,
    834                                int x,
    835                                int dx,
    836                                const uint32* src_ptr,
    837                                uint16* dst_ptr) {
    838   int boxwidth = MIN1(dx >> 16);
    839   int scaleval = 65536 / (boxwidth * boxheight);
    840   int i;
    841   for (i = 0; i < dst_width; ++i) {
    842     *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
    843     x += boxwidth;
    844   }
    845 }
    846 
    847 // Scale plane down to any dimensions, with interpolation.
    848 // (boxfilter).
    849 //
    850 // Same method as SimpleScale, which is fixed point, outputting
    851 // one pixel of destination using fixed point (16.16) to step
    852 // through source, sampling a box of pixel with simple
    853 // averaging.
    854 static void ScalePlaneBox(int src_width,
    855                           int src_height,
    856                           int dst_width,
    857                           int dst_height,
    858                           int src_stride,
    859                           int dst_stride,
    860                           const uint8* src_ptr,
    861                           uint8* dst_ptr) {
    862   int j, k;
    863   // Initial source x/y coordinate and step values as 16.16 fixed point.
    864   int x = 0;
    865   int y = 0;
    866   int dx = 0;
    867   int dy = 0;
    868   const int max_y = (src_height << 16);
    869   ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
    870              &dx, &dy);
    871   src_width = Abs(src_width);
    872   {
    873     // Allocate a row buffer of uint16.
    874     align_buffer_64(row16, src_width * 2);
    875     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
    876                          const uint16* src_ptr, uint8* dst_ptr) =
    877         (dx & 0xffff) ? ScaleAddCols2_C
    878                       : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
    879     void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
    880         ScaleAddRow_C;
    881 #if defined(HAS_SCALEADDROW_SSE2)
    882     if (TestCpuFlag(kCpuHasSSE2)) {
    883       ScaleAddRow = ScaleAddRow_Any_SSE2;
    884       if (IS_ALIGNED(src_width, 16)) {
    885         ScaleAddRow = ScaleAddRow_SSE2;
    886       }
    887     }
    888 #endif
    889 #if defined(HAS_SCALEADDROW_AVX2)
    890     if (TestCpuFlag(kCpuHasAVX2)) {
    891       ScaleAddRow = ScaleAddRow_Any_AVX2;
    892       if (IS_ALIGNED(src_width, 32)) {
    893         ScaleAddRow = ScaleAddRow_AVX2;
    894       }
    895     }
    896 #endif
    897 #if defined(HAS_SCALEADDROW_NEON)
    898     if (TestCpuFlag(kCpuHasNEON)) {
    899       ScaleAddRow = ScaleAddRow_Any_NEON;
    900       if (IS_ALIGNED(src_width, 16)) {
    901         ScaleAddRow = ScaleAddRow_NEON;
    902       }
    903     }
    904 #endif
    905 #if defined(HAS_SCALEADDROW_MSA)
    906     if (TestCpuFlag(kCpuHasMSA)) {
    907       ScaleAddRow = ScaleAddRow_Any_MSA;
    908       if (IS_ALIGNED(src_width, 16)) {
    909         ScaleAddRow = ScaleAddRow_MSA;
    910       }
    911     }
    912 #endif
    913 #if defined(HAS_SCALEADDROW_DSPR2)
    914     if (TestCpuFlag(kCpuHasDSPR2)) {
    915       ScaleAddRow = ScaleAddRow_Any_DSPR2;
    916       if (IS_ALIGNED(src_width, 16)) {
    917         ScaleAddRow = ScaleAddRow_DSPR2;
    918       }
    919     }
    920 #endif
    921 
    922     for (j = 0; j < dst_height; ++j) {
    923       int boxheight;
    924       int iy = y >> 16;
    925       const uint8* src = src_ptr + iy * src_stride;
    926       y += dy;
    927       if (y > max_y) {
    928         y = max_y;
    929       }
    930       boxheight = MIN1((y >> 16) - iy);
    931       memset(row16, 0, src_width * 2);
    932       for (k = 0; k < boxheight; ++k) {
    933         ScaleAddRow(src, (uint16*)(row16), src_width);
    934         src += src_stride;
    935       }
    936       ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
    937       dst_ptr += dst_stride;
    938     }
    939     free_aligned_buffer_64(row16);
    940   }
    941 }
    942 
    943 static void ScalePlaneBox_16(int src_width,
    944                              int src_height,
    945                              int dst_width,
    946                              int dst_height,
    947                              int src_stride,
    948                              int dst_stride,
    949                              const uint16* src_ptr,
    950                              uint16* dst_ptr) {
    951   int j, k;
    952   // Initial source x/y coordinate and step values as 16.16 fixed point.
    953   int x = 0;
    954   int y = 0;
    955   int dx = 0;
    956   int dy = 0;
    957   const int max_y = (src_height << 16);
    958   ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
    959              &dx, &dy);
    960   src_width = Abs(src_width);
    961   {
    962     // Allocate a row buffer of uint32.
    963     align_buffer_64(row32, src_width * 4);
    964     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
    965                          const uint32* src_ptr, uint16* dst_ptr) =
    966         (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
    967     void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
    968         ScaleAddRow_16_C;
    969 
    970 #if defined(HAS_SCALEADDROW_16_SSE2)
    971     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
    972       ScaleAddRow = ScaleAddRow_16_SSE2;
    973     }
    974 #endif
    975 
    976     for (j = 0; j < dst_height; ++j) {
    977       int boxheight;
    978       int iy = y >> 16;
    979       const uint16* src = src_ptr + iy * src_stride;
    980       y += dy;
    981       if (y > max_y) {
    982         y = max_y;
    983       }
    984       boxheight = MIN1((y >> 16) - iy);
    985       memset(row32, 0, src_width * 4);
    986       for (k = 0; k < boxheight; ++k) {
    987         ScaleAddRow(src, (uint32*)(row32), src_width);
    988         src += src_stride;
    989       }
    990       ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
    991       dst_ptr += dst_stride;
    992     }
    993     free_aligned_buffer_64(row32);
    994   }
    995 }
    996 
    997 // Scale plane down with bilinear interpolation.
    998 void ScalePlaneBilinearDown(int src_width,
    999                             int src_height,
   1000                             int dst_width,
   1001                             int dst_height,
   1002                             int src_stride,
   1003                             int dst_stride,
   1004                             const uint8* src_ptr,
   1005                             uint8* dst_ptr,
   1006                             enum FilterMode filtering) {
   1007   // Initial source x/y coordinate and step values as 16.16 fixed point.
   1008   int x = 0;
   1009   int y = 0;
   1010   int dx = 0;
   1011   int dy = 0;
   1012   // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
   1013   // Allocate a row buffer.
   1014   align_buffer_64(row, src_width);
   1015 
   1016   const int max_y = (src_height - 1) << 16;
   1017   int j;
   1018   void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
   1019                           int x, int dx) =
   1020       (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
   1021   void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
   1022                          ptrdiff_t src_stride, int dst_width,
   1023                          int source_y_fraction) = InterpolateRow_C;
   1024   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
   1025              &dx, &dy);
   1026   src_width = Abs(src_width);
   1027 
   1028 #if defined(HAS_INTERPOLATEROW_SSSE3)
   1029   if (TestCpuFlag(kCpuHasSSSE3)) {
   1030     InterpolateRow = InterpolateRow_Any_SSSE3;
   1031     if (IS_ALIGNED(src_width, 16)) {
   1032       InterpolateRow = InterpolateRow_SSSE3;
   1033     }
   1034   }
   1035 #endif
   1036 #if defined(HAS_INTERPOLATEROW_AVX2)
   1037   if (TestCpuFlag(kCpuHasAVX2)) {
   1038     InterpolateRow = InterpolateRow_Any_AVX2;
   1039     if (IS_ALIGNED(src_width, 32)) {
   1040       InterpolateRow = InterpolateRow_AVX2;
   1041     }
   1042   }
   1043 #endif
   1044 #if defined(HAS_INTERPOLATEROW_NEON)
   1045   if (TestCpuFlag(kCpuHasNEON)) {
   1046     InterpolateRow = InterpolateRow_Any_NEON;
   1047     if (IS_ALIGNED(src_width, 16)) {
   1048       InterpolateRow = InterpolateRow_NEON;
   1049     }
   1050   }
   1051 #endif
   1052 #if defined(HAS_INTERPOLATEROW_DSPR2)
   1053   if (TestCpuFlag(kCpuHasDSPR2)) {
   1054     InterpolateRow = InterpolateRow_Any_DSPR2;
   1055     if (IS_ALIGNED(src_width, 4)) {
   1056       InterpolateRow = InterpolateRow_DSPR2;
   1057     }
   1058   }
   1059 #endif
   1060 #if defined(HAS_INTERPOLATEROW_MSA)
   1061   if (TestCpuFlag(kCpuHasMSA)) {
   1062     InterpolateRow = InterpolateRow_Any_MSA;
   1063     if (IS_ALIGNED(src_width, 32)) {
   1064       InterpolateRow = InterpolateRow_MSA;
   1065     }
   1066   }
   1067 #endif
   1068 
   1069 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   1070   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
   1071     ScaleFilterCols = ScaleFilterCols_SSSE3;
   1072   }
   1073 #endif
   1074 #if defined(HAS_SCALEFILTERCOLS_NEON)
   1075   if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
   1076     ScaleFilterCols = ScaleFilterCols_Any_NEON;
   1077     if (IS_ALIGNED(dst_width, 8)) {
   1078       ScaleFilterCols = ScaleFilterCols_NEON;
   1079     }
   1080   }
   1081 #endif
   1082   if (y > max_y) {
   1083     y = max_y;
   1084   }
   1085 
   1086   for (j = 0; j < dst_height; ++j) {
   1087     int yi = y >> 16;
   1088     const uint8* src = src_ptr + yi * src_stride;
   1089     if (filtering == kFilterLinear) {
   1090       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
   1091     } else {
   1092       int yf = (y >> 8) & 255;
   1093       InterpolateRow(row, src, src_stride, src_width, yf);
   1094       ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
   1095     }
   1096     dst_ptr += dst_stride;
   1097     y += dy;
   1098     if (y > max_y) {
   1099       y = max_y;
   1100     }
   1101   }
   1102   free_aligned_buffer_64(row);
   1103 }
   1104 
   1105 void ScalePlaneBilinearDown_16(int src_width,
   1106                                int src_height,
   1107                                int dst_width,
   1108                                int dst_height,
   1109                                int src_stride,
   1110                                int dst_stride,
   1111                                const uint16* src_ptr,
   1112                                uint16* dst_ptr,
   1113                                enum FilterMode filtering) {
   1114   // Initial source x/y coordinate and step values as 16.16 fixed point.
   1115   int x = 0;
   1116   int y = 0;
   1117   int dx = 0;
   1118   int dy = 0;
   1119   // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
   1120   // Allocate a row buffer.
   1121   align_buffer_64(row, src_width * 2);
   1122 
   1123   const int max_y = (src_height - 1) << 16;
   1124   int j;
   1125   void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
   1126                           int dst_width, int x, int dx) =
   1127       (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
   1128   void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
   1129                          ptrdiff_t src_stride, int dst_width,
   1130                          int source_y_fraction) = InterpolateRow_16_C;
   1131   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
   1132              &dx, &dy);
   1133   src_width = Abs(src_width);
   1134 
   1135 #if defined(HAS_INTERPOLATEROW_16_SSE2)
   1136   if (TestCpuFlag(kCpuHasSSE2)) {
   1137     InterpolateRow = InterpolateRow_Any_16_SSE2;
   1138     if (IS_ALIGNED(src_width, 16)) {
   1139       InterpolateRow = InterpolateRow_16_SSE2;
   1140     }
   1141   }
   1142 #endif
   1143 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   1144   if (TestCpuFlag(kCpuHasSSSE3)) {
   1145     InterpolateRow = InterpolateRow_Any_16_SSSE3;
   1146     if (IS_ALIGNED(src_width, 16)) {
   1147       InterpolateRow = InterpolateRow_16_SSSE3;
   1148     }
   1149   }
   1150 #endif
   1151 #if defined(HAS_INTERPOLATEROW_16_AVX2)
   1152   if (TestCpuFlag(kCpuHasAVX2)) {
   1153     InterpolateRow = InterpolateRow_Any_16_AVX2;
   1154     if (IS_ALIGNED(src_width, 32)) {
   1155       InterpolateRow = InterpolateRow_16_AVX2;
   1156     }
   1157   }
   1158 #endif
   1159 #if defined(HAS_INTERPOLATEROW_16_NEON)
   1160   if (TestCpuFlag(kCpuHasNEON)) {
   1161     InterpolateRow = InterpolateRow_Any_16_NEON;
   1162     if (IS_ALIGNED(src_width, 16)) {
   1163       InterpolateRow = InterpolateRow_16_NEON;
   1164     }
   1165   }
   1166 #endif
   1167 #if defined(HAS_INTERPOLATEROW_16_DSPR2)
   1168   if (TestCpuFlag(kCpuHasDSPR2)) {
   1169     InterpolateRow = InterpolateRow_Any_16_DSPR2;
   1170     if (IS_ALIGNED(src_width, 4)) {
   1171       InterpolateRow = InterpolateRow_16_DSPR2;
   1172     }
   1173   }
   1174 #endif
   1175 
   1176 #if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
   1177   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
   1178     ScaleFilterCols = ScaleFilterCols_16_SSSE3;
   1179   }
   1180 #endif
   1181   if (y > max_y) {
   1182     y = max_y;
   1183   }
   1184 
   1185   for (j = 0; j < dst_height; ++j) {
   1186     int yi = y >> 16;
   1187     const uint16* src = src_ptr + yi * src_stride;
   1188     if (filtering == kFilterLinear) {
   1189       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
   1190     } else {
   1191       int yf = (y >> 8) & 255;
   1192       InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
   1193       ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
   1194     }
   1195     dst_ptr += dst_stride;
   1196     y += dy;
   1197     if (y > max_y) {
   1198       y = max_y;
   1199     }
   1200   }
   1201   free_aligned_buffer_64(row);
   1202 }
   1203 
   1204 // Scale up down with bilinear interpolation.
   1205 void ScalePlaneBilinearUp(int src_width,
   1206                           int src_height,
   1207                           int dst_width,
   1208                           int dst_height,
   1209                           int src_stride,
   1210                           int dst_stride,
   1211                           const uint8* src_ptr,
   1212                           uint8* dst_ptr,
   1213                           enum FilterMode filtering) {
   1214   int j;
   1215   // Initial source x/y coordinate and step values as 16.16 fixed point.
   1216   int x = 0;
   1217   int y = 0;
   1218   int dx = 0;
   1219   int dy = 0;
   1220   const int max_y = (src_height - 1) << 16;
   1221   void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
   1222                          ptrdiff_t src_stride, int dst_width,
   1223                          int source_y_fraction) = InterpolateRow_C;
   1224   void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
   1225                           int x, int dx) =
   1226       filtering ? ScaleFilterCols_C : ScaleCols_C;
   1227   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
   1228              &dx, &dy);
   1229   src_width = Abs(src_width);
   1230 
   1231 #if defined(HAS_INTERPOLATEROW_SSSE3)
   1232   if (TestCpuFlag(kCpuHasSSSE3)) {
   1233     InterpolateRow = InterpolateRow_Any_SSSE3;
   1234     if (IS_ALIGNED(dst_width, 16)) {
   1235       InterpolateRow = InterpolateRow_SSSE3;
   1236     }
   1237   }
   1238 #endif
   1239 #if defined(HAS_INTERPOLATEROW_AVX2)
   1240   if (TestCpuFlag(kCpuHasAVX2)) {
   1241     InterpolateRow = InterpolateRow_Any_AVX2;
   1242     if (IS_ALIGNED(dst_width, 32)) {
   1243       InterpolateRow = InterpolateRow_AVX2;
   1244     }
   1245   }
   1246 #endif
   1247 #if defined(HAS_INTERPOLATEROW_NEON)
   1248   if (TestCpuFlag(kCpuHasNEON)) {
   1249     InterpolateRow = InterpolateRow_Any_NEON;
   1250     if (IS_ALIGNED(dst_width, 16)) {
   1251       InterpolateRow = InterpolateRow_NEON;
   1252     }
   1253   }
   1254 #endif
   1255 #if defined(HAS_INTERPOLATEROW_DSPR2)
   1256   if (TestCpuFlag(kCpuHasDSPR2)) {
   1257     InterpolateRow = InterpolateRow_Any_DSPR2;
   1258     if (IS_ALIGNED(dst_width, 4)) {
   1259       InterpolateRow = InterpolateRow_DSPR2;
   1260     }
   1261   }
   1262 #endif
   1263 
   1264   if (filtering && src_width >= 32768) {
   1265     ScaleFilterCols = ScaleFilterCols64_C;
   1266   }
   1267 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   1268   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
   1269     ScaleFilterCols = ScaleFilterCols_SSSE3;
   1270   }
   1271 #endif
   1272 #if defined(HAS_SCALEFILTERCOLS_NEON)
   1273   if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
   1274     ScaleFilterCols = ScaleFilterCols_Any_NEON;
   1275     if (IS_ALIGNED(dst_width, 8)) {
   1276       ScaleFilterCols = ScaleFilterCols_NEON;
   1277     }
   1278   }
   1279 #endif
   1280   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
   1281     ScaleFilterCols = ScaleColsUp2_C;
   1282 #if defined(HAS_SCALECOLS_SSE2)
   1283     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
   1284       ScaleFilterCols = ScaleColsUp2_SSE2;
   1285     }
   1286 #endif
   1287   }
   1288 
   1289   if (y > max_y) {
   1290     y = max_y;
   1291   }
   1292   {
   1293     int yi = y >> 16;
   1294     const uint8* src = src_ptr + yi * src_stride;
   1295 
   1296     // Allocate 2 row buffers.
   1297     const int kRowSize = (dst_width + 31) & ~31;
   1298     align_buffer_64(row, kRowSize * 2);
   1299 
   1300     uint8* rowptr = row;
   1301     int rowstride = kRowSize;
   1302     int lasty = yi;
   1303 
   1304     ScaleFilterCols(rowptr, src, dst_width, x, dx);
   1305     if (src_height > 1) {
   1306       src += src_stride;
   1307     }
   1308     ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
   1309     src += src_stride;
   1310 
   1311     for (j = 0; j < dst_height; ++j) {
   1312       yi = y >> 16;
   1313       if (yi != lasty) {
   1314         if (y > max_y) {
   1315           y = max_y;
   1316           yi = y >> 16;
   1317           src = src_ptr + yi * src_stride;
   1318         }
   1319         if (yi != lasty) {
   1320           ScaleFilterCols(rowptr, src, dst_width, x, dx);
   1321           rowptr += rowstride;
   1322           rowstride = -rowstride;
   1323           lasty = yi;
   1324           src += src_stride;
   1325         }
   1326       }
   1327       if (filtering == kFilterLinear) {
   1328         InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
   1329       } else {
   1330         int yf = (y >> 8) & 255;
   1331         InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
   1332       }
   1333       dst_ptr += dst_stride;
   1334       y += dy;
   1335     }
   1336     free_aligned_buffer_64(row);
   1337   }
   1338 }
   1339 
   1340 void ScalePlaneBilinearUp_16(int src_width,
   1341                              int src_height,
   1342                              int dst_width,
   1343                              int dst_height,
   1344                              int src_stride,
   1345                              int dst_stride,
   1346                              const uint16* src_ptr,
   1347                              uint16* dst_ptr,
   1348                              enum FilterMode filtering) {
   1349   int j;
   1350   // Initial source x/y coordinate and step values as 16.16 fixed point.
   1351   int x = 0;
   1352   int y = 0;
   1353   int dx = 0;
   1354   int dy = 0;
   1355   const int max_y = (src_height - 1) << 16;
   1356   void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
   1357                          ptrdiff_t src_stride, int dst_width,
   1358                          int source_y_fraction) = InterpolateRow_16_C;
   1359   void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
   1360                           int dst_width, int x, int dx) =
   1361       filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
   1362   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
   1363              &dx, &dy);
   1364   src_width = Abs(src_width);
   1365 
   1366 #if defined(HAS_INTERPOLATEROW_16_SSE2)
   1367   if (TestCpuFlag(kCpuHasSSE2)) {
   1368     InterpolateRow = InterpolateRow_Any_16_SSE2;
   1369     if (IS_ALIGNED(dst_width, 16)) {
   1370       InterpolateRow = InterpolateRow_16_SSE2;
   1371     }
   1372   }
   1373 #endif
   1374 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   1375   if (TestCpuFlag(kCpuHasSSSE3)) {
   1376     InterpolateRow = InterpolateRow_Any_16_SSSE3;
   1377     if (IS_ALIGNED(dst_width, 16)) {
   1378       InterpolateRow = InterpolateRow_16_SSSE3;
   1379     }
   1380   }
   1381 #endif
   1382 #if defined(HAS_INTERPOLATEROW_16_AVX2)
   1383   if (TestCpuFlag(kCpuHasAVX2)) {
   1384     InterpolateRow = InterpolateRow_Any_16_AVX2;
   1385     if (IS_ALIGNED(dst_width, 32)) {
   1386       InterpolateRow = InterpolateRow_16_AVX2;
   1387     }
   1388   }
   1389 #endif
   1390 #if defined(HAS_INTERPOLATEROW_16_NEON)
   1391   if (TestCpuFlag(kCpuHasNEON)) {
   1392     InterpolateRow = InterpolateRow_Any_16_NEON;
   1393     if (IS_ALIGNED(dst_width, 16)) {
   1394       InterpolateRow = InterpolateRow_16_NEON;
   1395     }
   1396   }
   1397 #endif
   1398 #if defined(HAS_INTERPOLATEROW_16_DSPR2)
   1399   if (TestCpuFlag(kCpuHasDSPR2)) {
   1400     InterpolateRow = InterpolateRow_Any_16_DSPR2;
   1401     if (IS_ALIGNED(dst_width, 4)) {
   1402       InterpolateRow = InterpolateRow_16_DSPR2;
   1403     }
   1404   }
   1405 #endif
   1406 
   1407   if (filtering && src_width >= 32768) {
   1408     ScaleFilterCols = ScaleFilterCols64_16_C;
   1409   }
   1410 #if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
   1411   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
   1412     ScaleFilterCols = ScaleFilterCols_16_SSSE3;
   1413   }
   1414 #endif
   1415   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
   1416     ScaleFilterCols = ScaleColsUp2_16_C;
   1417 #if defined(HAS_SCALECOLS_16_SSE2)
   1418     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
   1419       ScaleFilterCols = ScaleColsUp2_16_SSE2;
   1420     }
   1421 #endif
   1422   }
   1423 
   1424   if (y > max_y) {
   1425     y = max_y;
   1426   }
   1427   {
   1428     int yi = y >> 16;
   1429     const uint16* src = src_ptr + yi * src_stride;
   1430 
   1431     // Allocate 2 row buffers.
   1432     const int kRowSize = (dst_width + 31) & ~31;
   1433     align_buffer_64(row, kRowSize * 4);
   1434 
   1435     uint16* rowptr = (uint16*)row;
   1436     int rowstride = kRowSize;
   1437     int lasty = yi;
   1438 
   1439     ScaleFilterCols(rowptr, src, dst_width, x, dx);
   1440     if (src_height > 1) {
   1441       src += src_stride;
   1442     }
   1443     ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
   1444     src += src_stride;
   1445 
   1446     for (j = 0; j < dst_height; ++j) {
   1447       yi = y >> 16;
   1448       if (yi != lasty) {
   1449         if (y > max_y) {
   1450           y = max_y;
   1451           yi = y >> 16;
   1452           src = src_ptr + yi * src_stride;
   1453         }
   1454         if (yi != lasty) {
   1455           ScaleFilterCols(rowptr, src, dst_width, x, dx);
   1456           rowptr += rowstride;
   1457           rowstride = -rowstride;
   1458           lasty = yi;
   1459           src += src_stride;
   1460         }
   1461       }
   1462       if (filtering == kFilterLinear) {
   1463         InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
   1464       } else {
   1465         int yf = (y >> 8) & 255;
   1466         InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
   1467       }
   1468       dst_ptr += dst_stride;
   1469       y += dy;
   1470     }
   1471     free_aligned_buffer_64(row);
   1472   }
   1473 }
   1474 
   1475 // Scale Plane to/from any dimensions, without interpolation.
   1476 // Fixed point math is used for performance: The upper 16 bits
   1477 // of x and dx is the integer part of the source position and
   1478 // the lower 16 bits are the fixed decimal part.
   1479 
   1480 static void ScalePlaneSimple(int src_width,
   1481                              int src_height,
   1482                              int dst_width,
   1483                              int dst_height,
   1484                              int src_stride,
   1485                              int dst_stride,
   1486                              const uint8* src_ptr,
   1487                              uint8* dst_ptr) {
   1488   int i;
   1489   void (*ScaleCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, int x,
   1490                     int dx) = ScaleCols_C;
   1491   // Initial source x/y coordinate and step values as 16.16 fixed point.
   1492   int x = 0;
   1493   int y = 0;
   1494   int dx = 0;
   1495   int dy = 0;
   1496   ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
   1497              &dx, &dy);
   1498   src_width = Abs(src_width);
   1499 
   1500   if (src_width * 2 == dst_width && x < 0x8000) {
   1501     ScaleCols = ScaleColsUp2_C;
   1502 #if defined(HAS_SCALECOLS_SSE2)
   1503     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
   1504       ScaleCols = ScaleColsUp2_SSE2;
   1505     }
   1506 #endif
   1507   }
   1508 
   1509   for (i = 0; i < dst_height; ++i) {
   1510     ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
   1511     dst_ptr += dst_stride;
   1512     y += dy;
   1513   }
   1514 }
   1515 
   1516 static void ScalePlaneSimple_16(int src_width,
   1517                                 int src_height,
   1518                                 int dst_width,
   1519                                 int dst_height,
   1520                                 int src_stride,
   1521                                 int dst_stride,
   1522                                 const uint16* src_ptr,
   1523                                 uint16* dst_ptr) {
   1524   int i;
   1525   void (*ScaleCols)(uint16 * dst_ptr, const uint16* src_ptr, int dst_width,
   1526                     int x, int dx) = ScaleCols_16_C;
   1527   // Initial source x/y coordinate and step values as 16.16 fixed point.
   1528   int x = 0;
   1529   int y = 0;
   1530   int dx = 0;
   1531   int dy = 0;
   1532   ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
   1533              &dx, &dy);
   1534   src_width = Abs(src_width);
   1535 
   1536   if (src_width * 2 == dst_width && x < 0x8000) {
   1537     ScaleCols = ScaleColsUp2_16_C;
   1538 #if defined(HAS_SCALECOLS_16_SSE2)
   1539     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
   1540       ScaleCols = ScaleColsUp2_16_SSE2;
   1541     }
   1542 #endif
   1543   }
   1544 
   1545   for (i = 0; i < dst_height; ++i) {
   1546     ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
   1547     dst_ptr += dst_stride;
   1548     y += dy;
   1549   }
   1550 }
   1551 
   1552 // Scale a plane.
   1553 // This function dispatches to a specialized scaler based on scale factor.
   1554 
   1555 LIBYUV_API
   1556 void ScalePlane(const uint8* src,
   1557                 int src_stride,
   1558                 int src_width,
   1559                 int src_height,
   1560                 uint8* dst,
   1561                 int dst_stride,
   1562                 int dst_width,
   1563                 int dst_height,
   1564                 enum FilterMode filtering) {
   1565   // Simplify filtering when possible.
   1566   filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
   1567                                 filtering);
   1568 
   1569   // Negative height means invert the image.
   1570   if (src_height < 0) {
   1571     src_height = -src_height;
   1572     src = src + (src_height - 1) * src_stride;
   1573     src_stride = -src_stride;
   1574   }
   1575 
   1576   // Use specialized scales to improve performance for common resolutions.
   1577   // For example, all the 1/2 scalings will use ScalePlaneDown2()
   1578   if (dst_width == src_width && dst_height == src_height) {
   1579     // Straight copy.
   1580     CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
   1581     return;
   1582   }
   1583   if (dst_width == src_width && filtering != kFilterBox) {
   1584     int dy = FixedDiv(src_height, dst_height);
   1585     // Arbitrary scale vertically, but unscaled horizontally.
   1586     ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
   1587                        dst_stride, src, dst, 0, 0, dy, 1, filtering);
   1588     return;
   1589   }
   1590   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
   1591     // Scale down.
   1592     if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
   1593       // optimized, 3/4
   1594       ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
   1595                        dst_stride, src, dst, filtering);
   1596       return;
   1597     }
   1598     if (2 * dst_width == src_width && 2 * dst_height == src_height) {
   1599       // optimized, 1/2
   1600       ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
   1601                       dst_stride, src, dst, filtering);
   1602       return;
   1603     }
   1604     // 3/8 rounded up for odd sized chroma height.
   1605     if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
   1606       // optimized, 3/8
   1607       ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
   1608                        dst_stride, src, dst, filtering);
   1609       return;
   1610     }
   1611     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
   1612         (filtering == kFilterBox || filtering == kFilterNone)) {
   1613       // optimized, 1/4
   1614       ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
   1615                       dst_stride, src, dst, filtering);
   1616       return;
   1617     }
   1618   }
   1619   if (filtering == kFilterBox && dst_height * 2 < src_height) {
   1620     ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
   1621                   dst_stride, src, dst);
   1622     return;
   1623   }
   1624   if (filtering && dst_height > src_height) {
   1625     ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
   1626                          src_stride, dst_stride, src, dst, filtering);
   1627     return;
   1628   }
   1629   if (filtering) {
   1630     ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
   1631                            src_stride, dst_stride, src, dst, filtering);
   1632     return;
   1633   }
   1634   ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
   1635                    dst_stride, src, dst);
   1636 }
   1637 
   1638 LIBYUV_API
   1639 void ScalePlane_16(const uint16* src,
   1640                    int src_stride,
   1641                    int src_width,
   1642                    int src_height,
   1643                    uint16* dst,
   1644                    int dst_stride,
   1645                    int dst_width,
   1646                    int dst_height,
   1647                    enum FilterMode filtering) {
   1648   // Simplify filtering when possible.
   1649   filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
   1650                                 filtering);
   1651 
   1652   // Negative height means invert the image.
   1653   if (src_height < 0) {
   1654     src_height = -src_height;
   1655     src = src + (src_height - 1) * src_stride;
   1656     src_stride = -src_stride;
   1657   }
   1658 
   1659   // Use specialized scales to improve performance for common resolutions.
   1660   // For example, all the 1/2 scalings will use ScalePlaneDown2()
   1661   if (dst_width == src_width && dst_height == src_height) {
   1662     // Straight copy.
   1663     CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
   1664     return;
   1665   }
   1666   if (dst_width == src_width) {
   1667     int dy = FixedDiv(src_height, dst_height);
   1668     // Arbitrary scale vertically, but unscaled vertically.
   1669     ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
   1670                           dst_stride, src, dst, 0, 0, dy, 1, filtering);
   1671     return;
   1672   }
   1673   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
   1674     // Scale down.
   1675     if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
   1676       // optimized, 3/4
   1677       ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
   1678                           src_stride, dst_stride, src, dst, filtering);
   1679       return;
   1680     }
   1681     if (2 * dst_width == src_width && 2 * dst_height == src_height) {
   1682       // optimized, 1/2
   1683       ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
   1684                          src_stride, dst_stride, src, dst, filtering);
   1685       return;
   1686     }
   1687     // 3/8 rounded up for odd sized chroma height.
   1688     if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
   1689       // optimized, 3/8
   1690       ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
   1691                           src_stride, dst_stride, src, dst, filtering);
   1692       return;
   1693     }
   1694     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
   1695         filtering != kFilterBilinear) {
   1696       // optimized, 1/4
   1697       ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
   1698                          src_stride, dst_stride, src, dst, filtering);
   1699       return;
   1700     }
   1701   }
   1702   if (filtering == kFilterBox && dst_height * 2 < src_height) {
   1703     ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
   1704                      dst_stride, src, dst);
   1705     return;
   1706   }
   1707   if (filtering && dst_height > src_height) {
   1708     ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
   1709                             src_stride, dst_stride, src, dst, filtering);
   1710     return;
   1711   }
   1712   if (filtering) {
   1713     ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
   1714                               src_stride, dst_stride, src, dst, filtering);
   1715     return;
   1716   }
   1717   ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
   1718                       dst_stride, src, dst);
   1719 }
   1720 
   1721 // Scale an I420 image.
   1722 // This function in turn calls a scaling function for each plane.
   1723 
   1724 LIBYUV_API
   1725 int I420Scale(const uint8* src_y,
   1726               int src_stride_y,
   1727               const uint8* src_u,
   1728               int src_stride_u,
   1729               const uint8* src_v,
   1730               int src_stride_v,
   1731               int src_width,
   1732               int src_height,
   1733               uint8* dst_y,
   1734               int dst_stride_y,
   1735               uint8* dst_u,
   1736               int dst_stride_u,
   1737               uint8* dst_v,
   1738               int dst_stride_v,
   1739               int dst_width,
   1740               int dst_height,
   1741               enum FilterMode filtering) {
   1742   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   1743   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   1744   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   1745   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   1746   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
   1747       src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
   1748       dst_width <= 0 || dst_height <= 0) {
   1749     return -1;
   1750   }
   1751 
   1752   ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
   1753              dst_width, dst_height, filtering);
   1754   ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
   1755              dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
   1756   ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
   1757              dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   1758   return 0;
   1759 }
   1760 
   1761 LIBYUV_API
   1762 int I420Scale_16(const uint16* src_y,
   1763                  int src_stride_y,
   1764                  const uint16* src_u,
   1765                  int src_stride_u,
   1766                  const uint16* src_v,
   1767                  int src_stride_v,
   1768                  int src_width,
   1769                  int src_height,
   1770                  uint16* dst_y,
   1771                  int dst_stride_y,
   1772                  uint16* dst_u,
   1773                  int dst_stride_u,
   1774                  uint16* dst_v,
   1775                  int dst_stride_v,
   1776                  int dst_width,
   1777                  int dst_height,
   1778                  enum FilterMode filtering) {
   1779   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   1780   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   1781   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   1782   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   1783   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
   1784       src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
   1785       dst_width <= 0 || dst_height <= 0) {
   1786     return -1;
   1787   }
   1788 
   1789   ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
   1790                 dst_width, dst_height, filtering);
   1791   ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
   1792                 dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
   1793   ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
   1794                 dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   1795   return 0;
   1796 }
   1797 
   1798 // Deprecated api
   1799 LIBYUV_API
   1800 int Scale(const uint8* src_y,
   1801           const uint8* src_u,
   1802           const uint8* src_v,
   1803           int src_stride_y,
   1804           int src_stride_u,
   1805           int src_stride_v,
   1806           int src_width,
   1807           int src_height,
   1808           uint8* dst_y,
   1809           uint8* dst_u,
   1810           uint8* dst_v,
   1811           int dst_stride_y,
   1812           int dst_stride_u,
   1813           int dst_stride_v,
   1814           int dst_width,
   1815           int dst_height,
   1816           LIBYUV_BOOL interpolate) {
   1817   return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
   1818                    src_stride_v, src_width, src_height, dst_y, dst_stride_y,
   1819                    dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
   1820                    dst_height, interpolate ? kFilterBox : kFilterNone);
   1821 }
   1822 
   1823 // Deprecated api
   1824 LIBYUV_API
   1825 int ScaleOffset(const uint8* src,
   1826                 int src_width,
   1827                 int src_height,
   1828                 uint8* dst,
   1829                 int dst_width,
   1830                 int dst_height,
   1831                 int dst_yoffset,
   1832                 LIBYUV_BOOL interpolate) {
   1833   // Chroma requires offset to multiple of 2.
   1834   int dst_yoffset_even = dst_yoffset & ~1;
   1835   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   1836   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   1837   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   1838   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   1839   int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
   1840   const uint8* src_y = src;
   1841   const uint8* src_u = src + src_width * src_height;
   1842   const uint8* src_v =
   1843       src + src_width * src_height + src_halfwidth * src_halfheight;
   1844   uint8* dst_y = dst + dst_yoffset_even * dst_width;
   1845   uint8* dst_u =
   1846       dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth;
   1847   uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
   1848                  (dst_yoffset_even >> 1) * dst_halfwidth;
   1849   if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 ||
   1850       dst_height <= 0 || dst_yoffset_even < 0 ||
   1851       dst_yoffset_even >= dst_height) {
   1852     return -1;
   1853   }
   1854   return I420Scale(src_y, src_width, src_u, src_halfwidth, src_v, src_halfwidth,
   1855                    src_width, src_height, dst_y, dst_width, dst_u,
   1856                    dst_halfwidth, dst_v, dst_halfwidth, dst_width, aheight,
   1857                    interpolate ? kFilterBox : kFilterNone);
   1858 }
   1859 
   1860 #ifdef __cplusplus
   1861 }  // extern "C"
   1862 }  // namespace libyuv
   1863 #endif
   1864