Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 #include "libyuv/scale.h"
     13 #include "libyuv/scale_row.h"
     14 
     15 #ifdef __cplusplus
     16 namespace libyuv {
     17 extern "C" {
     18 #endif
     19 
     20 // This module is for GCC Neon armv8 64 bit.
     21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     22 
     23 // Read 32x1 throw away even pixels, and write 16x1.
     24 void ScaleRowDown2_NEON(const uint8* src_ptr,
     25                         ptrdiff_t src_stride,
     26                         uint8* dst,
     27                         int dst_width) {
     28   (void)src_stride;
     29   asm volatile (
     30   "1:                                          \n"
     31     // load even pixels into v0, odd into v1
     32     MEMACCESS(0)
     33     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
     34     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
     35     MEMACCESS(1)
     36     "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
     37     "b.gt       1b                             \n"
     38   : "+r"(src_ptr),          // %0
     39     "+r"(dst),              // %1
     40     "+r"(dst_width)         // %2
     41   :
     42   : "v0", "v1"              // Clobber List
     43   );
     44 }
     45 
     46 // Read 32x1 average down and write 16x1.
     47 void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
     48                               ptrdiff_t src_stride,
     49                               uint8* dst,
     50                               int dst_width) {
     51   (void)src_stride;
     52   asm volatile (
     53   "1:                                          \n"
     54     MEMACCESS(0)
     55     "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
     56     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
     57     "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
     58     "uaddlp     v1.8h, v1.16b                  \n"
     59     "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
     60     "rshrn2     v0.16b, v1.8h, #1              \n"
     61     MEMACCESS(1)
     62     "st1        {v0.16b}, [%1], #16            \n"
     63     "b.gt       1b                             \n"
     64   : "+r"(src_ptr),          // %0
     65     "+r"(dst),              // %1
     66     "+r"(dst_width)         // %2
     67   :
     68   : "v0", "v1"     // Clobber List
     69   );
     70 }
     71 
     72 // Read 32x2 average down and write 16x1.
     73 void ScaleRowDown2Box_NEON(const uint8* src_ptr,
     74                            ptrdiff_t src_stride,
     75                            uint8* dst,
     76                            int dst_width) {
     77   asm volatile (
     78     // change the stride to row 2 pointer
     79     "add        %1, %1, %0                     \n"
     80   "1:                                          \n"
     81     MEMACCESS(0)
     82     "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
     83     MEMACCESS(1)
     84     "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
     85     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
     86     "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
     87     "uaddlp     v1.8h, v1.16b                  \n"
     88     "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
     89     "uadalp     v1.8h, v3.16b                  \n"
     90     "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
     91     "rshrn2     v0.16b, v1.8h, #2              \n"
     92     MEMACCESS(2)
     93     "st1        {v0.16b}, [%2], #16            \n"
     94     "b.gt       1b                             \n"
     95   : "+r"(src_ptr),          // %0
     96     "+r"(src_stride),       // %1
     97     "+r"(dst),              // %2
     98     "+r"(dst_width)         // %3
     99   :
    100   : "v0", "v1", "v2", "v3"     // Clobber List
    101   );
    102 }
    103 
// 1/4 width point sampling: reads 32 source pixels and keeps every 4th
// (byte index 2 of each group of 4, i.e. the v2 lane of the ld4
// de-interleave), writing 8 destination pixels per loop iteration.
// src_stride is unused (single-row operation).
void ScaleRowDown4_NEON(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    // de-interleave 32 bytes into v0..v3; v2 holds bytes 2, 6, 10, ...
    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS(1)
    "st1     {v2.8b}, [%1], #8                 \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
    124 
// 1/4 box filter: averages each 4x4 block of source pixels (16 bytes
// across 4 rows) into one destination pixel with rounding
// ((sum + 8) >> 4).  Reads 16x4 source pixels and writes 4 destination
// pixels per loop iteration.
void ScaleRowDown4Box_NEON(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  // Row pointers for source rows 1..3 below src_ptr.
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
    MEMACCESS(3)
    "ld1     {v1.16b}, [%2], #16               \n"
    MEMACCESS(4)
    "ld1     {v2.16b}, [%3], #16               \n"
    MEMACCESS(5)
    "ld1     {v3.16b}, [%4], #16               \n"
    "subs    %w5, %w5, #4                      \n"
    // Vertical sum: pairwise-add row 0, then accumulate rows 1..3.
    "uaddlp  v0.8h, v0.16b                     \n"
    "uadalp  v0.8h, v1.16b                     \n"
    "uadalp  v0.8h, v2.16b                     \n"
    "uadalp  v0.8h, v3.16b                     \n"
    // Horizontal pairwise add completes the 4x4 sums.
    "addp    v0.8h, v0.8h, v0.8h               \n"
    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
    MEMACCESS(1)
    "st1    {v0.s}[0], [%1], #4                \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_ptr1),  // %2
    "+r"(src_ptr2),  // %3
    "+r"(src_ptr3),  // %4
    "+r"(dst_width)  // %5
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
    162 
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels: of each group of 4 source
// pixels, keeps indices 0, 1 and 3 (v2 is overwritten with v3 before
// the 3-way interleaved store).  src_stride is unused.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
    "subs      %w2, %w2, #24                           \n"
    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
    187 
// 3/4 horizontal box scale with a weighted vertical blend: the two input
// rows are combined as (3 * row0 + row1 + 2) >> 2, then each group of 4
// blended pixels is filtered down to 3 outputs with [3,1], [1,1] and
// [1,3] taps.  Reads 32x2 source pixels, writes 24 per loop iteration.
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    "movi      v20.8b, #3                              \n"  // weight for umlal
    "add       %3, %3, %0                              \n"  // %3 = row 1 pointer
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
    "subs         %w2, %w2, #24                        \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "ushll     v16.8h, v4.8b, #0                       \n"
    "ushll     v17.8h, v5.8b, #0                       \n"
    "ushll     v18.8h, v6.8b, #0                       \n"
    "ushll     v19.8h, v7.8b, #0                       \n"

    // 3 * line_0 + line_1
    "umlal     v16.8h, v0.8b, v20.8b                   \n"
    "umlal     v17.8h, v1.8b, v20.8b                   \n"
    "umlal     v18.8h, v2.8b, v20.8b                   \n"
    "umlal     v19.8h, v3.8b, v20.8b                   \n"

    // (3 * line_0 + line_1) >> 2
    "uqrshrn   v0.8b, v16.8h, #2                       \n"
    "uqrshrn   v1.8b, v17.8h, #2                       \n"
    "uqrshrn   v2.8b, v18.8h, #2                       \n"
    "uqrshrn   v3.8b, v19.8h, #2                       \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "ushll     v16.8h, v1.8b, #0                       \n"
    "umlal     v16.8h, v0.8b, v20.8b                   \n"
    "uqrshrn   v0.8b, v16.8h, #2                       \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "urhadd    v1.8b, v1.8b, v2.8b                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "ushll     v16.8h, v2.8b, #0                       \n"
    "umlal     v16.8h, v3.8b, v20.8b                   \n"
    "uqrshrn   v2.8b, v16.8h, #2                       \n"

    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"

    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
    "v20", "memory", "cc"
  );
}
    248 
// 3/4 horizontal box scale with an even vertical blend: the two input
// rows are first averaged ((row0 + row1 + 1) >> 1), then each group of 4
// averaged pixels is filtered down to 3 outputs with [3,1], [1,1] and
// [1,3] taps.  Reads 32x2 source pixels, writes 24 per loop iteration.
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    "movi      v20.8b, #3                              \n"  // weight for umlal
    "add       %3, %3, %0                              \n"  // %3 = row 1 pointer
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
    "subs         %w2, %w2, #24                        \n"
    // average src line 0 with src line 1
    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
    "urhadd    v3.8b, v3.8b, v7.8b                     \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "ushll     v4.8h, v1.8b, #0                        \n"
    "umlal     v4.8h, v0.8b, v20.8b                    \n"
    "uqrshrn   v0.8b, v4.8h, #2                        \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "urhadd    v1.8b, v1.8b, v2.8b                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "ushll     v4.8h, v2.8b, #0                        \n"
    "umlal     v4.8h, v3.8b, v20.8b                    \n"
    "uqrshrn   v2.8b, v4.8h, #2                        \n"

    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
  );
}
    292 
// TBL shuffle: selects source bytes 0,3,6,... to point-sample 12 output
// pixels from 32 input pixels (3/8 width).  Last 4 entries are padding.
static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
// TBL shuffle used by the 38_2/38_3 box filters to interleave the 12
// result bytes out of three registers (indices 16+ / 32+ address the
// second / third register of a 3-register TBL).
static uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
                          34, 6,  22, 35, 0,  0,  0, 0};
// Fixed-point reciprocals for sqrdmulh (doubling high-half multiply):
// 65536/12 divides a 6-pixel sum by 6; 65536/18 divides a 9-pixel sum
// by 9.  Used because the divisors are not powers of two.
static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
                             65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
                             65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
    300 
// 32 -> 12
// 3/8 width point sampling: a single table lookup (kShuf38) picks 12 of
// every 32 source pixels.  The 12 result bytes are stored as an 8-byte
// store plus a 4-byte lane store.  src_stride is unused.
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    MEMACCESS(3)
    "ld1       {v3.16b}, [%3]                          \n"  // load kShuf38
  "1:                                                  \n"
    MEMACCESS(0)
    "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
    "subs      %w2, %w2, #12                           \n"
    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
    MEMACCESS(1)
    "st1       {v2.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v2.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
    327 
// 32x3 -> 12x1
// 3/8 box filter over three source rows.  Each group of 8 source columns
// yields 3 outputs: the first two average 3x3 source pixels (multiply by
// 65536/18 == kMult38_Div9 via sqrdmulh), the third averages the
// remaining 2x3 pixels (65536/12 == kMult38_Div6).  kShuf38_2 gathers
// the 12 result bytes from v0/v1/v2 with a 3-register TBL.
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr,
                                      int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;  // row 2 pointer
  ptrdiff_t tmp_src_stride = src_stride;  // mutable copy; becomes row 1 ptr

  asm volatile (
    // Load the constants once: v29 = Div6, v30 = shuffle, v31 = Div9.
    MEMACCESS(5)
    "ld1       {v29.8h}, [%5]                          \n"
    MEMACCESS(6)
    "ld1       {v30.16b}, [%6]                         \n"
    MEMACCESS(7)
    "ld1       {v31.8h}, [%7]                          \n"
    "add       %2, %2, %0                              \n"  // %2 = row 1 pointer
  "1:                                                  \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
    MEMACCESS(4)
    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
    "subs      %w4, %w4, #12                           \n"

    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1      v20.8b, v0.8b, v1.8b                    \n"
    "trn2      v21.8b, v0.8b, v1.8b                    \n"
    "trn1      v22.8b, v4.8b, v5.8b                    \n"
    "trn2      v23.8b, v4.8b, v5.8b                    \n"
    "trn1      v24.8b, v16.8b, v17.8b                  \n"
    "trn2      v25.8b, v16.8b, v17.8b                  \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1      v0.8b, v2.8b, v3.8b                     \n"
    "trn2      v1.8b, v2.8b, v3.8b                     \n"
    "trn1      v4.8b, v6.8b, v7.8b                     \n"
    "trn2      v5.8b, v6.8b, v7.8b                     \n"
    "trn1      v16.8b, v18.8b, v19.8b                  \n"
    "trn2      v17.8b, v18.8b, v19.8b                  \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp    v20.4h, v20.8b                          \n"
    "uaddlp    v21.4h, v21.8b                          \n"
    "uaddlp    v22.4h, v22.8b                          \n"
    "uaddlp    v23.4h, v23.8b                          \n"
    "uaddlp    v24.4h, v24.8b                          \n"
    "uaddlp    v25.4h, v25.8b                          \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp    v1.4h, v1.8b                            \n"
    "uaddlp    v5.4h, v5.8b                            \n"
    "uaddlp    v17.4h, v17.8b                          \n"

    // combine source lines
    "add       v20.4h, v20.4h, v22.4h                  \n"
    "add       v21.4h, v21.4h, v23.4h                  \n"
    "add       v20.4h, v20.4h, v24.4h                  \n"
    "add       v21.4h, v21.4h, v25.4h                  \n"
    "add       v2.4h, v1.4h, v5.4h                     \n"
    "add       v2.4h, v2.4h, v17.4h                    \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
    "xtn       v2.8b,  v2.8h                           \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "ushll     v16.8h, v16.8b, #0                      \n"
    "uaddl     v0.8h, v0.8b, v4.8b                     \n"

    // combine source lines
    "add       v0.8h, v0.8h, v16.8h                    \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1      v1.8h, v0.8h, v0.8h                     \n"
    "trn2      v4.8h, v0.8h, v0.8h                     \n"
    "xtn       v0.4h, v1.4s                            \n"
    "xtn       v4.4h, v4.4s                            \n"

    // 0+1+2, 3+4+5
    "add       v20.8h, v20.8h, v0.8h                   \n"
    "add       v21.8h, v21.8h, v4.8h                   \n"

    // Need to divide, but can't downshift as the the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

    MEMACCESS(1)
    "st1       {v3.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(tmp_src_stride),   // %2
    "+r"(src_ptr1),         // %3
    "+r"(dst_width)         // %4
  : "r"(&kMult38_Div6),     // %5
    "r"(&kShuf38_2),        // %6
    "r"(&kMult38_Div9)      // %7
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
    "v30", "v31", "memory", "cc"
  );
}
    457 
// 32x2 -> 12x1
// 3/8 box filter over two source rows.  Each group of 8 source columns
// yields 3 outputs: the first two average 3x2 source pixels (multiply by
// 65536/12 == kMult38_Div6 via sqrdmulh), the third averages the
// remaining 2x2 pixels with a rounding shift (>> 2).  kShuf38_2 gathers
// the 12 result bytes from v0/v1/v2 with a 3-register TBL.
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;  // mutable copy; becomes row 1 ptr
  asm volatile (
    // Load the constants once: v30 = Div6 multipliers, v31 = shuffle.
    MEMACCESS(4)
    "ld1       {v30.8h}, [%4]                          \n"
    MEMACCESS(5)
    "ld1       {v31.16b}, [%5]                         \n"
    "add       %2, %2, %0                              \n"  // %2 = row 1 pointer
  "1:                                                  \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
    "subs      %w3, %w3, #12                           \n"

    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1      v16.8b, v0.8b, v1.8b                    \n"
    "trn2      v17.8b, v0.8b, v1.8b                    \n"
    "trn1      v18.8b, v4.8b, v5.8b                    \n"
    "trn2      v19.8b, v4.8b, v5.8b                    \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1      v0.8b, v2.8b, v3.8b                     \n"
    "trn2      v1.8b, v2.8b, v3.8b                     \n"
    "trn1      v4.8b, v6.8b, v7.8b                     \n"
    "trn2      v5.8b, v6.8b, v7.8b                     \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp    v16.4h, v16.8b                          \n"
    "uaddlp    v17.4h, v17.8b                          \n"
    "uaddlp    v18.4h, v18.8b                          \n"
    "uaddlp    v19.4h, v19.8b                          \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp    v1.4h, v1.8b                            \n"
    "uaddlp    v5.4h, v5.8b                            \n"

    // combine source lines
    "add       v16.4h, v16.4h, v18.4h                  \n"
    "add       v17.4h, v17.4h, v19.4h                  \n"
    "add       v2.4h, v1.4h, v5.4h                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "uqrshrn   v2.8b, v2.8h, #2                        \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

    // combine source lines
    "uaddl     v0.8h, v0.8b, v4.8b                     \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1      v1.8h, v0.8h, v0.8h                     \n"
    "trn2      v4.8h, v0.8h, v0.8h                     \n"
    "xtn       v0.4h, v1.4s                            \n"
    "xtn       v4.4h, v4.4s                            \n"

    // 0+1+2, 3+4+5
    "add       v16.8h, v16.8h, v0.8h                   \n"
    "add       v17.8h, v17.8h, v4.8h                   \n"

    // Need to divide, but can't downshift as the the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent

    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

    MEMACCESS(1)
    "st1       {v3.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),         // %0
    "+r"(dst_ptr),         // %1
    "+r"(tmp_src_stride),  // %2
    "+r"(dst_width)        // %3
  : "r"(&kMult38_Div6),    // %4
    "r"(&kShuf38_2)        // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v30", "v31", "memory", "cc"
  );
}
    565 
// Vertically accumulates src_height rows of 8-bit pixels into a row of
// 16-bit sums: dst_ptr[i] = sum over y of src_ptr[i + y * src_stride].
// Outer loop walks 16 columns at a time; inner loop walks the rows.
void ScaleAddRows_NEON(const uint8* src_ptr,
                       ptrdiff_t src_stride,
                       uint16* dst_ptr,
                       int src_width,
                       int src_height) {
  const uint8* src_tmp;  // per-column-strip row cursor, set inside the asm
  asm volatile (
  "1:                                          \n"
    "mov       %0, %1                          \n"  // restart at top row
    "mov       w12, %w5                        \n"  // w12 = row counter
    "eor       v2.16b, v2.16b, v2.16b          \n"  // zero the accumulators
    "eor       v3.16b, v3.16b, v3.16b          \n"
  "2:                                          \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "ld1       {v0.16b}, [%0], %3              \n"  // advance by src_stride
    "uaddw2    v3.8h, v3.8h, v0.16b            \n"  // widen-add high 8 bytes
    "uaddw     v2.8h, v2.8h, v0.8b             \n"  // widen-add low 8 bytes
    "subs      w12, w12, #1                    \n"
    "b.gt      2b                              \n"
    MEMACCESS(2)
    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
    "add      %1, %1, #16                      \n"  // next 16-column strip
    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
    "b.gt     1b                               \n"
  : "=&r"(src_tmp),    // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  // NOTE(review): v1 is listed but not used by the asm; harmless.
  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
    601 
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads the source pixel pair at offset x >> 16 into lane n of v4/v5
// (v4 = pixel, v5 = following pixel for interpolation) and advances x
// by dx.  Operand numbers refer to ScaleFilterCols_NEON's list:
// %1 = src_ptr, %3 = x (16.16 fixed point), %4 = dx, %5/%6 = scratch.
#define LOAD2_DATA8_LANE(n)                                 \
  "lsr        %5, %3, #16                    \n"            \
  "add        %6, %1, %5                     \n"            \
  "add        %3, %3, %4                     \n"            \
  MEMACCESS(6)                                              \
  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
// clang-format on
    612 
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

// Horizontally scales one row with bilinear filtering: output pixel i
// samples source position (x + i * dx) >> 16 (16.16 fixed point) and
// blends that pixel with the next one using the fractional bits of the
// position.  Processes 8 output pixels per loop iteration.
void ScaleFilterCols_NEON(uint8* dst_ptr,
                          const uint8* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // lane offsets for the first 4 x values
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64)x;
  int64 dx64 = (int64)dx;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v1.4s, v1.4s, v0.4s            \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add        v2.4s, v1.4s, v3.4s            \n"
    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
  "1:                                          \n"
    // Gather 8 (pixel, next-pixel) pairs into v4/v5 lanes; each load
    // also steps x forward by dx.
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov       v6.16b, v1.16b                  \n"
    "mov       v7.16b, v2.16b                  \n"
    // Pack the low (fractional) 16 bits of the eight x positions.
    "uzp1      v6.8h, v6.8h, v7.8h             \n"
    "ushll     v4.8h, v4.8b, #0                \n"
    "ushll     v5.8h, v5.8b, #0                \n"
    // (b - a) * f, then >> 16 with rounding, then + a (the BLENDER).
    "ssubl     v16.4s, v5.4h, v4.4h            \n"
    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
    "ushll     v7.4s, v6.4h, #0                \n"
    "ushll2    v6.4s, v6.8h, #0                \n"
    "mul       v16.4s, v16.4s, v7.4s           \n"
    "mul       v17.4s, v17.4s, v6.4s           \n"
    "rshrn     v6.4h, v16.4s, #16              \n"
    "rshrn2    v6.8h, v17.4s, #16              \n"
    "add       v4.8h, v4.8h, v6.8h             \n"
    "xtn       v4.8b, v4.8h                    \n"

    MEMACCESS(0)
    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
    "add       v1.4s, v1.4s, v0.4s             \n"
    "add       v2.4s, v2.4s, v0.4s             \n"
    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
    "b.gt      1b                              \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width64),      // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}
    682 
    683 #undef LOAD2_DATA8_LANE
    684 
    685 // 16x2 -> 16x1
    686 void ScaleFilterRows_NEON(uint8* dst_ptr,
    687                           const uint8* src_ptr,
    688                           ptrdiff_t src_stride,
    689                           int dst_width,
    690                           int source_y_fraction) {
    691   int y_fraction = 256 - source_y_fraction;
    692   asm volatile (
    693     "cmp          %w4, #0                      \n"
    694     "b.eq         100f                         \n"
    695     "add          %2, %2, %1                   \n"
    696     "cmp          %w4, #64                     \n"
    697     "b.eq         75f                          \n"
    698     "cmp          %w4, #128                    \n"
    699     "b.eq         50f                          \n"
    700     "cmp          %w4, #192                    \n"
    701     "b.eq         25f                          \n"
    702 
    703     "dup          v5.8b, %w4                   \n"
    704     "dup          v4.8b, %w5                   \n"
    705     // General purpose row blend.
    706   "1:                                          \n"
    707     MEMACCESS(1)
    708     "ld1          {v0.16b}, [%1], #16          \n"
    709     MEMACCESS(2)
    710     "ld1          {v1.16b}, [%2], #16          \n"
    711     "subs         %w3, %w3, #16                \n"
    712     "umull        v6.8h, v0.8b, v4.8b          \n"
    713     "umull2       v7.8h, v0.16b, v4.16b        \n"
    714     "umlal        v6.8h, v1.8b, v5.8b          \n"
    715     "umlal2       v7.8h, v1.16b, v5.16b        \n"
    716     "rshrn        v0.8b, v6.8h, #8             \n"
    717     "rshrn2       v0.16b, v7.8h, #8            \n"
    718     MEMACCESS(0)
    719     "st1          {v0.16b}, [%0], #16          \n"
    720     "b.gt         1b                           \n"
    721     "b            99f                          \n"
    722 
    723     // Blend 25 / 75.
    724   "25:                                         \n"
    725     MEMACCESS(1)
    726     "ld1          {v0.16b}, [%1], #16          \n"
    727     MEMACCESS(2)
    728     "ld1          {v1.16b}, [%2], #16          \n"
    729     "subs         %w3, %w3, #16                \n"
    730     "urhadd       v0.16b, v0.16b, v1.16b       \n"
    731     "urhadd       v0.16b, v0.16b, v1.16b       \n"
    732     MEMACCESS(0)
    733     "st1          {v0.16b}, [%0], #16          \n"
    734     "b.gt         25b                          \n"
    735     "b            99f                          \n"
    736 
    737     // Blend 50 / 50.
    738   "50:                                         \n"
    739     MEMACCESS(1)
    740     "ld1          {v0.16b}, [%1], #16          \n"
    741     MEMACCESS(2)
    742     "ld1          {v1.16b}, [%2], #16          \n"
    743     "subs         %w3, %w3, #16                \n"
    744     "urhadd       v0.16b, v0.16b, v1.16b       \n"
    745     MEMACCESS(0)
    746     "st1          {v0.16b}, [%0], #16          \n"
    747     "b.gt         50b                          \n"
    748     "b            99f                          \n"
    749 
    750     // Blend 75 / 25.
    751   "75:                                         \n"
    752     MEMACCESS(1)
    753     "ld1          {v1.16b}, [%1], #16          \n"
    754     MEMACCESS(2)
    755     "ld1          {v0.16b}, [%2], #16          \n"
    756     "subs         %w3, %w3, #16                \n"
    757     "urhadd       v0.16b, v0.16b, v1.16b       \n"
    758     "urhadd       v0.16b, v0.16b, v1.16b       \n"
    759     MEMACCESS(0)
    760     "st1          {v0.16b}, [%0], #16          \n"
    761     "b.gt         75b                          \n"
    762     "b            99f                          \n"
    763 
    764     // Blend 100 / 0 - Copy row unchanged.
    765   "100:                                        \n"
    766     MEMACCESS(1)
    767     "ld1          {v0.16b}, [%1], #16          \n"
    768     "subs         %w3, %w3, #16                \n"
    769     MEMACCESS(0)
    770     "st1          {v0.16b}, [%0], #16          \n"
    771     "b.gt         100b                         \n"
    772 
    773   "99:                                         \n"
    774     MEMACCESS(0)
    775     "st1          {v0.b}[15], [%0]             \n"
    776   : "+r"(dst_ptr),          // %0
    777     "+r"(src_ptr),          // %1
    778     "+r"(src_stride),       // %2
    779     "+r"(dst_width),        // %3
    780     "+r"(source_y_fraction),// %4
    781     "+r"(y_fraction)        // %5
    782   :
    783   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
    784   );
    785 }
    786 
// 2x1 -> 1x1: scales an ARGB row down by two horizontally by keeping the
// odd-indexed pixels and discarding the even ones.  ld2 with the .4s
// arrangement deinterleaves whole 32-bit ARGB pixels, so v1/v3 receive
// the odd pixels, which are then stored.  Processes 8 output pixels
// (two 32-byte loads) per loop iteration.
// NOTE(review): assumes dst_width is a positive multiple of 8 -- confirm
// against callers.
void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  (void)src_stride;  // point-sampling path: the second source row is unused
  asm volatile (
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS (0)
    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
    MEMACCESS (0)
    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS (1)
    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
    MEMACCESS (1)
    "st1        {v3.16b}, [%1], #16            \n"
    "b.gt       1b                             \n"
  : "+r" (src_ptr),          // %0
    "+r" (dst),              // %1
    "+r" (dst_width)         // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
    812 
// 2x1 -> 1x1 with linear filtering: each output ARGB pixel is the rounded
// average of two adjacent source pixels, computed per channel.  ld4
// deinterleaves the B/G/R/A planes of 16 pixels; uaddlp sums adjacent
// bytes within each plane and rshrn #1 halves the sum with rounding.
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb,
                                  int dst_width) {
  (void)src_stride;  // single-row filter: the second source row is not read
  asm volatile (
  "1:                                          \n"
    MEMACCESS (0)
    // load 8 ARGB pixels.
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
    "rshrn      v1.8b, v1.8h, #1               \n"
    "rshrn      v2.8b, v2.8h, #1               \n"
    "rshrn      v3.8b, v3.8h, #1               \n"
    MEMACCESS (1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),         // %0
    "+r"(dst_argb),         // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
  );
}
    842 
// 2x2 -> 1x1: each output ARGB pixel is the rounded average of a 2x2
// source block.  Per channel: uaddlp sums horizontal pairs of row 0,
// uadalp accumulates the matching pairs of row 1, and rshrn #2 divides
// the four-pixel sum by 4 with rounding.  Processes 8 output pixels per
// loop iteration.
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst,
                               int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS (0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS (1)
    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
    "rshrn      v1.8b, v1.8h, #2               \n"
    "rshrn      v2.8b, v2.8h, #2               \n"
    "rshrn      v3.8b, v3.8h, #2               \n"
    MEMACCESS (2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
    "b.gt       1b                             \n"
  : "+r" (src_ptr),          // %0
    "+r" (src_stride),       // %1
    "+r" (dst),              // %2
    "+r" (dst_width)         // %3
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
  );
}
    879 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// Point-samples every src_stepx-th ARGB pixel: each lane load picks up
// one 32-bit pixel and advances the source pointer by src_stepx * 4
// bytes (%3).  Stores 4 output pixels (16 bytes) per iteration.
// NOTE(review): assumes dst_width is a positive multiple of 4 -- confirm
// against callers.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  (void)src_stride;  // single-row sampler: stride between rows is unused
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.s}[0], [%0], %3            \n"
    MEMACCESS(0)
    "ld1        {v0.s}[1], [%0], %3            \n"
    MEMACCESS(0)
    "ld1        {v0.s}[2], [%0], %3            \n"
    MEMACCESS(0)
    "ld1        {v0.s}[3], [%0], %3            \n"
    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(dst_width)    // %2
  : "r"((int64)(src_stepx * 4)) // %3
  : "memory", "cc", "v0"
  );
}
    909 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
// 2x2 box filter with a column step: each output ARGB pixel is the
// rounded average (rshrn #2) of a 2x2 block -- two adjacent pixels from
// each of two rows -- with src_stepx pixels between sampled blocks.
// The d-lane mov shuffle re-pairs the widened sums (see the ab_cd ->
// ac_bd comments) so horizontal partners line up before the adds.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  asm volatile (
    // %1 becomes the pointer to the second source row.
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "ld1        {v1.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v2.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v3.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v4.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v5.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v6.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v7.8b}, [%1], %4              \n"
    "uaddl      v0.8h, v0.8b, v1.8b            \n"
    "uaddl      v2.8h, v2.8b, v3.8b            \n"
    "uaddl      v4.8h, v4.8b, v5.8b            \n"
    "uaddl      v6.8h, v6.8b, v7.8b            \n"
    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
    "mov        v0.d[1], v2.d[0]               \n"
    "mov        v2.d[0], v16.d[1]              \n"
    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
    "mov        v4.d[1], v6.d[0]               \n"
    "mov        v6.d[0], v16.d[1]              \n"
    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "st1     {v0.16b}, [%2], #16               \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"((int64)(src_stepx * 4)) // %4
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
    964 
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads one 32-bit ARGB pixel into lane n of vector vn from
// src_argb + (x >> 16) * 4, then advances x by dx (16.16 fixed point).
// %5 is a scratch register; %6 holds the computed source address.
#define LOAD1_DATA32_LANE(vn, n)                            \
  "lsr        %5, %3, #16                    \n"            \
  "add        %6, %1, %5, lsl #2             \n"            \
  "add        %3, %3, %4                     \n"            \
  MEMACCESS(6)                                              \
 "ld1        {" #vn ".s}[" #n "], [%6]       \n"
// clang-format on
    975 
// Point-sample (nearest-neighbor) ARGB column scaler.  For each output
// pixel, reads the source pixel at index x >> 16 and steps x by dx
// (16.16 fixed point).  Gathers 8 pixels per iteration into v0/v1 lanes
// via LOAD1_DATA32_LANE, then stores them contiguously.
// NOTE(review): assumes dst_width is a positive multiple of 8 -- confirm
// against callers.
void ScaleARGBCols_NEON(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64)x;
  int64 dx64 = (int64)dx;
  int64 tmp64;  // scratch for the x >> 16 index computed inside the macro
  asm volatile (
  "1:                                          \n"
    LOAD1_DATA32_LANE(v0, 0)
    LOAD1_DATA32_LANE(v0, 1)
    LOAD1_DATA32_LANE(v0, 2)
    LOAD1_DATA32_LANE(v0, 3)
    LOAD1_DATA32_LANE(v1, 0)
    LOAD1_DATA32_LANE(v1, 1)
    LOAD1_DATA32_LANE(v1, 2)
    LOAD1_DATA32_LANE(v1, 3)

    MEMACCESS(0)
    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    "b.gt        1b                            \n"
  : "+r"(dst_argb),     // %0
    "+r"(src_argb),     // %1
    "+r"(dst_width64),  // %2
    "+r"(x64),          // %3
    "+r"(dx64),         // %4
    "=&r"(tmp64),       // %5
    "+r"(src_tmp)       // %6
  :
  : "memory", "cc", "v0", "v1"
  );
}
   1012 
   1013 #undef LOAD1_DATA32_LANE
   1014 
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads two adjacent 32-bit ARGB pixels (src[x >> 16] and the pixel
// after it) into lane n of vn1/vn2 respectively, then advances x by dx
// (16.16 fixed point).  %5 is scratch; %6 is the computed source address.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                             \
  "lsr        %5, %3, #16                           \n"            \
  "add        %6, %1, %5, lsl #2                    \n"            \
  "add        %3, %3, %4                            \n"            \
  MEMACCESS(6)                                                     \
  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
// clang-format on
   1025 
// Bilinear ARGB column scaler.  For each output pixel, loads the adjacent
// source pixel pair at x >> 16 and blends them with the 7-bit fraction
// taken from bits 9..15 of x (shrn #9 then mask 0x7f), stepping x by dx
// in 16.16 fixed point.  Per byte, the blend is
//   (a * (0x7f ^ f) + b * f) >> 7.
// Processes 4 output pixels per iteration.
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
                              const uint8* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // lane multipliers for the first 4 x's
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64)x;
  int64 dx64 = (int64)dx;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    "movi       v3.16b, #0x7f                  \n"  // 0x7F
    "movi       v4.8h, #0x7f                   \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v5.4s, v1.4s, v0.4s            \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    // Extract each pixel's 7-bit fraction from v5 and splat it across
    // that pixel's 4 bytes, assembling the per-byte weight vector f.
    "shrn       v2.4h, v5.4s, #9               \n"
    "and        v2.8b, v2.8b, v4.8b            \n"
    "dup        v16.8b, v2.b[0]                \n"
    "dup        v17.8b, v2.b[2]                \n"
    "dup        v18.8b, v2.b[4]                \n"
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    "umull      v16.8h, v0.8b, v7.8b           \n"  // a * (0x7f ^ f)
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"  // b * f
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    "shrn       v0.8b, v16.8h, #7              \n"  // narrow back to bytes
    "shrn2      v0.16b, v17.8h, #7             \n"

    MEMACCESS(0)
    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
    "add     v5.4s, v5.4s, v6.4s               \n"
    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
    "b.gt    1b                                \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width64),      // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}
   1090 
   1091 #undef LOAD2_DATA32_LANE
   1092 
   1093 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
   1094 
   1095 #ifdef __cplusplus
   1096 }  // extern "C"
   1097 }  // namespace libyuv
   1098 #endif
   1099