Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 
     13 #ifdef __cplusplus
     14 namespace libyuv {
     15 extern "C" {
     16 #endif
     17 
     18 // This module is for GCC Neon armv8 64 bit.
     19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     20 
     21 // Read 8 Y, 4 U and 4 V from 422
     // Result: v0.8b = 8 Y bytes from [%0], v1.s[0] = 4 U bytes from [%1],
     // v1.s[1] = 4 V bytes from [%2].  Each pointer operand is post-incremented
     // by the number of bytes read.
     22 #define READYUV422                                                             \
     23     MEMACCESS(0)                                                               \
     24     "ld1        {v0.8b}, [%0], #8              \n"                             \
     25     MEMACCESS(1)                                                               \
     26     "ld1        {v1.s}[0], [%1], #4            \n"                             \
     27     MEMACCESS(2)                                                               \
     28     "ld1        {v1.s}[1], [%2], #4            \n"
     29 
     30 // Read 8 Y, 2 U and 2 V from 411
     // v0.8b = 8 Y bytes from [%0].  Two U bytes from [%1] go to v2.h[0] and two
     // V bytes from [%2] to v2.h[1]; zip1 then doubles every byte so that
     // v1.8b = U0 U0 U1 U1 V0 V0 V1 V1, the 4 U + 4 V layout the other readers
     // produce.  v2 is used as scratch.
     31 #define READYUV411                                                             \
     32     MEMACCESS(0)                                                               \
     33     "ld1        {v0.8b}, [%0], #8              \n"                             \
     34     MEMACCESS(1)                                                               \
     35     "ld1        {v2.h}[0], [%1], #2            \n"                             \
     36     MEMACCESS(2)                                                               \
     37     "ld1        {v2.h}[1], [%2], #2            \n"                             \
     38     "zip1       v1.8b, v2.8b, v2.8b            \n"
     39 
     40 // Read 8 Y, 8 U and 8 V from 444
     // v0.8b = 8 Y bytes from [%0]; 8 U bytes from [%1] land in v1.d[0] and 8 V
     // bytes from [%2] in v1.d[1].  uaddlp sums each adjacent byte pair and
     // rshrn #1 halves the sums with rounding, so v1.8b ends up holding 4 averaged
     // U followed by 4 averaged V.
     41 #define READYUV444                                                             \
     42     MEMACCESS(0)                                                               \
     43     "ld1        {v0.8b}, [%0], #8              \n"                             \
     44     MEMACCESS(1)                                                               \
     45     "ld1        {v1.d}[0], [%1], #8            \n"                             \
     46     MEMACCESS(2)                                                               \
     47     "ld1        {v1.d}[1], [%2], #8            \n"                             \
     48     "uaddlp     v1.8h, v1.16b                  \n"                             \
     49     "rshrn      v1.8b, v1.8h, #1               \n"
     50 
     51 // Read 8 Y, and set 4 U and 4 V to 128
     // v0.8b = 8 Y bytes from [%0] (pointer post-incremented by 8); every byte of
     // v1.8b is set to 128.  No chroma pointer operand is used.
     52 #define READYUV400                                                             \
     53     MEMACCESS(0)                                                               \
     54     "ld1        {v0.8b}, [%0], #8              \n"                             \
     55     "movi       v1.8b , #128                   \n"
     56 
     57 // Read 8 Y and 4 UV from NV12
     // v0.8b = 8 Y bytes from [%0]; v2.8b = 8 interleaved chroma bytes from [%1].
     // uzp1 gathers the even bytes (U) into v1, uzp2 the odd bytes (V) into v3, and
     // ins copies the 4 V bytes into v1.s[1], giving v1.8b = 4 U followed by 4 V.
     // v2 and v3 are scratch.
     58 #define READNV12                                                               \
     59     MEMACCESS(0)                                                               \
     60     "ld1        {v0.8b}, [%0], #8              \n"                             \
     61     MEMACCESS(1)                                                               \
     62     "ld1        {v2.8b}, [%1], #8              \n"                             \
     63     "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
     64     "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
     65     "ins        v1.s[1], v3.s[0]               \n"
     66 
     67 // Read 8 Y and 4 VU from NV21
     // Same as the NV12 reader with the de-interleave destinations swapped: the
     // even bytes (V) go to v3, the odd bytes (U) to v1, and ins then places the
     // 4 V bytes in v1.s[1], so v1.8b = 4 U followed by 4 V.  v2 and v3 are
     // scratch.
     68 #define READNV21                                                               \
     69     MEMACCESS(0)                                                               \
     70     "ld1        {v0.8b}, [%0], #8              \n"                             \
     71     MEMACCESS(1)                                                               \
     72     "ld1        {v2.8b}, [%1], #8              \n"                             \
     73     "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
     74     "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
     75     "ins        v1.s[1], v3.s[0]               \n"
     76 
     77 // Read 8 YUY2
     // ld2 splits 16 packed bytes from [%0]: even bytes (Y) into v0.8b, odd bytes
     // (U and V alternating) into v1.8b.  uzp2/uzp1 separate V (v3) and U (v1), and
     // ins moves the 4 V bytes to v1.s[1], so v1.8b = 4 U followed by 4 V.
     // v3 is scratch.
     78 #define READYUY2                                                               \
     79     MEMACCESS(0)                                                               \
     80     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
     81     "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
     82     "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
     83     "ins        v1.s[1], v3.s[0]               \n"
     84 
     85 // Read 8 UYVY
     // ld2 splits 16 packed bytes from [%0]: even bytes (U and V alternating) into
     // v2.8b, odd bytes (Y) into v3.8b.  orr copies Y to v0 before v3 is reused;
     // uzp1/uzp2 then separate U (v1) and V (v3) and ins moves the 4 V bytes to
     // v1.s[1], so v1.8b = 4 U followed by 4 V.  v2 and v3 are scratch.
     86 #define READUYVY                                                               \
     87     MEMACCESS(0)                                                               \
     88     "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
     89     "orr        v0.8b, v3.8b, v3.8b            \n"                             \
     90     "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
     91     "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
     92     "ins        v1.s[1], v3.s[0]               \n"
     93 
     94 #define YUV422TORGB_SETUP_REG                                                  \
     95     "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
     96     "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
     97     "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
     98     "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
     99     "movi       v27.8h, #128                   \n"                             \
    100     "movi       v28.8h, #102                   \n"                             \
    101     "movi       v29.8h, #25                    \n"                             \
    102     "movi       v30.8h, #52                    \n"
    103 
    // Converts 8 pixels from YUV to RGB.
    // Inputs : v0.8b = 8 Y bytes, v1.8b = 4 U bytes followed by 4 V bytes, and the
    //          constants v24-v31 loaded by YUV422TORGB_SETUP_REG.
    // Outputs: vR, vG, vB (as .8b) receive 8 red, green and blue bytes.
    // Scratch: v0-v3 and v5-v7 are overwritten.
    // Steps  : Y is widened, multiplied by v31 and narrowed with >> 16; every
    //          chroma byte is doubled so one sample covers two pixels; then
    //            B = v24 + Y + U * v27
    //            G = v25 + Y - (V * v30 + U * v29)
    //            R = v26 + Y + V * v28
    //          with saturating 16-bit arithmetic, followed by a saturating
    //          unsigned narrowing shift right by 6.
    104 #define YUV422TORGB(vR, vG, vB)                                                \
    105     "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
    106     "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
    107     "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
    108     "ushll      v0.4s, v0.4h, #0               \n"                             \
    109     "mul        v3.4s, v3.4s, v31.4s           \n"                             \
    110     "mul        v0.4s, v0.4s, v31.4s           \n"                             \
    111     "sqshrun    v0.4h, v0.4s, #16              \n"                             \
    112     "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
    113     "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
    114     "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
    115     "uxtl       v2.8h, v2.8b                   \n"                             \
    116     "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
    117     "mul        v3.8h, v1.8h, v27.8h           \n"                             \
    118     "mul        v5.8h, v1.8h, v29.8h           \n"                             \
    119     "mul        v6.8h, v2.8h, v30.8h           \n"                             \
    120     "mul        v7.8h, v2.8h, v28.8h           \n"                             \
    121     "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
    122     "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
    123     "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
    124     "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
    125     "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
    126     "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
    127     "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
    128     "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
    129     "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
    130     "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
    131 
    132 // YUV to RGB conversion constants.
    // All values are fixed point with 6 fractional bits (scaled by 64); the
    // conversion macro removes the scale with a final shift right by 6.
    133 // Y contribution to R,G,B.  Scale and bias.
    134 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
    135 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
    136 
    137 // U and V contributions to R,G,B.
    // The magnitudes 128, 25, 52 and 102 also appear as movi immediates in
    // YUV422TORGB_SETUP_REG and have to be kept in step with these definitions.
    138 #define UB -128 /* -min(128, round(2.018 * 64)) */
    139 #define UG 25 /* -round(-0.391 * 64) */
    140 #define VG 52 /* -round(-0.813 * 64) */
    141 #define VR -102 /* -round(1.596 * 64) */
    142 
    143 // Bias values to subtract 16 from Y and 128 from U and V.
    144 #define BB (UB * 128            - YGB)
    145 #define BG (UG * 128 + VG * 128 - YGB)
    146 #define BR            (VR * 128 - YGB)
    147 
    // Bias table in B, G, R order; YUV422TORGB_SETUP_REG reads the first three
    // 16-bit elements, the remaining elements are zero.
    148 static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
    // Y multiplier; YUV422TORGB_SETUP_REG broadcasts the first 32-bit element.
    149 static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
    150 
    // The helper macros are only needed to build the two tables above.
    151 #undef YG
    152 #undef YGB
    153 #undef UB
    154 #undef UG
    155 #undef VG
    156 #undef VR
    157 #undef BB
    158 #undef BG
    159 #undef BR
    160 
    // Loads six constants into v20-v25: five 16-bit coefficient magnitudes
    // (56, 37, 19, 9, 47) in v20-v24 and the byte pattern 0x80 in every lane of
    // v25.  The asm that consumes them lies outside this part of the file.
    161 #define RGBTOUV_SETUP_REG                                                      \
    162     "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
    163     "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
    164     "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
    165     "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
    166     "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
    167     "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
    168 
    169 
    170 #ifdef HAS_I444TOARGBROW_NEON
    171 void I444ToARGBRow_NEON(const uint8* src_y,
    172                         const uint8* src_u,
    173                         const uint8* src_v,
    174                         uint8* dst_argb,
    175                         int width) {
    176   asm volatile (
    177     YUV422TORGB_SETUP_REG
    178   "1:                                          \n"
    179     READYUV444
    180     YUV422TORGB(v22, v21, v20)
    181     "subs       %w4, %w4, #8                 \n"
    182     "movi       v23.8b, #255                   \n" /* A */
    183     MEMACCESS(3)
    184     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    185     "b.gt       1b                             \n"
    186     : "+r"(src_y),     // %0
    187       "+r"(src_u),     // %1
    188       "+r"(src_v),     // %2
    189       "+r"(dst_argb),  // %3
    190       "+r"(width)      // %4
    191     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    192       [kYToRgb]"r"(&kYToRgb)
    193     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    194       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    195   );
    196 }
    197 #endif  // HAS_I444TOARGBROW_NEON
    198 
    199 #ifdef HAS_I422TOARGBROW_NEON
    200 void I422ToARGBRow_NEON(const uint8* src_y,
    201                         const uint8* src_u,
    202                         const uint8* src_v,
    203                         uint8* dst_argb,
    204                         int width) {
    205   asm volatile (
    206     YUV422TORGB_SETUP_REG
    207   "1:                                          \n"
    208     READYUV422
    209     YUV422TORGB(v22, v21, v20)
    210     "subs       %w4, %w4, #8                   \n"
    211     "movi       v23.8b, #255                   \n" /* A */
    212     MEMACCESS(3)
    213     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    214     "b.gt       1b                             \n"
    215     : "+r"(src_y),     // %0
    216       "+r"(src_u),     // %1
    217       "+r"(src_v),     // %2
    218       "+r"(dst_argb),  // %3
    219       "+r"(width)      // %4
    220     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    221       [kYToRgb]"r"(&kYToRgb)
    222     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    223       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    224   );
    225 }
    226 #endif  // HAS_I422TOARGBROW_NEON
    227 
    228 #ifdef HAS_I411TOARGBROW_NEON
    229 void I411ToARGBRow_NEON(const uint8* src_y,
    230                         const uint8* src_u,
    231                         const uint8* src_v,
    232                         uint8* dst_argb,
    233                         int width) {
    234   asm volatile (
    235     YUV422TORGB_SETUP_REG
    236   "1:                                          \n"
    237     READYUV411
    238     YUV422TORGB(v22, v21, v20)
    239     "subs       %w4, %w4, #8                   \n"
    240     "movi       v23.8b, #255                   \n" /* A */
    241     MEMACCESS(3)
    242     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    243     "b.gt       1b                             \n"
    244     : "+r"(src_y),     // %0
    245       "+r"(src_u),     // %1
    246       "+r"(src_v),     // %2
    247       "+r"(dst_argb),  // %3
    248       "+r"(width)      // %4
    249     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    250       [kYToRgb]"r"(&kYToRgb)
    251     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    252       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    253   );
    254 }
    255 #endif  // HAS_I411TOARGBROW_NEON
    256 
    257 #ifdef HAS_I422TOBGRAROW_NEON
    258 void I422ToBGRARow_NEON(const uint8* src_y,
    259                         const uint8* src_u,
    260                         const uint8* src_v,
    261                         uint8* dst_bgra,
    262                         int width) {
    263   asm volatile (
    264     YUV422TORGB_SETUP_REG
    265   "1:                                          \n"
    266     READYUV422
    267     YUV422TORGB(v21, v22, v23)
    268     "subs       %w4, %w4, #8                   \n"
    269     "movi       v20.8b, #255                   \n" /* A */
    270     MEMACCESS(3)
    271     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    272     "b.gt       1b                             \n"
    273     : "+r"(src_y),     // %0
    274       "+r"(src_u),     // %1
    275       "+r"(src_v),     // %2
    276       "+r"(dst_bgra),  // %3
    277       "+r"(width)      // %4
    278     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    279       [kYToRgb]"r"(&kYToRgb)
    280     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    281       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    282   );
    283 }
    284 #endif  // HAS_I422TOBGRAROW_NEON
    285 
    286 #ifdef HAS_I422TOABGRROW_NEON
    287 void I422ToABGRRow_NEON(const uint8* src_y,
    288                         const uint8* src_u,
    289                         const uint8* src_v,
    290                         uint8* dst_abgr,
    291                         int width) {
    292   asm volatile (
    293     YUV422TORGB_SETUP_REG
    294   "1:                                          \n"
    295     READYUV422
    296     YUV422TORGB(v20, v21, v22)
    297     "subs       %w4, %w4, #8                   \n"
    298     "movi       v23.8b, #255                   \n" /* A */
    299     MEMACCESS(3)
    300     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    301     "b.gt       1b                             \n"
    302     : "+r"(src_y),     // %0
    303       "+r"(src_u),     // %1
    304       "+r"(src_v),     // %2
    305       "+r"(dst_abgr),  // %3
    306       "+r"(width)      // %4
    307     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    308       [kYToRgb]"r"(&kYToRgb)
    309     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    310       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    311   );
    312 }
    313 #endif  // HAS_I422TOABGRROW_NEON
    314 
    315 #ifdef HAS_I422TORGBAROW_NEON
    316 void I422ToRGBARow_NEON(const uint8* src_y,
    317                         const uint8* src_u,
    318                         const uint8* src_v,
    319                         uint8* dst_rgba,
    320                         int width) {
    321   asm volatile (
    322     YUV422TORGB_SETUP_REG
    323   "1:                                          \n"
    324     READYUV422
    325     YUV422TORGB(v23, v22, v21)
    326     "subs       %w4, %w4, #8                   \n"
    327     "movi       v20.8b, #255                   \n" /* A */
    328     MEMACCESS(3)
    329     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    330     "b.gt       1b                             \n"
    331     : "+r"(src_y),     // %0
    332       "+r"(src_u),     // %1
    333       "+r"(src_v),     // %2
    334       "+r"(dst_rgba),  // %3
    335       "+r"(width)      // %4
    336     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    337       [kYToRgb]"r"(&kYToRgb)
    338     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    339       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    340   );
    341 }
    342 #endif  // HAS_I422TORGBAROW_NEON
    343 
    344 #ifdef HAS_I422TORGB24ROW_NEON
    345 void I422ToRGB24Row_NEON(const uint8* src_y,
    346                          const uint8* src_u,
    347                          const uint8* src_v,
    348                          uint8* dst_rgb24,
    349                          int width) {
    350   asm volatile (
    351     YUV422TORGB_SETUP_REG
    352   "1:                                          \n"
    353     READYUV422
    354     YUV422TORGB(v22, v21, v20)
    355     "subs       %w4, %w4, #8                   \n"
    356     MEMACCESS(3)
    357     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
    358     "b.gt       1b                             \n"
    359     : "+r"(src_y),     // %0
    360       "+r"(src_u),     // %1
    361       "+r"(src_v),     // %2
    362       "+r"(dst_rgb24), // %3
    363       "+r"(width)      // %4
    364     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    365       [kYToRgb]"r"(&kYToRgb)
    366     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    367       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    368   );
    369 }
    370 #endif  // HAS_I422TORGB24ROW_NEON
    371 
    372 #ifdef HAS_I422TORAWROW_NEON
    373 void I422ToRAWRow_NEON(const uint8* src_y,
    374                        const uint8* src_u,
    375                        const uint8* src_v,
    376                        uint8* dst_raw,
    377                        int width) {
    378   asm volatile (
    379     YUV422TORGB_SETUP_REG
    380   "1:                                          \n"
    381     READYUV422
    382     YUV422TORGB(v20, v21, v22)
    383     "subs       %w4, %w4, #8                   \n"
    384     MEMACCESS(3)
    385     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
    386     "b.gt       1b                             \n"
    387     : "+r"(src_y),     // %0
    388       "+r"(src_u),     // %1
    389       "+r"(src_v),     // %2
    390       "+r"(dst_raw),   // %3
    391       "+r"(width)      // %4
    392     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    393       [kYToRgb]"r"(&kYToRgb)
    394     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    395       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    396   );
    397 }
    398 #endif  // HAS_I422TORAWROW_NEON
    399 
    // Packs 8 pixels into RGB565.
    // Inputs : v20.8b = B, v21.8b = G, v22.8b = R.
    // Output : v0.8h, each lane holding R in bits 15-11, G in bits 10-5 and B in
    //          bits 4-0.
    // Each channel is first moved to the top byte of a 16-bit lane (shll #8);
    // the two sri instructions then shift G and B in below the bits already
    // placed.  v20 and v21 are overwritten with their widened values.
    400 #define ARGBTORGB565                                                           \
    401     "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
    402     "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
    403     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
    404     "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
    405     "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
    406 
    407 #ifdef HAS_I422TORGB565ROW_NEON
    408 void I422ToRGB565Row_NEON(const uint8* src_y,
    409                           const uint8* src_u,
    410                           const uint8* src_v,
    411                           uint8* dst_rgb565,
    412                           int width) {
    413   asm volatile (
    414     YUV422TORGB_SETUP_REG
    415   "1:                                          \n"
    416     READYUV422
    417     YUV422TORGB(v22, v21, v20)
    418     "subs       %w4, %w4, #8                   \n"
    419     ARGBTORGB565
    420     MEMACCESS(3)
    421     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
    422     "b.gt       1b                             \n"
    423     : "+r"(src_y),    // %0
    424       "+r"(src_u),    // %1
    425       "+r"(src_v),    // %2
    426       "+r"(dst_rgb565),  // %3
    427       "+r"(width)     // %4
    428     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    429       [kYToRgb]"r"(&kYToRgb)
    430     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    431       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    432   );
    433 }
    434 #endif  // HAS_I422TORGB565ROW_NEON
    435 
    // Packs 8 pixels into ARGB1555.
    // Inputs : v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
    // Output : v0.8h, each lane holding A in bit 15, R in bits 14-10, G in bits
    //          9-5 and B in bits 4-0.
    // Each channel is first moved to the top byte of a 16-bit lane (shll #8);
    // the sri instructions then shift R, G and B in below the bits already
    // placed.  v20, v21 and v22 are overwritten with their widened values.
    436 #define ARGBTOARGB1555                                                         \
    437     "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
    438     "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
    439     "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
    440     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
    441     "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
    442     "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
    443     "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
    444 
    445 #ifdef HAS_I422TOARGB1555ROW_NEON
    446 void I422ToARGB1555Row_NEON(const uint8* src_y,
    447                             const uint8* src_u,
    448                             const uint8* src_v,
    449                             uint8* dst_argb1555,
    450                             int width) {
    451   asm volatile (
    452     YUV422TORGB_SETUP_REG
    453   "1:                                          \n"
    454     READYUV422
    455     YUV422TORGB(v22, v21, v20)
    456     "subs       %w4, %w4, #8                   \n"
    457     "movi       v23.8b, #255                   \n"
    458     ARGBTOARGB1555
    459     MEMACCESS(3)
    460     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
    461     "b.gt       1b                             \n"
    462     : "+r"(src_y),    // %0
    463       "+r"(src_u),    // %1
    464       "+r"(src_v),    // %2
    465       "+r"(dst_argb1555),  // %3
    466       "+r"(width)     // %4
    467     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    468       [kYToRgb]"r"(&kYToRgb)
    469     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    470       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    471   );
    472 }
    473 #endif  // HAS_I422TOARGB1555ROW_NEON
    474 
    // Packs 8 pixels into ARGB4444.
    // Output : v0.8h; per pixel the low byte is (G & 0xf0) | (B >> 4) and the
    //          high byte is (A & 0xf0) | (R >> 4).
    // B and R are shifted down to the low nibble, G and A keep only their high
    // nibble (bic with the 0x0f mask in v4), the pairs are OR-ed together and
    // zip1 interleaves the two byte vectors.  v1 and v20-v23 are overwritten.
    475 #define ARGBTOARGB4444                                                         \
    476     /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
    477     "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
    478     "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
    479     "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
    480     "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
    481     "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
    482     "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
    483     "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
    484 
    485 #ifdef HAS_I422TOARGB4444ROW_NEON
    486 void I422ToARGB4444Row_NEON(const uint8* src_y,
    487                             const uint8* src_u,
    488                             const uint8* src_v,
    489                             uint8* dst_argb4444,
    490                             int width) {
    491   asm volatile (
    492     YUV422TORGB_SETUP_REG
    493     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
    494   "1:                                          \n"
    495     READYUV422
    496     YUV422TORGB(v22, v21, v20)
    497     "subs       %w4, %w4, #8                   \n"
    498     "movi       v23.8b, #255                   \n"
    499     ARGBTOARGB4444
    500     MEMACCESS(3)
    501     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
    502     "b.gt       1b                             \n"
    503     : "+r"(src_y),    // %0
    504       "+r"(src_u),    // %1
    505       "+r"(src_v),    // %2
    506       "+r"(dst_argb4444),  // %3
    507       "+r"(width)     // %4
    508     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    509       [kYToRgb]"r"(&kYToRgb)
    510     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    511       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    512   );
    513 }
    514 #endif  // HAS_I422TOARGB4444ROW_NEON
    515 
    516 #ifdef HAS_I400TOARGBROW_NEON
    517 void I400ToARGBRow_NEON(const uint8* src_y,
    518                         uint8* dst_argb,
    519                         int width) {
    520   int64 width64 = (int64)(width);
    521   asm volatile (
    522     YUV422TORGB_SETUP_REG
    523   "1:                                          \n"
    524     READYUV400
    525     YUV422TORGB(v22, v21, v20)
    526     "subs       %w2, %w2, #8                   \n"
    527     "movi       v23.8b, #255                   \n"
    528     MEMACCESS(1)
    529     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
    530     "b.gt       1b                             \n"
    531     : "+r"(src_y),     // %0
    532       "+r"(dst_argb),  // %1
    533       "+r"(width64)    // %2
    534     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    535       [kYToRgb]"r"(&kYToRgb)
    536     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    537       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    538   );
    539 }
    540 #endif  // HAS_I400TOARGBROW_NEON
    541 
    542 #ifdef HAS_J400TOARGBROW_NEON
    543 void J400ToARGBRow_NEON(const uint8* src_y,
    544                         uint8* dst_argb,
    545                         int width) {
    546   asm volatile (
    547     "movi       v23.8b, #255                   \n"
    548   "1:                                          \n"
    549     MEMACCESS(0)
    550     "ld1        {v20.8b}, [%0], #8             \n"
    551     "orr        v21.8b, v20.8b, v20.8b         \n"
    552     "orr        v22.8b, v20.8b, v20.8b         \n"
    553     "subs       %w2, %w2, #8                   \n"
    554     MEMACCESS(1)
    555     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
    556     "b.gt       1b                             \n"
    557     : "+r"(src_y),     // %0
    558       "+r"(dst_argb),  // %1
    559       "+r"(width)      // %2
    560     :
    561     : "cc", "memory", "v20", "v21", "v22", "v23"
    562   );
    563 }
    564 #endif  // HAS_J400TOARGBROW_NEON
    565 
    566 #ifdef HAS_NV12TOARGBROW_NEON
    567 void NV12ToARGBRow_NEON(const uint8* src_y,
    568                         const uint8* src_uv,
    569                         uint8* dst_argb,
    570                         int width) {
    571   asm volatile (
    572     YUV422TORGB_SETUP_REG
    573   "1:                                          \n"
    574     READNV12
    575     YUV422TORGB(v22, v21, v20)
    576     "subs       %w3, %w3, #8                   \n"
    577     "movi       v23.8b, #255                   \n"
    578     MEMACCESS(2)
    579     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
    580     "b.gt       1b                             \n"
    581     : "+r"(src_y),     // %0
    582       "+r"(src_uv),    // %1
    583       "+r"(dst_argb),  // %2
    584       "+r"(width)      // %3
    585     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    586       [kYToRgb]"r"(&kYToRgb)
    587     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    588       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    589   );
    590 }
    591 #endif  // HAS_NV12TOARGBROW_NEON
    592 
    593 #ifdef HAS_NV21TOARGBROW_NEON
    594 void NV21ToARGBRow_NEON(const uint8* src_y,
    595                         const uint8* src_uv,
    596                         uint8* dst_argb,
    597                         int width) {
    598   asm volatile (
    599     YUV422TORGB_SETUP_REG
    600   "1:                                          \n"
    601     READNV21
    602     YUV422TORGB(v22, v21, v20)
    603     "subs       %w3, %w3, #8                   \n"
    604     "movi       v23.8b, #255                   \n"
    605     MEMACCESS(2)
    606     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
    607     "b.gt       1b                             \n"
    608     : "+r"(src_y),     // %0
    609       "+r"(src_uv),    // %1
    610       "+r"(dst_argb),  // %2
    611       "+r"(width)      // %3
    612     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    613       [kYToRgb]"r"(&kYToRgb)
    614     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    615       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    616   );
    617 }
    618 #endif  // HAS_NV21TOARGBROW_NEON
    619 
    620 #ifdef HAS_NV12TORGB565ROW_NEON
        // Converts one row given as a Y plane (src_y) plus an interleaved chroma
        // plane (src_uv) into 16-bit RGB565 pixels written to dst_rgb565.
        //   width : pixel count; 8 pixels (16 output bytes) per iteration, loop
        //           repeats while the remainder after subtracting 8 is > 0.
        // READNV12, YUV422TORGB and ARGBTORGB565 are macros defined outside this
        // excerpt; presumably ARGBTORGB565 packs v20..v22 into v0.8h -- confirm.
    621 void NV12ToRGB565Row_NEON(const uint8* src_y,
    622                           const uint8* src_uv,
    623                           uint8* dst_rgb565,
    624                           int width) {
    625   asm volatile (
    626     YUV422TORGB_SETUP_REG
    627   "1:                                          \n"
    628     READNV12
    629     YUV422TORGB(v22, v21, v20)
    630     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    631     ARGBTORGB565
    632     MEMACCESS(2)
    633     "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
    634     "b.gt       1b                             \n"
    635     : "+r"(src_y),     // %0
    636       "+r"(src_uv),    // %1
    637       "+r"(dst_rgb565),  // %2
    638       "+r"(width)      // %3
    639     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    640       [kYToRgb]"r"(&kYToRgb)
    641     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    642       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    643   );
    644 }
    645 #endif  // HAS_NV12TORGB565ROW_NEON
    646 
    647 #ifdef HAS_NV21TORGB565ROW_NEON
        // Same structure as the NV12 RGB565 row function, except that the
        // samples are loaded with READNV21 instead of READNV12 (both macros are
        // defined outside this excerpt; presumably they differ in the order of
        // the interleaved chroma bytes -- confirm).
        //   width : pixel count; 8 pixels (16 output bytes) per iteration, loop
        //           repeats while the remainder after subtracting 8 is > 0.
    648 void NV21ToRGB565Row_NEON(const uint8* src_y,
    649                           const uint8* src_uv,
    650                           uint8* dst_rgb565,
    651                           int width) {
    652   asm volatile (
    653     YUV422TORGB_SETUP_REG
    654   "1:                                          \n"
    655     READNV21
    656     YUV422TORGB(v22, v21, v20)
    657     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    658     ARGBTORGB565
    659     MEMACCESS(2)
    660     "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
    661     "b.gt       1b                             \n"
    662     : "+r"(src_y),     // %0
    663       "+r"(src_uv),    // %1
    664       "+r"(dst_rgb565),  // %2
    665       "+r"(width)      // %3
    666     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    667       [kYToRgb]"r"(&kYToRgb)
    668     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    669       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    670   );
    671 }
    672 #endif  // HAS_NV21TORGB565ROW_NEON
    673 
    674 #ifdef HAS_YUY2TOARGBROW_NEON
        // Converts one packed row (src_yuy2) into 4-byte pixels in dst_argb.
        //   width : pixel count; 8 pixels (32 output bytes) per iteration, loop
        //           repeats while the remainder after subtracting 8 is > 0.
        // The counter is widened to int64 before entering the asm, but the asm
        // only touches it through %w2, i.e. the 32-bit view of that register.
        // READYUY2 and YUV422TORGB are macros defined outside this excerpt.
    675 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
    676                         uint8* dst_argb,
    677                         int width) {
    678   int64 width64 = (int64)(width);
    679   asm volatile (
    680     YUV422TORGB_SETUP_REG
    681   "1:                                          \n"
    682     READYUY2
    683     YUV422TORGB(v22, v21, v20)
    684     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    685     "movi       v23.8b, #255                   \n"  // 4th byte of each pixel
    686     MEMACCESS(1)
    687     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
    688     "b.gt       1b                             \n"
    689     : "+r"(src_yuy2),  // %0
    690       "+r"(dst_argb),  // %1
    691       "+r"(width64)    // %2
    692     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    693       [kYToRgb]"r"(&kYToRgb)
    694     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    695       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    696   );
    697 }
    698 #endif  // HAS_YUY2TOARGBROW_NEON
    699 
    700 #ifdef HAS_UYVYTOARGBROW_NEON
        // Converts one packed row (src_uyvy) into 4-byte pixels in dst_argb.
        //   width : pixel count; 8 pixels (32 output bytes) per iteration, loop
        //           repeats while the remainder after subtracting 8 is > 0.
        // The counter is widened to int64 before entering the asm, but the asm
        // only touches it through %w2, i.e. the 32-bit view of that register.
        // READUYVY and YUV422TORGB are macros defined outside this excerpt.
    701 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
    702                         uint8* dst_argb,
    703                         int width) {
    704   int64 width64 = (int64)(width);
    705   asm volatile (
    706     YUV422TORGB_SETUP_REG
    707   "1:                                          \n"
    708     READUYVY
    709     YUV422TORGB(v22, v21, v20)
    710     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    711     "movi       v23.8b, #255                   \n"  // 4th byte of each pixel
    712     MEMACCESS(1)
    713     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
    714     "b.gt       1b                             \n"
    715     : "+r"(src_uyvy),  // %0
    716       "+r"(dst_argb),  // %1
    717       "+r"(width64)    // %2
    718     : [kUVBiasBGR]"r"(&kUVBiasBGR),
    719       [kYToRgb]"r"(&kYToRgb)
    720     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    721       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    722   );
    723 }
    724 #endif  // HAS_UYVYTOARGBROW_NEON
    725 
    726 // Reads 16 pairs of UV and writes even bytes to dst_u and odd bytes to dst_v.
        //   src_uv : interleaved source, 32 bytes read per iteration.
        //   dst_u  : receives the even-indexed bytes, 16 per iteration.
        //   dst_v  : receives the odd-indexed bytes, 16 per iteration.
        //   width  : number of pairs; 16 are subtracted per iteration and the
        //            loop repeats while the remainder is > 0.
    727 #ifdef HAS_SPLITUVROW_NEON
    728 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    729                      int width) {
    730   asm volatile (
    731   "1:                                          \n"
    732     MEMACCESS(0)
    733     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
    734     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    735     MEMACCESS(1)
    736     "st1        {v0.16b}, [%1], #16            \n"  // store U
    737     MEMACCESS(2)
    738     "st1        {v1.16b}, [%2], #16            \n"  // store V
    739     "b.gt       1b                             \n"
    740     : "+r"(src_uv),  // %0
    741       "+r"(dst_u),   // %1
    742       "+r"(dst_v),   // %2
    743       "+r"(width)    // %3  // Output registers
    744     :                       // Input registers
    745     : "cc", "memory", "v0", "v1"  // Clobber List
    746   );
    747 }
    748 #endif  // HAS_SPLITUVROW_NEON
    749 
    750 // Reads 16 U's and V's and writes out 16 pairs of UV.
        //   src_u, src_v : planar sources, 16 bytes read from each per iteration.
        //   dst_uv       : interleaved destination, 32 bytes per iteration
        //                  (st2 alternates bytes from v0 and v1).
        //   width        : number of pairs; 16 are subtracted per iteration and
        //                  the loop repeats while the remainder is > 0.
    751 #ifdef HAS_MERGEUVROW_NEON
    752 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
    753                      int width) {
    754   asm volatile (
    755   "1:                                          \n"
    756     MEMACCESS(0)
    757     "ld1        {v0.16b}, [%0], #16            \n"  // load U
    758     MEMACCESS(1)
    759     "ld1        {v1.16b}, [%1], #16            \n"  // load V
    760     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    761     MEMACCESS(2)
    762     "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
    763     "b.gt       1b                             \n"
    764     :
    765       "+r"(src_u),   // %0
    766       "+r"(src_v),   // %1
    767       "+r"(dst_uv),  // %2
    768       "+r"(width)    // %3  // Output registers
    769     :                       // Input registers
    770     : "cc", "memory", "v0", "v1"  // Clobber List
    771   );
    772 }
    773 #endif  // HAS_MERGEUVROW_NEON
    774 
    775 // Copy multiple of 32 bytes, using four-register ld1/st1 (32 bytes each).
        //   src   : source bytes.
        //   dst   : destination bytes.
        //   count : byte count; 32 are subtracted per iteration and the loop
        //           repeats while the remainder is > 0, so a count that is not a
        //           multiple of 32 still copies a full 32-byte chunk.
    776 #ifdef HAS_COPYROW_NEON
    777 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
    778   asm volatile (
    779   "1:                                          \n"
    780     MEMACCESS(0)
    781     "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
    782     "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
    783     MEMACCESS(1)
    784     "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
    785     "b.gt       1b                             \n"
    786   : "+r"(src),   // %0
    787     "+r"(dst),   // %1
    788     "+r"(count)  // %2  // Output registers
    789   :                     // Input registers
    790   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
    791   );
    792 }
    793 #endif  // HAS_COPYROW_NEON
    794 
    795 // SetRow writes 'count' bytes using an 8 bit value repeated.
        //   dst   : destination bytes.
        //   v8    : byte value, broadcast to all 16 lanes of v0 before the loop.
        //   count : byte count; 16 bytes are stored per iteration and the loop
        //           repeats while the remainder after subtracting 16 is > 0, so
        //           writes always happen in whole 16-byte chunks.
    796 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
    797   asm volatile (
    798     "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
    799   "1:                                          \n"
    800     "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
    801     MEMACCESS(0)
    802     "st1        {v0.16b}, [%0], #16            \n"  // store
    803     "b.gt      1b                              \n"
    804   : "+r"(dst),   // %0
    805     "+r"(count)  // %1
    806   : "r"(v8)      // %2
    807   : "cc", "memory", "v0"
    808   );
    809 }
    810 
        // Fills dst with the 32-bit value v32 repeated.
        //   dst   : destination; 16 bytes are stored per iteration.
        //   v32   : 32-bit value, broadcast to the four 32-bit lanes of v0.
        //   count : number of 32-bit values; 4 are subtracted per iteration and
        //           the loop repeats while the remainder is > 0, so writes always
        //           happen in groups of four values.
    811 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
    812   asm volatile (
    813     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
    814   "1:                                          \n"
    815     "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
    816     MEMACCESS(0)
    817     "st1        {v0.16b}, [%0], #16            \n"  // store
    818     "b.gt      1b                              \n"
    819   : "+r"(dst),   // %0
    820     "+r"(count)  // %1
    821   : "r"(v32)     // %2
    822   : "cc", "memory", "v0"
    823   );
    824 }
    825 
    826 #ifdef HAS_MIRRORROW_NEON
        // Writes the bytes of src to dst in reverse order.
        //   src   : source row; reading starts at src + width - 16 and steps
        //           backwards by 16 bytes (post-index by %3 == -16).
        //   dst   : destination, written forwards 16 bytes per iteration.
        //   width : byte count; 16 are subtracted per iteration and the loop
        //           repeats while the remainder is > 0.
        // rev64 reverses the bytes inside each 64-bit half; storing the upper
        // half before the lower half completes the 16-byte reversal.
    827 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    828   int64 width64 = (int64) width;
    829   asm volatile (
    830     // Start at end of source row.
    831     "add        %0, %0, %2                     \n"
    832     "sub        %0, %0, #16                    \n"
    833 
    834   "1:                                          \n"
    835     MEMACCESS(0)
    836     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
    837     "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
    838     "rev64      v0.16b, v0.16b                 \n"
    839     MEMACCESS(1)
    840     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
    841     MEMACCESS(1)
    842     "st1        {v0.D}[0], [%1], #8            \n"
    843     "b.gt       1b                             \n"
    844   : "+r"(src),   // %0
    845     "+r"(dst),   // %1
    846     "+r"(width64)  // %2
    847   : "r"((ptrdiff_t)-16)    // %3
    848   : "cc", "memory", "v0"
    849   );
    850 }
    851 #endif  // HAS_MIRRORROW_NEON
    852 
    853 #ifdef HAS_MIRRORUVROW_NEON
        // Splits an interleaved two-channel row into two planes, each written in
        // reverse order.
        //   src_uv : interleaved source; reading starts at src_uv + 2*width - 16
        //            and steps backwards by 16 bytes (post-index by %4 == -16).
        //   dst_u  : receives the even-indexed source bytes, 8 per iteration.
        //   dst_v  : receives the odd-indexed source bytes, 8 per iteration.
        //   width  : number of pairs; 8 are subtracted per iteration and the
        //            loop repeats while the remainder is > 0.
    854 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    855                       int width) {
    856   int64 width64 = (int64) width;
    857   asm volatile (
    858     // Start at end of source row.
    859     "add        %0, %0, %3, lsl #1             \n"
    860     "sub        %0, %0, #16                    \n"
    861 
    862   "1:                                          \n"
    863     MEMACCESS(0)
    864     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
    865     "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
    866     "rev64      v0.8b, v0.8b                   \n"
    867     "rev64      v1.8b, v1.8b                   \n"
    868     MEMACCESS(1)
    869     "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
    870     MEMACCESS(2)
    871     "st1        {v1.8b}, [%2], #8              \n"
    872     "b.gt       1b                             \n"
    873   : "+r"(src_uv),  // %0
    874     "+r"(dst_u),   // %1
    875     "+r"(dst_v),   // %2
    876     "+r"(width64)    // %3
    877   : "r"((ptrdiff_t)-16)      // %4
    878   : "cc", "memory", "v0", "v1"
    879   );
    880 }
    881 #endif  // HAS_MIRRORUVROW_NEON
    882 
    883 #ifdef HAS_ARGBMIRRORROW_NEON
        // Writes the 4-byte pixels of src to dst in reverse pixel order; the
        // byte order inside each pixel is unchanged.
        //   src   : source row; reading starts at src + 4*width - 16 and steps
        //           backwards by 16 bytes (post-index by %3 == -16).
        //   dst   : destination, written forwards 16 bytes per iteration.
        //   width : pixel count; 4 are subtracted per iteration and the loop
        //           repeats while the remainder is > 0.
        // rev64 on .4s swaps the two 32-bit words inside each 64-bit half;
        // storing the upper half first completes the 4-pixel reversal.
    884 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    885   int64 width64 = (int64) width;
    886   asm volatile (
    887     // Start at end of source row.
    888     "add        %0, %0, %2, lsl #2             \n"
    889     "sub        %0, %0, #16                    \n"
    890 
    891   "1:                                          \n"
    892     MEMACCESS(0)
    893     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
    894     "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
    895     "rev64      v0.4s, v0.4s                   \n"
    896     MEMACCESS(1)
    897     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
    898     MEMACCESS(1)
    899     "st1        {v0.D}[0], [%1], #8            \n"
    900     "b.gt       1b                             \n"
    901   : "+r"(src),   // %0
    902     "+r"(dst),   // %1
    903     "+r"(width64)  // %2
    904   : "r"((ptrdiff_t)-16)    // %3
    905   : "cc", "memory", "v0"
    906   );
    907 }
    908 #endif  // HAS_ARGBMIRRORROW_NEON
    909 
    910 #ifdef HAS_RGB24TOARGBROW_NEON
        // Expands 3-byte pixels to 4-byte pixels by appending a constant 255
        // byte; the three source bytes keep their order.
        //   src_rgb24 : source, 24 bytes read per iteration.
        //   dst_argb  : destination, 32 bytes written per iteration.
        //   pix       : pixel count; 8 are subtracted per iteration and the loop
        //               repeats while the remainder is > 0.
    911 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
    912   asm volatile (
    913     "movi       v4.8b, #255                    \n"  // Alpha
    914   "1:                                          \n"
    915     MEMACCESS(0)
    916     "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
    917     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    918     MEMACCESS(1)
    919     "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    920     "b.gt       1b                             \n"
    921   : "+r"(src_rgb24),  // %0
    922     "+r"(dst_argb),   // %1
    923     "+r"(pix)         // %2
    924   :
    925   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
    926   );
    927 }
    928 #endif  // HAS_RGB24TOARGBROW_NEON
    929 
    930 #ifdef HAS_RAWTOARGBROW_NEON
        // Expands 3-byte pixels to 4-byte pixels, reversing the order of the
        // three source bytes and appending a constant 255 byte.
        //   src_raw  : source, 24 bytes read per iteration.
        //   dst_argb : destination, 32 bytes written per iteration.
        //   pix      : pixel count; 8 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
        // st4 needs four consecutive registers, so bytes 1 and 0 are copied
        // into v3 and v4 to sit after v2 (byte 2) and before v5 (255).
    931 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
    932   asm volatile (
    933     "movi       v5.8b, #255                    \n"  // Alpha
    934   "1:                                          \n"
    935     MEMACCESS(0)
    936     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
    937     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    938     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
    939     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
    940     MEMACCESS(1)
    941     "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
    942     "b.gt       1b                             \n"
    943   : "+r"(src_raw),   // %0
    944     "+r"(dst_argb),  // %1
    945     "+r"(pix)        // %2
    946   :
    947   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
    948   );
    949 }
    950 #endif  // HAS_RAWTOARGBROW_NEON
    951 
        // Asm fragment: unpacks 8 RGB565 pixels held in v0.8h into 8-bit channels.
        //   in      : v0.8h (bits 0-4 B, bits 5-10 G, bits 11-15 R).
        //   out     : v0.8b = B, v1.8b = G, v2.8b = R.
        //   scratch : v4, v6.
        // Each channel is widened to 8 bits by shifting it to the top of the
        // byte and OR-ing its own high bits into the vacated low bits.
    952 #define RGB565TOARGB                                                           \
    953     "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
    954     "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
    955     "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
    956     "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
    957     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
    958     "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
    959     "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
    960     "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
    961     "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
    962     "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
    963     "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
    964 
    965 #ifdef HAS_RGB565TOARGBROW_NEON
        // Converts a row of 16-bit RGB565 pixels to 4-byte pixels with a constant
        // 255 fourth byte.
        //   src_rgb565 : source, 16 bytes (8 pixels) read per iteration.
        //   dst_argb   : destination, 32 bytes written per iteration.
        //   pix        : pixel count; 8 are subtracted per iteration and the
        //                loop repeats while the remainder is > 0.
        // v3 is set once before the loop; RGB565TOARGB does not write v3.
    966 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
    967   asm volatile (
    968     "movi       v3.8b, #255                    \n"  // Alpha
    969   "1:                                          \n"
    970     MEMACCESS(0)
    971     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
    972     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    973     RGB565TOARGB
    974     MEMACCESS(1)
    975     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    976     "b.gt       1b                             \n"
    977   : "+r"(src_rgb565),  // %0
    978     "+r"(dst_argb),    // %1
    979     "+r"(pix)          // %2
    980   :
    981   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
    982   );
    983 }
    984 #endif  // HAS_RGB565TOARGBROW_NEON
    985 
        // Asm fragment: unpacks 8 ARGB1555 pixels held in v0.8h into 8-bit
        // channels.
        //   in  : v0.8h (bits 0-4 B, bits 5-9 G, bits 10-14 R, bit 15 A).
        //   out : v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
        // The arithmetic shift by 15 replicates the alpha bit, so A becomes
        // 0x00 or 0xFF. The 5-bit channels are widened by shifting left 3 and
        // OR-ing the top 3 bits into the low bits. All four of v0..v3 are
        // overwritten.
    986 #define ARGB1555TOARGB                                                         \
    987     "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
    988     "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
    989     "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
    990                                                                                \
    991     "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
    992     "xtn2       v3.16b, v2.8h                  \n"                             \
    993                                                                                \
    994     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
    995     "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
    996                                                                                \
    997     "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
    998     "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
    999     "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
   1000                                                                                \
   1001     "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
   1002     "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
   1003     "dup        v1.2D, v0.D[1]                 \n"                             \
   1004     "dup        v3.2D, v2.D[1]                 \n"
   1005 
   1006 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
        //   in      : v0.8h (bits 0-4 B, bits 5-9 G, bits 10-14 R).
        //   out     : v0.8b = B, v1.8b = G, v2.8b = R.
        //   scratch : v3 (holds the intermediate R value, not an alpha result).
        // NOTE(review): the last line of this macro ends with a line
        // continuation, so the following blank line is part of the definition.
        // Removing that blank line would pull the next source line into the
        // macro -- consider dropping the trailing backslash.
   1007 #define RGB555TOARGB                                                           \
   1008     "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
   1009     "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
   1010     "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
   1011                                                                                \
   1012     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
   1013     "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
   1014                                                                                \
   1015     "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
   1016     "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
   1017     "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
   1018                                                                                \
   1019     "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
   1020     "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
   1021     "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
   1022 
   1023 #ifdef HAS_ARGB1555TOARGBROW_NEON
        // Converts a row of 16-bit ARGB1555 pixels to 4-byte pixels.
        //   src_argb1555 : source, 16 bytes (8 pixels) read per iteration.
        //   dst_argb     : destination, 32 bytes written per iteration.
        //   pix          : pixel count; 8 are subtracted per iteration and the
        //                  loop repeats while the remainder is > 0.
        // NOTE(review): ARGB1555TOARGB overwrites v3 on every iteration with
        // the alpha derived from bit 15, so the 255 loaded into v3 before the
        // loop is never the value that gets stored.
   1024 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
   1025                             int pix) {
   1026   asm volatile (
   1027     "movi       v3.8b, #255                    \n"  // Alpha
   1028   "1:                                          \n"
   1029     MEMACCESS(0)
   1030     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
   1031     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1032     ARGB1555TOARGB
   1033     MEMACCESS(1)
   1034     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
   1035     "b.gt       1b                             \n"
   1036   : "+r"(src_argb1555),  // %0
   1037     "+r"(dst_argb),    // %1
   1038     "+r"(pix)          // %2
   1039   :
   1040   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   1041   );
   1042 }
   1043 #endif  // HAS_ARGB1555TOARGBROW_NEON
   1044 
        // Asm fragment: unpacks 8 ARGB4444 pixels held in v0.8h into 8-bit
        // channels.
        //   in  : v0.8h (bits 0-3 B, bits 4-7 G, bits 8-11 R, bits 12-15 A).
        //   out : v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
        // Each 4-bit value is duplicated into both nibbles of its output byte.
        // Intermediate registers hold two channels at once: one in the lower
        // 8 bytes and one in the upper 8 bytes. The final dup instructions
        // copy the upper halves of v2/v3 into v0/v1.
   1045 #define ARGB4444TOARGB                                                         \
   1046     "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
   1047     "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
   1048     "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
   1049     "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
   1050     "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
   1051     "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
   1052     "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
   1053     "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
   1054     "dup        v0.2D, v2.D[1]                 \n"                             \
   1055     "dup        v1.2D, v3.D[1]                 \n"
   1056 
   1056 #ifdef HAS_ARGB4444TOARGBROW_NEON
        // Converts a row of 16-bit ARGB4444 pixels to 4-byte pixels.
        //   src_argb4444 : source, 16 bytes (8 pixels) read per iteration.
        //   dst_argb     : destination, 32 bytes written per iteration.
        //   pix          : pixel count; 8 are subtracted per iteration and the
        //                  loop repeats while the remainder is > 0.
        // NOTE(review): v4 is listed as clobbered, but neither this body nor
        // ARGB4444TOARGB writes it.
   1057 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
   1058                             int pix) {
   1059   asm volatile (
   1060   "1:                                          \n"
   1061     MEMACCESS(0)
   1062     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
   1063     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1064     ARGB4444TOARGB
   1065     MEMACCESS(1)
   1066     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
   1067     "b.gt       1b                             \n"
   1068   : "+r"(src_argb4444),  // %0
   1069     "+r"(dst_argb),    // %1
   1070     "+r"(pix)          // %2
   1071   :
   1072   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
   1073   );
   1074 }
   1075 #endif  // HAS_ARGB4444TOARGBROW_NEON
   1077 
   1077 #ifdef HAS_ARGBTORGB24ROW_NEON
        // Reduces 4-byte pixels to 3-byte pixels by dropping the fourth byte of
        // each pixel; the first three bytes keep their order.
        //   src_argb  : source, 32 bytes read per iteration.
        //   dst_rgb24 : destination, 24 bytes written per iteration.
        //   pix       : pixel count; 8 are subtracted per iteration and the loop
        //               repeats while the remainder is > 0.
   1078 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
   1079   asm volatile (
   1080   "1:                                          \n"
   1081     MEMACCESS(0)
   1082     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
   1083     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1084     MEMACCESS(1)
   1085     "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
   1086     "b.gt       1b                             \n"
   1087   : "+r"(src_argb),   // %0
   1088     "+r"(dst_rgb24),  // %1
   1089     "+r"(pix)         // %2
   1090   :
   1091   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
   1092   );
   1093 }
   1094 #endif  // HAS_ARGBTORGB24ROW_NEON
   1096 
   1096 #ifdef HAS_ARGBTORAWROW_NEON
        // Reduces 4-byte pixels to 3-byte pixels, dropping the fourth byte and
        // reversing the order of the remaining three.
        //   src_argb : source, 32 bytes read per iteration.
        //   dst_raw  : destination, 24 bytes written per iteration.
        //   pix      : pixel count; 8 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
        // st3 needs three consecutive registers, so bytes 1 and 0 are copied
        // into v4 and v5 to follow v3 (byte 2); the loaded fourth byte in v4 is
        // overwritten in the process.
   1097 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
   1098   asm volatile (
   1099   "1:                                          \n"
   1100     MEMACCESS(0)
   1101     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
   1102     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1103     "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
   1104     "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
   1105     MEMACCESS(1)
   1106     "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
   1107     "b.gt       1b                             \n"
   1108   : "+r"(src_argb),  // %0
   1109     "+r"(dst_raw),   // %1
   1110     "+r"(pix)        // %2
   1111   :
   1112   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
   1113   );
   1114 }
   1115 #endif  // HAS_ARGBTORAWROW_NEON
   1117 
   1117 #ifdef HAS_YUY2TOYROW_NEON
        // Extracts the even-indexed bytes of a packed row into dst_y.
        //   src_yuy2 : packed source, 32 bytes read per iteration; ld2 puts the
        //              even bytes in v0 and the odd bytes in v1.
        //   dst_y    : destination, 16 bytes (v0) written per iteration.
        //   pix      : pixel count; 16 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
   1118 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
   1119   asm volatile (
   1120   "1:                                          \n"
   1121     MEMACCESS(0)
   1122     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
   1123     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
   1124     MEMACCESS(1)
   1125     "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
   1126     "b.gt       1b                             \n"
   1127   : "+r"(src_yuy2),  // %0
   1128     "+r"(dst_y),     // %1
   1129     "+r"(pix)        // %2
   1130   :
   1131   : "cc", "memory", "v0", "v1"  // Clobber List
   1132   );
   1133 }
   1134 #endif  // HAS_YUY2TOYROW_NEON
   1136 
   1136 #ifdef HAS_UYVYTOYROW_NEON
        // Extracts the odd-indexed bytes of a packed row into dst_y.
        //   src_uyvy : packed source, 32 bytes read per iteration; ld2 puts the
        //              even bytes in v0 and the odd bytes in v1.
        //   dst_y    : destination, 16 bytes (v1) written per iteration.
        //   pix      : pixel count; 16 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
   1137 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
   1138   asm volatile (
   1139   "1:                                          \n"
   1140     MEMACCESS(0)
   1141     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
   1142     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
   1143     MEMACCESS(1)
   1144     "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
   1145     "b.gt       1b                             \n"
   1146   : "+r"(src_uyvy),  // %0
   1147     "+r"(dst_y),     // %1
   1148     "+r"(pix)        // %2
   1149   :
   1150   : "cc", "memory", "v0", "v1"  // Clobber List
   1151   );
   1152 }
   1153 #endif  // HAS_UYVYTOYROW_NEON
   1155 
   1155 #ifdef HAS_YUY2TOUV422ROW_NEON
        // Extracts the two chroma channels of a packed row into separate planes.
        //   src_yuy2 : packed source, 32 bytes read per iteration; ld4 splits
        //              each 4-byte group into v0..v3 by byte position.
        //   dst_u    : receives byte 1 of each group (v1), 8 per iteration.
        //   dst_v    : receives byte 3 of each group (v3), 8 per iteration.
        //   pix      : pixel count; 16 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
   1156 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
   1157                          int pix) {
   1158   asm volatile (
   1159   "1:                                          \n"
   1160     MEMACCESS(0)
   1161     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
   1162     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
   1163     MEMACCESS(1)
   1164     "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
   1165     MEMACCESS(2)
   1166     "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
   1167     "b.gt       1b                             \n"
   1168   : "+r"(src_yuy2),  // %0
   1169     "+r"(dst_u),     // %1
   1170     "+r"(dst_v),     // %2
   1171     "+r"(pix)        // %3
   1172   :
   1173   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   1174   );
   1175 }
   1176 #endif  // HAS_YUY2TOUV422ROW_NEON
   1178 
   1178 #ifdef HAS_UYVYTOUV422ROW_NEON
        // Extracts the two chroma channels of a packed row into separate planes.
        //   src_uyvy : packed source, 32 bytes read per iteration; ld4 splits
        //              each 4-byte group into v0..v3 by byte position.
        //   dst_u    : receives byte 0 of each group (v0), 8 per iteration.
        //   dst_v    : receives byte 2 of each group (v2), 8 per iteration.
        //   pix      : pixel count; 16 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
   1179 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
   1180                          int pix) {
   1181   asm volatile (
   1182   "1:                                          \n"
   1183     MEMACCESS(0)
   1184     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
   1185     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
   1186     MEMACCESS(1)
   1187     "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
   1188     MEMACCESS(2)
   1189     "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
   1190     "b.gt       1b                             \n"
   1191   : "+r"(src_uyvy),  // %0
   1192     "+r"(dst_u),     // %1
   1193     "+r"(dst_v),     // %2
   1194     "+r"(pix)        // %3
   1195   :
   1196   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   1197   );
   1198 }
   1199 #endif  // HAS_UYVYTOUV422ROW_NEON
   1201 
   1201 #ifdef HAS_YUY2TOUVROW_NEON
        // Extracts chroma from two adjacent packed rows, averaging the rows.
        //   src_yuy2    : first source row, 32 bytes read per iteration.
        //   stride_yuy2 : byte offset from src_yuy2 to the second row.
        //   dst_u       : receives the rounded average (urhadd) of byte 1 of
        //                 each 4-byte group from both rows, 8 per iteration.
        //   dst_v       : same for byte 3 of each group.
        //   pix         : pixel count; 16 are subtracted per iteration and the
        //                 loop repeats while the remainder is > 0.
   1202 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
   1203                       uint8* dst_u, uint8* dst_v, int pix) {
   1204   const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
   1205   asm volatile (
   1206   "1:                                          \n"
   1207     MEMACCESS(0)
   1208     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
   1209     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
   1210     MEMACCESS(1)
   1211     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
   1212     "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
   1213     "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
   1214     MEMACCESS(2)
   1215     "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
   1216     MEMACCESS(3)
   1217     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
   1218     "b.gt       1b                             \n"
   1219   : "+r"(src_yuy2),     // %0
   1220     "+r"(src_yuy2b),    // %1
   1221     "+r"(dst_u),        // %2
   1222     "+r"(dst_v),        // %3
   1223     "+r"(pix)           // %4
   1224   :
   1225   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
   1226     "v5", "v6", "v7"  // Clobber List
   1227   );
   1228 }
   1229 #endif  // HAS_YUY2TOUVROW_NEON
   1231 
   1231 #ifdef HAS_UYVYTOUVROW_NEON
        // Extracts chroma from two adjacent packed rows, averaging the rows.
        //   src_uyvy    : first source row, 32 bytes read per iteration.
        //   stride_uyvy : byte offset from src_uyvy to the second row.
        //   dst_u       : receives the rounded average (urhadd) of byte 0 of
        //                 each 4-byte group from both rows, 8 per iteration.
        //   dst_v       : same for byte 2 of each group.
        //   pix         : pixel count; 16 are subtracted per iteration and the
        //                 loop repeats while the remainder is > 0.
   1232 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
   1233                       uint8* dst_u, uint8* dst_v, int pix) {
   1234   const uint8* src_uyvyb = src_uyvy + stride_uyvy;
   1235   asm volatile (
   1236   "1:                                          \n"
   1237     MEMACCESS(0)
   1238     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
   1239     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
   1240     MEMACCESS(1)
   1241     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
   1242     "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
   1243     "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
   1244     MEMACCESS(2)
   1245     "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
   1246     MEMACCESS(3)
   1247     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
   1248     "b.gt       1b                             \n"
   1249   : "+r"(src_uyvy),     // %0
   1250     "+r"(src_uyvyb),    // %1
   1251     "+r"(dst_u),        // %2
   1252     "+r"(dst_v),        // %3
   1253     "+r"(pix)           // %4
   1254   :
   1255   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
   1256     "v5", "v6", "v7"  // Clobber List
   1257   );
   1258 }
   1259 #endif  // HAS_UYVYTOUVROW_NEON
   1261 
   1261 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
        // Reorders bytes within each 16-byte group using a table lookup.
        //   src_argb : source, 16 bytes (4 pixels) read per iteration.
        //   dst_argb : destination, 16 bytes written per iteration.
        //   shuffler : 16 byte indices, loaded once into v2 before the loop;
        //              output byte i is source byte shuffler[i] of the group
        //              (tbl yields 0 for an index outside 0..15).
        //   pix      : pixel count; 4 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
   1262 #ifdef HAS_ARGBSHUFFLEROW_NEON
   1263 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
   1264                          const uint8* shuffler, int pix) {
   1265   asm volatile (
   1266     MEMACCESS(3)
   1267     "ld1        {v2.16b}, [%3]                 \n"  // shuffler
   1268   "1:                                          \n"
   1269     MEMACCESS(0)
   1270     "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
   1271     "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
   1272     "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
   1273     MEMACCESS(1)
   1274     "st1        {v1.16b}, [%1], #16            \n"  // store 4.
   1275     "b.gt       1b                             \n"
   1276   : "+r"(src_argb),  // %0
   1277     "+r"(dst_argb),  // %1
   1278     "+r"(pix)        // %2
   1279   : "r"(shuffler)    // %3
   1280   : "cc", "memory", "v0", "v1", "v2"  // Clobber List
   1281   );
   1282 }
   1283 #endif  // HAS_ARGBSHUFFLEROW_NEON
   1285 
   1285 #ifdef HAS_I422TOYUY2ROW_NEON
        // Packs three planar rows into one interleaved row laid out as
        // Y0 U Y1 V per 4-byte group.
        //   src_y    : 16 bytes read per iteration; ld2 splits them into even
        //              (v0) and odd (v1) samples.
        //   src_u    : 8 bytes read per iteration.
        //   src_v    : 8 bytes read per iteration.
        //   dst_yuy2 : destination, 32 bytes written per iteration.
        //   width    : pixel count; 16 are subtracted per iteration and the loop
        //              repeats while the remainder is > 0.
        // The odd Y samples are moved from v1 to v2 so that v1 can take U,
        // giving the register order v0..v3 = Y0, U, Y1, V for st4.
   1286 void I422ToYUY2Row_NEON(const uint8* src_y,
   1287                         const uint8* src_u,
   1288                         const uint8* src_v,
   1289                         uint8* dst_yuy2, int width) {
   1290   asm volatile (
   1291   "1:                                          \n"
   1292     MEMACCESS(0)
   1293     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
   1294     "orr        v2.8b, v1.8b, v1.8b            \n"
   1295     MEMACCESS(1)
   1296     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
   1297     MEMACCESS(2)
   1298     "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
   1299     "subs       %w4, %w4, #16                  \n"  // 16 pixels
   1300     MEMACCESS(3)
   1301     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
   1302     "b.gt       1b                             \n"
   1303   : "+r"(src_y),     // %0
   1304     "+r"(src_u),     // %1
   1305     "+r"(src_v),     // %2
   1306     "+r"(dst_yuy2),  // %3
   1307     "+r"(width)      // %4
   1308   :
   1309   : "cc", "memory", "v0", "v1", "v2", "v3"
   1310   );
   1311 }
   1312 #endif  // HAS_I422TOYUY2ROW_NEON
   1314 
   1315 #ifdef HAS_I422TOUYVYROW_NEON
        // Interleaves separate Y, U and V rows into packed UYVY (byte order
        // U Y0 V Y1).  Each loop iteration consumes 16 Y, 8 U and 8 V bytes and
        // writes 32 bytes to dst_uyvy.  width is decremented by 16 per iteration
        // and the loop repeats while the result is > 0, so the body always runs
        // at least once.
        // NOTE(review): presumably width is a positive multiple of 16 -- confirm
        // against the callers.
   1316 void I422ToUYVYRow_NEON(const uint8* src_y,
   1317                         const uint8* src_u,
   1318                         const uint8* src_v,
   1319                         uint8* dst_uyvy, int width) {
   1320   asm volatile (
   1321   "1:                                          \n"
   1322     MEMACCESS(0)
   1323     "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys: v1 = even Ys, v2 = odd Ys
            // Move the odd Ys to v3 (orr of a register with itself is a copy) so
            // v2 is free to receive V and st4 sees U, Y0, V, Y1 in v0..v3.
   1324     "orr        v3.8b, v2.8b, v2.8b            \n"
   1325     MEMACCESS(1)
   1326     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
   1327     MEMACCESS(2)
   1328     "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
   1329     "subs       %w4, %w4, #16                  \n"  // 16 pixels
   1330     MEMACCESS(3)
   1331     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
   1332     "b.gt       1b                             \n"
   1333   : "+r"(src_y),     // %0
   1334     "+r"(src_u),     // %1
   1335     "+r"(src_v),     // %2
   1336     "+r"(dst_uyvy),  // %3
   1337     "+r"(width)      // %4
   1338   :
   1339   : "cc", "memory", "v0", "v1", "v2", "v3"
   1340   );
   1341 }
   1342 #endif  // HAS_I422TOUYVYROW_NEON
   1343 
   1344 #ifdef HAS_ARGBTORGB565ROW_NEON
        // Converts 8 ARGB pixels (32 bytes) to 8 RGB565 pixels (16 bytes) per
        // loop iteration.  pix is decremented by 8 per iteration and the loop
        // repeats while the result is > 0, so the body always runs at least once.
        // NOTE(review): ARGBTORGB565 is a macro defined outside this view; from
        // its use here it is expected to pack v20..v23 into v0 -- confirm.
   1345 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
   1346   asm volatile (
   1347   "1:                                          \n"
   1348     MEMACCESS(0)
            // ld4 de-interleaves: v20..v23 each receive one byte position of the
            // 8 pixels.
   1349     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
   1350     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1351     ARGBTORGB565
   1352     MEMACCESS(1)
   1353     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
   1354     "b.gt       1b                             \n"
   1355   : "+r"(src_argb),  // %0
   1356     "+r"(dst_rgb565),  // %1
   1357     "+r"(pix)        // %2
   1358   :
   1359   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
   1360   );
   1361 }
   1362 #endif  // HAS_ARGBTORGB565ROW_NEON
   1363 
   1364 #ifdef HAS_ARGBTORGB565DITHERROW_NEON
        // Converts 8 ARGB pixels (32 bytes) to 8 RGB565 pixels (16 bytes) per
        // loop iteration, adding a dither value before packing.
        //   src_argb: source pixels, advanced by 32 bytes per iteration.
        //   dst_rgb:  destination, advanced by 16 bytes per iteration.
        //   dither4:  broadcast into every 32-bit lane of v1, so its 4 bytes
        //             repeat across the 8 pixels; the low 8 bytes of v1 are
        //             saturating-added to the first three channel registers
        //             (v20..v22).  The fourth channel (v23) is not dithered.
        //   width:    decremented by 8 per iteration; the loop repeats while the
        //             result is > 0, so the body always runs at least once.
        // src_argb and width are written by the asm (post-increment load and
        // subs), so they must be read-write "+r" operands: GCC assumes input-only
        // operands still hold their original values after the asm statement.
        // The operand order matches ARGBToRGB565Row_NEON.
        // NOTE(review): ARGBTORGB565 is a macro defined outside this view; from
        // its use here it is expected to pack v20..v23 into v0 without
        // referencing any %N operand -- confirm.
   1365 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
   1366                                 const uint32 dither4, int width) {
   1367   asm volatile (
   1368     "dup        v1.4s, %w3                     \n"  // dither4
   1369   "1:                                          \n"
   1370     MEMACCESS(0)
   1371     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
   1372     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1373     "uqadd      v20.8b, v20.8b, v1.8b          \n"
   1374     "uqadd      v21.8b, v21.8b, v1.8b          \n"
   1375     "uqadd      v22.8b, v22.8b, v1.8b          \n"
   1376     ARGBTORGB565
   1377     MEMACCESS(1)
   1378     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
   1379     "b.gt       1b                             \n"
   1380   : "+r"(src_argb),  // %0
   1381     "+r"(dst_rgb),   // %1
   1382     "+r"(width)      // %2
   1383   : "r"(dither4)     // %3
   1384   : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
   1385   );
   1386 }
   1387 #endif  // HAS_ARGBTORGB565DITHERROW_NEON
   1388 
   1389 #ifdef HAS_ARGBTOARGB1555ROW_NEON
        // Converts 8 ARGB pixels (32 bytes) to 8 ARGB1555 pixels (16 bytes) per
        // loop iteration.  pix is decremented by 8 per iteration and the loop
        // repeats while the result is > 0, so the body always runs at least once.
        // NOTE(review): ARGBTOARGB1555 is a macro defined outside this view; from
        // its use here it is expected to pack v20..v23 into v0 -- confirm.
   1390 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
   1391                             int pix) {
   1392   asm volatile (
   1393   "1:                                          \n"
   1394     MEMACCESS(0)
            // ld4 de-interleaves: v20..v23 each receive one byte position of the
            // 8 pixels.
   1395     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
   1396     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1397     ARGBTOARGB1555
   1398     MEMACCESS(1)
   1399     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
   1400     "b.gt       1b                             \n"
   1401   : "+r"(src_argb),  // %0
   1402     "+r"(dst_argb1555),  // %1
   1403     "+r"(pix)        // %2
   1404   :
   1405   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
   1406   );
   1407 }
   1408 #endif  // HAS_ARGBTOARGB1555ROW_NEON
   1409 
   1410 #ifdef HAS_ARGBTOARGB4444ROW_NEON
        // Converts 8 ARGB pixels (32 bytes) to 8 ARGB4444 pixels (16 bytes) per
        // loop iteration.  pix is decremented by 8 per iteration and the loop
        // repeats while the result is > 0, so the body always runs at least once.
        // NOTE(review): ARGBTOARGB4444 is a macro defined outside this view; from
        // its use here it is expected to read the 0x0f mask in v4, use v1 as
        // scratch (it is in the clobber list) and leave the result in v0 --
        // confirm.
   1411 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
   1412                             int pix) {
   1413   asm volatile (
   1414     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
   1415   "1:                                          \n"
   1416     MEMACCESS(0)
            // ld4 de-interleaves: v20..v23 each receive one byte position of the
            // 8 pixels.
   1417     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
   1418     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1419     ARGBTOARGB4444
   1420     MEMACCESS(1)
   1421     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
   1422     "b.gt       1b                             \n"
   1423   : "+r"(src_argb),      // %0
   1424     "+r"(dst_argb4444),  // %1
   1425     "+r"(pix)            // %2
   1426   :
   1427   : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
   1428   );
   1429 }
   1430 #endif  // HAS_ARGBTOARGB4444ROW_NEON
   1431 
   1432 #ifdef HAS_ARGBTOYROW_NEON
        // Computes one luma byte per ARGB pixel, 8 pixels per loop iteration:
        //   Y = sat_u8(round((13 * B + 65 * G + 33 * R) / 128)), followed by a
        //   saturating add of 16.
        // Reads 32 bytes from src_argb and writes 8 bytes to dst_y per iteration.
        // pix is decremented by 8 per iteration and the loop repeats while the
        // result is > 0, so the body always runs at least once.
   1433 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   1434   asm volatile (
   1435     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   1436     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   1437     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   1438     "movi       v7.8b, #16                     \n"  // Add 16 constant
   1439   "1:                                          \n"
   1440     MEMACCESS(0)
   1441     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   1442     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
            // v3 held the fourth byte of each pixel; it is not needed, so v3 is
            // reused as the 16-bit accumulator.
   1443     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
   1444     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
   1445     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
   1446     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounding >> 7, saturating)
   1447     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
   1448     MEMACCESS(1)
   1449     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   1450     "b.gt       1b                             \n"
   1451   : "+r"(src_argb),  // %0
   1452     "+r"(dst_y),     // %1
   1453     "+r"(pix)        // %2
   1454   :
   1455   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   1456   );
   1457 }
   1458 #endif  // HAS_ARGBTOYROW_NEON
   1459 
   1460 #ifdef HAS_ARGBTOYJROW_NEON
        // Computes one luma byte per ARGB pixel, 8 pixels per loop iteration:
        //   Y = sat_u8(round((15 * B + 75 * G + 38 * R) / 128))
        // The three coefficients sum to 128 and no offset is added.  Reads 32
        // bytes from src_argb and writes 8 bytes to dst_y per iteration.  pix is
        // decremented by 8 per iteration and the loop repeats while the result
        // is > 0, so the body always runs at least once.
   1461 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   1462   asm volatile (
   1463     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
   1464     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
   1465     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
   1466   "1:                                          \n"
   1467     MEMACCESS(0)
   1468     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   1469     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
            // v3 held the fourth byte of each pixel; it is not needed, so v3 is
            // reused as the 16-bit accumulator.
   1470     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
   1471     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
   1472     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
   1473     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y (rounding >> 7, saturating)
   1474     MEMACCESS(1)
   1475     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   1476     "b.gt       1b                             \n"
   1477   : "+r"(src_argb),  // %0
   1478     "+r"(dst_y),     // %1
   1479     "+r"(pix)        // %2
   1480   :
   1481   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   1482   );
   1483 }
   1484 #endif  // HAS_ARGBTOYJROW_NEON
   1485 
   1486 // 8x1 pixels.
        // Computes one U and one V byte for every ARGB pixel (no subsampling),
        // 8 pixels per loop iteration, in 16-bit lanes:
        //   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
        //   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
        // The narrowing shift saturates to 8 bits.  Reads 32 bytes from src_argb
        // and writes 8 bytes to each of dst_u and dst_v per iteration.  pix is
        // decremented by 8 per iteration and the loop repeats while the result
        // is > 0, so the body always runs at least once.
   1487 #ifdef HAS_ARGBTOUV444ROW_NEON
   1488 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
   1489                          int pix) {
   1490   asm volatile (
   1491     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
   1492     "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
   1493     "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
   1494     "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
   1495     "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
   1496     "movi       v29.16b,#0x80                  \n"  // 128.5 (every byte 0x80, i.e. 0x8080 per 16-bit lane)
   1497   "1:                                          \n"
   1498     MEMACCESS(0)
   1499     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   1500     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   1501     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
   1502     "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
   1503     "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
   1504     "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
   1505 
            // v3 held the fourth byte of each pixel; it is not needed, so v3 is
            // reused as the V accumulator.
   1506     "umull      v3.8h, v2.8b, v24.8b           \n"  // R
   1507     "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
   1508     "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
   1509     "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
   1510 
   1511     "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
   1512     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
   1513 
   1514     MEMACCESS(1)
   1515     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
   1516     MEMACCESS(2)
   1517     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
   1518     "b.gt       1b                             \n"
   1519   : "+r"(src_argb),  // %0
   1520     "+r"(dst_u),     // %1
   1521     "+r"(dst_v),     // %2
   1522     "+r"(pix)        // %3
   1523   :
   1524   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
   1525     "v24", "v25", "v26", "v27", "v28", "v29"
   1526   );
   1527 }
   1528 #endif  // HAS_ARGBTOUV444ROW_NEON
   1529 
   1530 // 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
        // Produces 8 U and 8 V bytes from 16 ARGB pixels per loop iteration, one
        // U/V pair per two horizontally adjacent pixels.  uaddlp adds each pair
        // of adjacent bytes into a 16-bit lane, so the channel values fed to the
        // multiplies are pair sums (twice the average).  pix is decremented by
        // 16 per iteration and the loop repeats while the result is > 0, so the
        // body always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1531 #ifdef HAS_ARGBTOUV422ROW_NEON
   1532 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
   1533                          int pix) {
   1534   asm volatile (
   1535     RGBTOUV_SETUP_REG
   1536   "1:                                          \n"
   1537     MEMACCESS(0)
   1538     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1539 
   1540     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1541     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1542     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1543 
   1544     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
            // v3 held the fourth byte of each pixel; it is not needed, so v3 is
            // reused as the U accumulator.
   1545     "mul        v3.8h, v0.8h, v20.8h           \n"  // B
   1546     "mls        v3.8h, v1.8h, v21.8h           \n"  // G
   1547     "mls        v3.8h, v2.8h, v22.8h           \n"  // R
   1548     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
   1549 
   1550     "mul        v4.8h, v2.8h, v20.8h           \n"  // R
   1551     "mls        v4.8h, v1.8h, v24.8h           \n"  // G
   1552     "mls        v4.8h, v0.8h, v23.8h           \n"  // B
   1553     "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
   1554 
   1555     "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
   1556     "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
   1557 
   1558     MEMACCESS(1)
   1559     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
   1560     MEMACCESS(2)
   1561     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
   1562     "b.gt       1b                             \n"
   1563   : "+r"(src_argb),  // %0
   1564     "+r"(dst_u),     // %1
   1565     "+r"(dst_v),     // %2
   1566     "+r"(pix)        // %3
   1567   :
   1568   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1569     "v20", "v21", "v22", "v23", "v24", "v25"
   1570   );
   1571 }
   1572 #endif  // HAS_ARGBTOUV422ROW_NEON
   1573 
   1574 // 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
        // Produces 8 U and 8 V bytes from 32 ARGB pixels per loop iteration, one
        // U/V pair per four horizontally adjacent pixels.  uaddlp forms pair
        // sums, addp adds neighbouring pair sums to give 4-pixel sums, and the
        // rounding shift right by 1 leaves twice the average, the same scale the
        // multiplies below receive in ARGBToUV422Row_NEON.  pix is decremented by
        // 32 per iteration and the loop repeats while the result is > 0, so the
        // body always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1575 #ifdef HAS_ARGBTOUV411ROW_NEON
   1576 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
   1577                          int pix) {
   1578   asm volatile (
   1579     RGBTOUV_SETUP_REG
   1580   "1:                                          \n"
   1581     MEMACCESS(0)
   1582     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1583     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1584     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1585     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1586     MEMACCESS(0)
   1587     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
   1588     "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
   1589     "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1590     "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
   1591 
   1592     "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
   1593     "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
   1594     "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
   1595 
   1596     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1597     "urshr      v1.8h, v1.8h, #1               \n"
   1598     "urshr      v2.8h, v2.8h, #1               \n"
   1599 
   1600     "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
   1601     "mul        v3.8h, v0.8h, v20.8h           \n"  // B
   1602     "mls        v3.8h, v1.8h, v21.8h           \n"  // G
   1603     "mls        v3.8h, v2.8h, v22.8h           \n"  // R
   1604     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
   1605     "mul        v4.8h, v2.8h, v20.8h           \n"  // R
   1606     "mls        v4.8h, v1.8h, v24.8h           \n"  // G
   1607     "mls        v4.8h, v0.8h, v23.8h           \n"  // B
   1608     "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
   1609     "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
   1610     "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
   1611     MEMACCESS(1)
   1612     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
   1613     MEMACCESS(2)
   1614     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
   1615     "b.gt       1b                             \n"
   1616   : "+r"(src_argb),  // %0
   1617     "+r"(dst_u),     // %1
   1618     "+r"(dst_v),     // %2
   1619     "+r"(pix)        // %3
   1620   :
   1621   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1622     "v20", "v21", "v22", "v23", "v24", "v25"
   1623   );
   1624 }
   1625 #endif  // HAS_ARGBTOUV411ROW_NEON
   1626 
   1627 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
        // RGBTOUV(QB, QG, QR): asm fragment shared by the 16x2 -> 8x1 UV
        // functions below.  QB, QG and QR are the register operands (for example
        // v0.8h) holding the 16-bit B, G and R inputs; they are stringized into
        // the instruction text.
        //   Reads:  v20..v24 (coefficients) and v25 (value added before the
        //           narrowing shift).
        //   Writes: v3 and v4 as scratch; v0.8b = U and v1.8b = V on exit.
        // U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8 and
        // V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8, with the narrowing
        // shift saturating to 8 bits.  The inputs are only read by the mul/mls
        // lines, before v0 and v1 are written, so v0 and v1 may be passed as
        // inputs; v3 and v4 may not.
   1628 #define RGBTOUV(QB, QG, QR) \
   1629     "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
   1630     "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
   1631     "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
   1632     "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
   1633     "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
   1634     "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
   1635     "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
   1636     "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
   1637     "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
   1638     "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
   1639 
   1640 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
   1641 // TODO(fbarchard): consider ptrdiff_t for all strides.
   1642 
   1643 #ifdef HAS_ARGBTOUVROW_NEON
        // Produces 8 U and 8 V bytes from a 16x2 block of ARGB pixels per loop
        // iteration, one U/V pair per 2x2 pixels.  The second row is read from
        // src_argb + src_stride_argb.  For each channel, uaddlp (row 0) and
        // uadalp (row 1) add the four bytes of every 2x2 block into a 16-bit
        // lane; the rounding shift right by 1 leaves twice the average, which is
        // passed to RGBTOUV.  pix is decremented by 16 per iteration and the loop
        // repeats while the result is > 0, so the body always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1644 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
   1645                       uint8* dst_u, uint8* dst_v, int pix) {
   1646   const uint8* src_argb_1 = src_argb + src_stride_argb;
   1647   asm volatile (
   1648     RGBTOUV_SETUP_REG
   1649   "1:                                          \n"
   1650     MEMACCESS(0)
   1651     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1652     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1653     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1654     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1655 
   1656     MEMACCESS(1)
   1657     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
   1658     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
   1659     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1660     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
   1661 
   1662     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1663     "urshr      v1.8h, v1.8h, #1               \n"
   1664     "urshr      v2.8h, v2.8h, #1               \n"
   1665 
   1666     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows (32 pixels) per loop.
   1667     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1668     MEMACCESS(2)
   1669     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1670     MEMACCESS(3)
   1671     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1672     "b.gt       1b                             \n"
   1673   : "+r"(src_argb),  // %0
   1674     "+r"(src_argb_1),  // %1
   1675     "+r"(dst_u),     // %2
   1676     "+r"(dst_v),     // %3
   1677     "+r"(pix)        // %4
   1678   :
   1679   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1680     "v20", "v21", "v22", "v23", "v24", "v25"
   1681   );
   1682 }
   1683 #endif  // HAS_ARGBTOUVROW_NEON
   1684 
   1685 // TODO(fbarchard): Subsample match C code.
        // Produces 8 U and 8 V bytes from a 16x2 block of ARGB pixels per loop
        // iteration, one U/V pair per 2x2 pixels, using the coefficients loaded
        // below instead of RGBTOUV_SETUP_REG.  The second row is read from
        // src_argb + src_stride_argb.  For each channel, uaddlp (row 0) and
        // uadalp (row 1) add the four bytes of every 2x2 block into a 16-bit
        // lane; the rounding shift right by 1 leaves twice the average, which is
        // why the coefficients are stored at half scale.  pix is decremented by
        // 16 per iteration and the loop repeats while the result is > 0, so the
        // body always runs at least once.
   1686 #ifdef HAS_ARGBTOUVJROW_NEON
   1687 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
   1688                        uint8* dst_u, uint8* dst_v, int pix) {
   1689   const uint8* src_argb_1 = src_argb + src_stride_argb;
   1690   asm volatile (
   1691     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
   1692     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
   1693     "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
   1694     "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
   1695     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
   1696     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   1697   "1:                                          \n"
   1698     MEMACCESS(0)
   1699     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1700     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1701     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1702     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1703     MEMACCESS(1)
   1704     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
   1705     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
   1706     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1707     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
   1708 
   1709     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1710     "urshr      v1.8h, v1.8h, #1               \n"
   1711     "urshr      v2.8h, v2.8h, #1               \n"
   1712 
   1713     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows (32 pixels) per loop.
   1714     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1715     MEMACCESS(2)
   1716     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1717     MEMACCESS(3)
   1718     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1719     "b.gt       1b                             \n"
   1720   : "+r"(src_argb),  // %0
   1721     "+r"(src_argb_1),  // %1
   1722     "+r"(dst_u),     // %2
   1723     "+r"(dst_v),     // %3
   1724     "+r"(pix)        // %4
   1725   :
   1726   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1727     "v20", "v21", "v22", "v23", "v24", "v25"
   1728   );
   1729 }
   1730 #endif  // HAS_ARGBTOUVJROW_NEON
   1731 
   1732 #ifdef HAS_BGRATOUVROW_NEON
        // Produces 8 U and 8 V bytes from a 16x2 block of BGRA pixels per loop
        // iteration, one U/V pair per 2x2 pixels.  The second row is read from
        // src_bgra + src_stride_bgra.  Same scheme as the other 16x2 UV
        // functions, except that the channels are taken from byte positions
        // 3 (B), 2 (G) and 1 (R) of each pixel.  pix is decremented by 16 per
        // iteration and the loop repeats while the result is > 0, so the body
        // always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1733 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
   1734                       uint8* dst_u, uint8* dst_v, int pix) {
   1735   const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
   1736   asm volatile (
   1737     RGBTOUV_SETUP_REG
   1738   "1:                                          \n"
   1739     MEMACCESS(0)
   1740     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
            // Order matters: each uaddlp overwrites a register whose loaded
            // contents are either unused (v0) or were consumed by the line above.
   1741     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
   1742     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
   1743     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
   1744     MEMACCESS(1)
   1745     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
   1746     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
   1747     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
   1748     "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
   1749 
   1750     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1751     "urshr      v1.8h, v3.8h, #1               \n"  // G moves from v3 to v1 here
   1752     "urshr      v2.8h, v2.8h, #1               \n"
   1753 
   1754     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows (32 pixels) per loop.
   1755     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1756     MEMACCESS(2)
   1757     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1758     MEMACCESS(3)
   1759     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1760     "b.gt       1b                             \n"
   1761   : "+r"(src_bgra),  // %0
   1762     "+r"(src_bgra_1),  // %1
   1763     "+r"(dst_u),     // %2
   1764     "+r"(dst_v),     // %3
   1765     "+r"(pix)        // %4
   1766   :
   1767   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1768     "v20", "v21", "v22", "v23", "v24", "v25"
   1769   );
   1770 }
   1771 #endif  // HAS_BGRATOUVROW_NEON
   1772 
   1773 #ifdef HAS_ABGRTOUVROW_NEON
        // Produces 8 U and 8 V bytes from a 16x2 block of ABGR pixels per loop
        // iteration, one U/V pair per 2x2 pixels.  The second row is read from
        // src_abgr + src_stride_abgr.  Same scheme as the other 16x2 UV
        // functions, except that the channels are taken from byte positions
        // 2 (B), 1 (G) and 0 (R) of each pixel.  pix is decremented by 16 per
        // iteration and the loop repeats while the result is > 0, so the body
        // always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1774 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
   1775                       uint8* dst_u, uint8* dst_v, int pix) {
   1776   const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
   1777   asm volatile (
   1778     RGBTOUV_SETUP_REG
   1779   "1:                                          \n"
   1780     MEMACCESS(0)
   1781     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
            // Order matters: each uaddlp overwrites a register whose loaded
            // contents are either unused (v3) or were consumed by the line above.
   1782     "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
   1783     "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1784     "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
   1785     MEMACCESS(1)
   1786     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
   1787     "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
   1788     "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1789     "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
   1790 
   1791     "urshr      v0.8h, v3.8h, #1               \n"  // 2x average (B moves from v3 to v0)
   1792     "urshr      v2.8h, v2.8h, #1               \n"
   1793     "urshr      v1.8h, v1.8h, #1               \n"
   1794 
   1795     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows (32 pixels) per loop.
   1796     RGBTOUV(v0.8h, v2.8h, v1.8h)
   1797     MEMACCESS(2)
   1798     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1799     MEMACCESS(3)
   1800     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1801     "b.gt       1b                             \n"
   1802   : "+r"(src_abgr),  // %0
   1803     "+r"(src_abgr_1),  // %1
   1804     "+r"(dst_u),     // %2
   1805     "+r"(dst_v),     // %3
   1806     "+r"(pix)        // %4
   1807   :
   1808   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1809     "v20", "v21", "v22", "v23", "v24", "v25"
   1810   );
   1811 }
   1812 #endif  // HAS_ABGRTOUVROW_NEON
   1813 
   1814 #ifdef HAS_RGBATOUVROW_NEON
        // Produces 8 U and 8 V bytes from a 16x2 block of RGBA pixels per loop
        // iteration, one U/V pair per 2x2 pixels.  The second row is read from
        // src_rgba + src_stride_rgba.  Same scheme as the other 16x2 UV
        // functions, except that the channels are taken from byte positions
        // 1 (B), 2 (G) and 3 (R) of each pixel.  pix is decremented by 16 per
        // iteration and the loop repeats while the result is > 0, so the body
        // always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1815 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
   1816                       uint8* dst_u, uint8* dst_v, int pix) {
   1817   const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
   1818   asm volatile (
   1819     RGBTOUV_SETUP_REG
   1820   "1:                                          \n"
   1821     MEMACCESS(0)
   1822     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
            // Order matters: each uaddlp overwrites a register whose loaded
            // contents are either unused (v0) or were consumed by the line above.
   1823     "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
   1824     "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
   1825     "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
   1826     MEMACCESS(1)
   1827     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
   1828     "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
   1829     "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
   1830     "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
   1831 
   1832     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1833     "urshr      v1.8h, v1.8h, #1               \n"
   1834     "urshr      v2.8h, v2.8h, #1               \n"
   1835 
   1836     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows (32 pixels) per loop.
   1837     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1838     MEMACCESS(2)
   1839     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1840     MEMACCESS(3)
   1841     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1842     "b.gt       1b                             \n"
   1843   : "+r"(src_rgba),  // %0
   1844     "+r"(src_rgba_1),  // %1
   1845     "+r"(dst_u),     // %2
   1846     "+r"(dst_v),     // %3
   1847     "+r"(pix)        // %4
   1848   :
   1849   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1850     "v20", "v21", "v22", "v23", "v24", "v25"
   1851   );
   1852 }
   1853 #endif  // HAS_RGBATOUVROW_NEON
   1854 
   1855 #ifdef HAS_RGB24TOUVROW_NEON
        // Produces 8 U and 8 V bytes from a 16x2 block of 3-byte RGB24 pixels
        // per loop iteration, one U/V pair per 2x2 pixels.  Each row advances by
        // 48 bytes per iteration; the second row is read from
        // src_rgb24 + src_stride_rgb24.  Channels are taken from byte positions
        // 0 (B), 1 (G) and 2 (R) of each pixel.  pix is decremented by 16 per
        // iteration and the loop repeats while the result is > 0, so the body
        // always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1856 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
   1857                        uint8* dst_u, uint8* dst_v, int pix) {
   1858   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   1859   asm volatile (
   1860     RGBTOUV_SETUP_REG
   1861   "1:                                          \n"
   1862     MEMACCESS(0)
   1863     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
   1864     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1865     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1866     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1867     MEMACCESS(1)
   1868     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
   1869     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
   1870     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1871     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
   1872 
   1873     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1874     "urshr      v1.8h, v1.8h, #1               \n"
   1875     "urshr      v2.8h, v2.8h, #1               \n"
   1876 
   1877     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows (32 pixels) per loop.
   1878     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1879     MEMACCESS(2)
   1880     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1881     MEMACCESS(3)
   1882     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1883     "b.gt       1b                             \n"
   1884   : "+r"(src_rgb24),  // %0
   1885     "+r"(src_rgb24_1),  // %1
   1886     "+r"(dst_u),     // %2
   1887     "+r"(dst_v),     // %3
   1888     "+r"(pix)        // %4
   1889   :
   1890   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1891     "v20", "v21", "v22", "v23", "v24", "v25"
   1892   );
   1893 }
   1894 #endif  // HAS_RGB24TOUVROW_NEON
   1895 
   1896 #ifdef HAS_RAWTOUVROW_NEON
        // Produces 8 U and 8 V bytes from a 16x2 block of 3-byte RAW pixels per
        // loop iteration, one U/V pair per 2x2 pixels.  Each row advances by 48
        // bytes per iteration; the second row is read from
        // src_raw + src_stride_raw.  Channels are taken from byte positions
        // 2 (B), 1 (G) and 0 (R) of each pixel, so RGBTOUV is invoked with v2 as
        // B and v0 as R.  pix is decremented by 16 per iteration and the loop
        // repeats while the result is > 0, so the body always runs at least once.
        // NOTE(review): RGBTOUV_SETUP_REG is defined outside this view; from its
        // use here it is expected to load half-scale coefficients into v20..v24
        // and the rounding/offset bias into v25 -- confirm.
   1897 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
   1898                      uint8* dst_u, uint8* dst_v, int pix) {
   1899   const uint8* src_raw_1 = src_raw + src_stride_raw;
   1900   asm volatile (
   1901     RGBTOUV_SETUP_REG
   1902   "1:                                          \n"
   1903     MEMACCESS(0)
   1904     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
   1905     "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
   1906     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1907     "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
   1908     MEMACCESS(1)
   1909     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
   1910     "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
   1911     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1912     "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
   1913 
   1914     "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
   1915     "urshr      v1.8h, v1.8h, #1               \n"
   1916     "urshr      v0.8h, v0.8h, #1               \n"
   1917 
   1918     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows (32 pixels) per loop.
   1919     RGBTOUV(v2.8h, v1.8h, v0.8h)
   1920     MEMACCESS(2)
   1921     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1922     MEMACCESS(3)
   1923     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1924     "b.gt       1b                             \n"
   1925   : "+r"(src_raw),  // %0
   1926     "+r"(src_raw_1),  // %1
   1927     "+r"(dst_u),     // %2
   1928     "+r"(dst_v),     // %3
   1929     "+r"(pix)        // %4
   1930   :
   1931   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1932     "v20", "v21", "v22", "v23", "v24", "v25"
   1933   );
   1934 }
   1935 #endif  // HAS_RAWTOUVROW_NEON
   1936 
   1937 // 16x2 pixels -> 8x1.  pix is number of RGB565 pixels. e.g. 16.
        // Produces one row of U and one row of V from two rows of RGB565.  Each loop
        // iteration reads 16 pixels (32 bytes) from each of the two rows, sums every
        // 2x2 block per channel, halves the sum with rounding, and computes
        //   U = sat8((56 * B - 37 * G - 19 * R + 0x8080) >> 8)
        //   V = sat8((56 * R - 47 * G -  9 * B + 0x8080) >> 8)
        // on those doubled averages, storing 8 U bytes and 8 V bytes.
        //   src_rgb565        - first source row.
        //   src_stride_rgb565 - byte offset from src_rgb565 to the second source row.
        //   dst_u, dst_v      - destinations; each receives 8 bytes per iteration.
        //   pix               - source pixel count; decremented by 16 per iteration.
        // NOTE(review): the loop only handles whole groups of 16, so pix is presumably
        // a positive multiple of 16 - confirm against callers.
        // NOTE(review): RGB565TOARGB is defined outside this view; per the comments
        // below it presumably leaves B in v0, G in v1 and R in v2 - confirm.
   1938 #ifdef HAS_RGB565TOUVROW_NEON
   1939 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
   1940                         uint8* dst_u, uint8* dst_v, int pix) {
   1941   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second source row.
   1942   asm volatile (
   1943     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
   1944     "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
   1945     "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
   1946     "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
   1947     "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
   1948     "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   1949   "1:                                          \n"
   1950     MEMACCESS(0)
   1951     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
   1952     RGB565TOARGB
   1953     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1954     "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1955     "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1956     MEMACCESS(0)
   1957     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
   1958     RGB565TOARGB
   1959     "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1960     "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1961     "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1962 
   1963     MEMACCESS(1)
   1964     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels of second row.
   1965     RGB565TOARGB
   1966     "uadalp     v16.4h, v0.8b                  \n"  // B += pair sums of second row.
   1967     "uadalp     v18.4h, v1.8b                  \n"  // G += pair sums of second row.
   1968     "uadalp     v20.4h, v2.8b                  \n"  // R += pair sums of second row.
   1969     MEMACCESS(1)
   1970     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels of second row.
   1971     RGB565TOARGB
   1972     "uadalp     v17.4h, v0.8b                  \n"  // B += pair sums of second row.
   1973     "uadalp     v19.4h, v1.8b                  \n"  // G += pair sums of second row.
   1974     "uadalp     v21.4h, v2.8b                  \n"  // R += pair sums of second row.
   1975 
   1976     "ins        v16.D[1], v17.D[0]             \n"  // B: join both groups of 4 sums.
   1977     "ins        v18.D[1], v19.D[0]             \n"  // G: join both groups of 4 sums.
   1978     "ins        v20.D[1], v21.D[0]             \n"  // R: join both groups of 4 sums.
   1979 
   1980     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
   1981     "urshr      v5.8h, v18.8h, #1              \n"
   1982     "urshr      v6.8h, v20.8h, #1              \n"
   1983 
   1984     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
   1985     "mul        v16.8h, v4.8h, v22.8h          \n"  // B
   1986     "mls        v16.8h, v5.8h, v23.8h          \n"  // G
   1987     "mls        v16.8h, v6.8h, v24.8h          \n"  // R
   1988     "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
   1989     "mul        v17.8h, v6.8h, v22.8h          \n"  // R
   1990     "mls        v17.8h, v5.8h, v26.8h          \n"  // G
   1991     "mls        v17.8h, v4.8h, v25.8h          \n"  // B
   1992     "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
   1993     "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
   1994     "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
   1995     MEMACCESS(2)
   1996     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1997     MEMACCESS(3)
   1998     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1999     "b.gt       1b                             \n"  // loop while pixels remain.
   2000   : "+r"(src_rgb565),  // %0
   2001     "+r"(src_rgb565_1),  // %1
   2002     "+r"(dst_u),     // %2
   2003     "+r"(dst_v),     // %3
   2004     "+r"(pix)        // %4
   2005   :
   2006   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   2007     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
   2008     "v25", "v26", "v27"
   2009   );
   2010 }
   2011 #endif  // HAS_RGB565TOUVROW_NEON
   2012 
   2013 // 16x2 pixels -> 8x1.  pix is number of ARGB1555 pixels. e.g. 16.
        // Produces one row of U and one row of V from two rows of ARGB1555.  Each loop
        // iteration reads 16 pixels (32 bytes) from each of the two rows, sums every
        // 2x2 block per channel, halves the sum with rounding, and stores 8 U bytes
        // and 8 V bytes.
        //   src_argb1555        - first source row.
        //   src_stride_argb1555 - byte offset from src_argb1555 to the second row.
        //   dst_u, dst_v        - destinations; each receives 8 bytes per iteration.
        //   pix                 - source pixel count; decremented by 16 per iteration.
        // NOTE(review): the loop only handles whole groups of 16, so pix is presumably
        // a positive multiple of 16 - confirm against callers.
        // NOTE(review): RGBTOUV_SETUP_REG and RGB555TOARGB are defined outside this
        // view.  The arithmetic below reads v20-v25 as coefficients/bias and v0-v2 as
        // B/G/R, so presumably those macros set them up that way - confirm.
   2014 #ifdef HAS_ARGB1555TOUVROW_NEON
   2015 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
   2016                         uint8* dst_u, uint8* dst_v, int pix) {
   2017   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // second source row.
   2018   asm volatile (
   2019     RGBTOUV_SETUP_REG
   2020   "1:                                          \n"
   2021     MEMACCESS(0)
   2022     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
   2023     RGB555TOARGB
   2024     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   2025     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   2026     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   2027     MEMACCESS(0)
   2028     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
   2029     RGB555TOARGB
   2030     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   2031     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   2032     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   2033 
   2034     MEMACCESS(1)
   2035     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels of second row.
   2036     RGB555TOARGB
   2037     "uadalp     v16.4h, v0.8b                  \n"  // B += pair sums of second row.
   2038     "uadalp     v17.4h, v1.8b                  \n"  // G += pair sums of second row.
   2039     "uadalp     v18.4h, v2.8b                  \n"  // R += pair sums of second row.
   2040     MEMACCESS(1)
   2041     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels of second row.
   2042     RGB555TOARGB
   2043     "uadalp     v26.4h, v0.8b                  \n"  // B += pair sums of second row.
   2044     "uadalp     v27.4h, v1.8b                  \n"  // G += pair sums of second row.
   2045     "uadalp     v28.4h, v2.8b                  \n"  // R += pair sums of second row.
   2046 
   2047     "ins        v16.D[1], v26.D[0]             \n"  // B: join both groups of 4 sums.
   2048     "ins        v17.D[1], v27.D[0]             \n"  // G: join both groups of 4 sums.
   2049     "ins        v18.D[1], v28.D[0]             \n"  // R: join both groups of 4 sums.
   2050 
   2051     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
   2052     "urshr      v5.8h, v17.8h, #1              \n"
   2053     "urshr      v6.8h, v18.8h, #1              \n"
   2054 
   2055     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
   2056     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
   2057     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
   2058     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
   2059     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
   2060     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
   2061     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
   2062     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
   2063     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
   2064     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
   2065     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
   2066     MEMACCESS(2)
   2067     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   2068     MEMACCESS(3)
   2069     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   2070     "b.gt       1b                             \n"  // loop while pixels remain.
   2071   : "+r"(src_argb1555),  // %0
   2072     "+r"(src_argb1555_1),  // %1
   2073     "+r"(dst_u),     // %2
   2074     "+r"(dst_v),     // %3
   2075     "+r"(pix)        // %4
   2076   :
   2077   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
   2078     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
   2079     "v26", "v27", "v28"
   2080   );
   2081 }
   2082 #endif  // HAS_ARGB1555TOUVROW_NEON
   2083 
   2084 // 16x2 pixels -> 8x1.  pix is number of ARGB4444 pixels. e.g. 16.
        // Produces one row of U and one row of V from two rows of ARGB4444.  Each loop
        // iteration reads 16 pixels (32 bytes) from each of the two rows, sums every
        // 2x2 block per channel, halves the sum with rounding, and stores 8 U bytes
        // and 8 V bytes.
        //   src_argb4444        - first source row.
        //   src_stride_argb4444 - byte offset from src_argb4444 to the second row.
        //   dst_u, dst_v        - destinations; each receives 8 bytes per iteration.
        //   pix                 - source pixel count; decremented by 16 per iteration.
        // NOTE(review): the loop only handles whole groups of 16, so pix is presumably
        // a positive multiple of 16 - confirm against callers.
        // NOTE(review): RGBTOUV_SETUP_REG and ARGB4444TOARGB are defined outside this
        // view.  The arithmetic below reads v20-v25 as coefficients/bias and v0-v2 as
        // B/G/R, so presumably those macros set them up that way - confirm.
   2085 #ifdef HAS_ARGB4444TOUVROW_NEON
   2086 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
   2087                           uint8* dst_u, uint8* dst_v, int pix) {
   2088   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // second source row.
   2089   asm volatile (
   2090     RGBTOUV_SETUP_REG
   2091   "1:                                          \n"
   2092     MEMACCESS(0)
   2093     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
   2094     ARGB4444TOARGB
   2095     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   2096     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   2097     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   2098     MEMACCESS(0)
   2099     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
   2100     ARGB4444TOARGB
   2101     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   2102     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   2103     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   2104 
   2105     MEMACCESS(1)
   2106     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels of second row.
   2107     ARGB4444TOARGB
   2108     "uadalp     v16.4h, v0.8b                  \n"  // B += pair sums of second row.
   2109     "uadalp     v17.4h, v1.8b                  \n"  // G += pair sums of second row.
   2110     "uadalp     v18.4h, v2.8b                  \n"  // R += pair sums of second row.
   2111     MEMACCESS(1)
   2112     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels of second row.
   2113     ARGB4444TOARGB
   2114     "uadalp     v26.4h, v0.8b                  \n"  // B += pair sums of second row.
   2115     "uadalp     v27.4h, v1.8b                  \n"  // G += pair sums of second row.
   2116     "uadalp     v28.4h, v2.8b                  \n"  // R += pair sums of second row.
   2117 
   2118     "ins        v16.D[1], v26.D[0]             \n"  // B: join both groups of 4 sums.
   2119     "ins        v17.D[1], v27.D[0]             \n"  // G: join both groups of 4 sums.
   2120     "ins        v18.D[1], v28.D[0]             \n"  // R: join both groups of 4 sums.
   2121 
   2122     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
   2123     "urshr      v5.8h, v17.8h, #1              \n"
   2124     "urshr      v6.8h, v18.8h, #1              \n"
   2125 
   2126     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
   2127     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
   2128     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
   2129     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
   2130     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
   2131     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
   2132     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
   2133     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
   2134     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
   2135     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
   2136     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
   2137     MEMACCESS(2)
   2138     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   2139     MEMACCESS(3)
   2140     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   2141     "b.gt       1b                             \n"  // loop while pixels remain.
   2142   : "+r"(src_argb4444),  // %0
   2143     "+r"(src_argb4444_1),  // %1
   2144     "+r"(dst_u),     // %2
   2145     "+r"(dst_v),     // %3
   2146     "+r"(pix)        // %4
   2147   :
   2148   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
   2149     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
   2150     "v26", "v27", "v28"
   2151 
   2152   );
   2153 }
   2154 #endif  // HAS_ARGB4444TOUVROW_NEON
   2155 
   2156 #ifdef HAS_RGB565TOYROW_NEON
        // Converts a row of RGB565 pixels to Y, 8 pixels (16 source bytes) per loop
        // iteration:  Y = ((13 * B + 65 * G + 33 * R) >> 7, rounded and saturated to
        // 8 bits) + 16, with the final add saturating at 255.
        //   src_rgb565 - source row; advanced 16 bytes per iteration.
        //   dst_y      - destination row; receives 8 bytes per iteration.
        //   pix        - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
        // NOTE(review): RGB565TOARGB is defined outside this view; presumably it
        // leaves B in v0, G in v1, R in v2 and uses v4/v6 as scratch (they are in the
        // clobber list but not named in this body) - confirm.
   2157 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
   2158   asm volatile (
   2159     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
   2160     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
   2161     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
   2162     "movi       v27.8b, #16                    \n"  // Add 16 constant
   2163   "1:                                          \n"
   2164     MEMACCESS(0)
   2165     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
   2166     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2167     RGB565TOARGB
   2168     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
   2169     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
   2170     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
   2171     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
   2172     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
   2173     MEMACCESS(1)
   2174     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2175     "b.gt       1b                             \n"  // loop while pixels remain.
   2176   : "+r"(src_rgb565),  // %0
   2177     "+r"(dst_y),       // %1
   2178     "+r"(pix)          // %2
   2179   :
   2180   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
   2181     "v24", "v25", "v26", "v27"
   2182   );
   2183 }
   2184 #endif  // HAS_RGB565TOYROW_NEON
   2185 
   2186 #ifdef HAS_ARGB1555TOYROW_NEON
        // Converts a row of ARGB1555 pixels to Y, 8 pixels (16 source bytes) per loop
        // iteration:  Y = ((13 * B + 65 * G + 33 * R) >> 7, rounded and saturated to
        // 8 bits) + 16, with the final add saturating at 255.
        //   src_argb1555 - source row; advanced 16 bytes per iteration.
        //   dst_y        - destination row; receives 8 bytes per iteration.
        //   pix          - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
        // NOTE(review): ARGB1555TOARGB is defined outside this view; presumably it
        // leaves B in v0, G in v1, R in v2.  The coefficients live in v4-v7 across
        // the macro, so it must not overwrite those registers - confirm.
   2187 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
   2188   asm volatile (
   2189     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   2190     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2191     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   2192     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2193   "1:                                          \n"
   2194     MEMACCESS(0)
   2195     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
   2196     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2197     ARGB1555TOARGB
   2198     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
   2199     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
   2200     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
   2201     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
   2202     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
   2203     MEMACCESS(1)
   2204     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2205     "b.gt       1b                             \n"  // loop while pixels remain.
   2206   : "+r"(src_argb1555),  // %0
   2207     "+r"(dst_y),         // %1
   2208     "+r"(pix)            // %2
   2209   :
   2210   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   2211   );
   2212 }
   2213 #endif  // HAS_ARGB1555TOYROW_NEON
   2214 
   2215 #ifdef HAS_ARGB4444TOYROW_NEON
        // Converts a row of ARGB4444 pixels to Y, 8 pixels (16 source bytes) per loop
        // iteration:  Y = ((13 * B + 65 * G + 33 * R) >> 7, rounded and saturated to
        // 8 bits) + 16, with the final add saturating at 255.
        //   src_argb4444 - source row; advanced 16 bytes per iteration.
        //   dst_y        - destination row; receives 8 bytes per iteration.
        //   pix          - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
        // NOTE(review): ARGB4444TOARGB is defined outside this view; presumably it
        // leaves B in v0, G in v1, R in v2 - confirm, and check that every register
        // it writes is covered by the clobber list.
   2216 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
   2217   asm volatile (
   2218     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
   2219     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
   2220     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
   2221     "movi       v27.8b, #16                    \n"  // Add 16 constant
   2222   "1:                                          \n"
   2223     MEMACCESS(0)
   2224     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
   2225     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2226     ARGB4444TOARGB
   2227     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
   2228     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
   2229     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
   2230     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
   2231     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
   2232     MEMACCESS(1)
   2233     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2234     "b.gt       1b                             \n"  // loop while pixels remain.
   2235   : "+r"(src_argb4444),  // %0
   2236     "+r"(dst_y),         // %1
   2237     "+r"(pix)            // %2
   2238   :
   2239   : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
   2240   );
   2241 }
   2242 #endif  // HAS_ARGB4444TOYROW_NEON
   2243 
   2244 #ifdef HAS_BGRATOYROW_NEON
        // Converts a row of BGRA pixels (4 bytes each) to Y, 8 pixels per loop
        // iteration.  The first byte of each pixel (v0) is ignored; bytes 1, 2, 3 are
        // weighted as R, G, B:
        //   Y = ((33 * R + 65 * G + 13 * B) >> 7, rounded and saturated to 8 bits)
        //       + 16, with the final add saturating at 255.
        //   src_bgra - source row; advanced 32 bytes per iteration.
        //   dst_y    - destination row; receives 8 bytes per iteration.
        //   pix      - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
   2245 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
   2246   asm volatile (
   2247     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
   2248     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2249     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
   2250     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2251   "1:                                          \n"
   2252     MEMACCESS(0)
   2253     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
   2254     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2255     "umull      v16.8h, v1.8b, v4.8b           \n"  // R
   2256     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
   2257     "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
   2258     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2259     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
   2260     MEMACCESS(1)
   2261     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2262     "b.gt       1b                             \n"  // loop while pixels remain.
   2263   : "+r"(src_bgra),  // %0
   2264     "+r"(dst_y),     // %1
   2265     "+r"(pix)        // %2
   2266   :
   2267   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2268   );
   2269 }
   2270 #endif  // HAS_BGRATOYROW_NEON
   2271 
   2272 #ifdef HAS_ABGRTOYROW_NEON
        // Converts a row of ABGR pixels (4 bytes each) to Y, 8 pixels per loop
        // iteration.  Bytes 0, 1, 2 of each pixel are weighted as R, G, B; the fourth
        // byte (v3) is ignored:
        //   Y = ((33 * R + 65 * G + 13 * B) >> 7, rounded and saturated to 8 bits)
        //       + 16, with the final add saturating at 255.
        //   src_abgr - source row; advanced 32 bytes per iteration.
        //   dst_y    - destination row; receives 8 bytes per iteration.
        //   pix      - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
   2273 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
   2274   asm volatile (
   2275     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
   2276     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2277     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
   2278     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2279   "1:                                          \n"
   2280     MEMACCESS(0)
   2281     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
   2282     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2283     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
   2284     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
   2285     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
   2286     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2287     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
   2288     MEMACCESS(1)
   2289     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2290     "b.gt       1b                             \n"  // loop while pixels remain.
   2291   : "+r"(src_abgr),  // %0
   2292     "+r"(dst_y),     // %1
   2293     "+r"(pix)        // %2
   2294   :
   2295   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2296   );
   2297 }
   2298 #endif  // HAS_ABGRTOYROW_NEON
   2299 
   2300 #ifdef HAS_RGBATOYROW_NEON
        // Converts a row of RGBA pixels (4 bytes each) to Y, 8 pixels per loop
        // iteration.  The first byte of each pixel (v0) is ignored; bytes 1, 2, 3 are
        // weighted as B, G, R:
        //   Y = ((13 * B + 65 * G + 33 * R) >> 7, rounded and saturated to 8 bits)
        //       + 16, with the final add saturating at 255.
        //   src_rgba - source row; advanced 32 bytes per iteration.
        //   dst_y    - destination row; receives 8 bytes per iteration.
        //   pix      - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
   2301 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
   2302   asm volatile (
   2303     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   2304     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2305     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   2306     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2307   "1:                                          \n"
   2308     MEMACCESS(0)
   2309     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
   2310     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2311     "umull      v16.8h, v1.8b, v4.8b           \n"  // B
   2312     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
   2313     "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
   2314     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2315     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
   2316     MEMACCESS(1)
   2317     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2318     "b.gt       1b                             \n"  // loop while pixels remain.
   2319   : "+r"(src_rgba),  // %0
   2320     "+r"(dst_y),     // %1
   2321     "+r"(pix)        // %2
   2322   :
   2323   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2324   );
   2325 }
   2326 #endif  // HAS_RGBATOYROW_NEON
   2327 
   2328 #ifdef HAS_RGB24TOYROW_NEON
        // Converts a row of RGB24 pixels (3 bytes each) to Y, 8 pixels per loop
        // iteration.  Bytes 0, 1, 2 of each pixel are weighted as B, G, R:
        //   Y = ((13 * B + 65 * G + 33 * R) >> 7, rounded and saturated to 8 bits)
        //       + 16, with the final add saturating at 255.
        //   src_rgb24 - source row; advanced 24 bytes per iteration.
        //   dst_y     - destination row; receives 8 bytes per iteration.
        //   pix       - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
   2329 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
   2330   asm volatile (
   2331     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   2332     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2333     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   2334     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2335   "1:                                          \n"
   2336     MEMACCESS(0)
   2337     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
   2338     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2339     "umull      v16.8h, v0.8b, v4.8b           \n"  // B
   2340     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
   2341     "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
   2342     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2343     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
   2344     MEMACCESS(1)
   2345     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2346     "b.gt       1b                             \n"  // loop while pixels remain.
   2347   : "+r"(src_rgb24),  // %0
   2348     "+r"(dst_y),      // %1
   2349     "+r"(pix)         // %2
   2350   :
   2351   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2352   );
   2353 }
   2354 #endif  // HAS_RGB24TOYROW_NEON
   2355 
   2356 #ifdef HAS_RAWTOYROW_NEON
        // Converts a row of RAW pixels (3 bytes each) to Y, 8 pixels per loop
        // iteration.  Bytes 0, 1, 2 of each pixel are weighted as R, G, B:
        //   Y = ((33 * R + 65 * G + 13 * B) >> 7, rounded and saturated to 8 bits)
        //       + 16, with the final add saturating at 255.
        //   src_raw - source row; advanced 24 bytes per iteration.
        //   dst_y   - destination row; receives 8 bytes per iteration.
        //   pix     - pixel count; decremented by 8 per iteration.
        // NOTE(review): the loop only handles whole groups of 8, so pix is presumably
        // a positive multiple of 8 - confirm against callers.
   2357 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
   2358   asm volatile (
   2359     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
   2360     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2361     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
   2362     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2363   "1:                                          \n"
   2364     MEMACCESS(0)
   2365     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
   2366     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2367     "umull      v16.8h, v0.8b, v4.8b           \n"  // R (v4 holds the R coefficient)
   2368     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
   2369     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B (v6 holds the B coefficient)
   2370     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2371     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
   2372     MEMACCESS(1)
   2373     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2374     "b.gt       1b                             \n"  // loop while pixels remain.
   2375   : "+r"(src_raw),  // %0
   2376     "+r"(dst_y),    // %1
   2377     "+r"(pix)       // %2
   2378   :
   2379   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2380   );
   2381 }
   2382 #endif  // HAS_RAWTOYROW_NEON
   2383 
   2384 // Bilinear filter 16x2 -> 16x1
        // Blends two rows of bytes vertically, 16 bytes per loop iteration.
        //   dst_ptr           - destination row; receives 16 bytes per iteration.
        //   src_ptr           - first source row.
        //   src_stride        - byte offset from src_ptr to the second source row.
        //   dst_width         - byte count; decremented by 16 per iteration.
        //   source_y_fraction - weight of the second row out of 256.
        // Fractions 0, 64, 128 and 192 take dedicated paths (copy, or one or two
        // rounding-halving adds); any other value uses the general path
        //   dst = (src * (256 - f) + src1 * f + 128) >> 8.
        // NOTE(review): every path handles whole groups of 16, so dst_width is
        // presumably a positive multiple of 16 - confirm against callers.
   2385 #ifdef HAS_INTERPOLATEROW_NEON
   2386 void InterpolateRow_NEON(uint8* dst_ptr,
   2387                          const uint8* src_ptr, ptrdiff_t src_stride,
   2388                          int dst_width, int source_y_fraction) {
   2389   int y1_fraction = source_y_fraction;
   2390   int y0_fraction = 256 - y1_fraction;
   2391   const uint8* src_ptr1 = src_ptr + src_stride;
   2392   asm volatile (
   2393     "cmp        %w4, #0                        \n"
   2394     "b.eq       100f                           \n"
   2395     "cmp        %w4, #64                       \n"
   2396     "b.eq       75f                            \n"
   2397     "cmp        %w4, #128                      \n"
   2398     "b.eq       50f                            \n"
   2399     "cmp        %w4, #192                      \n"
   2400     "b.eq       25f                            \n"
   2401 
        // Only the low byte of each fraction is broadcast.  y0_fraction == 256 would
        // wrap to 0, but that only happens for fraction 0, which branched to 100f.
   2402     "dup        v5.16b, %w4                    \n"
   2403     "dup        v4.16b, %w5                    \n"
   2404     // General purpose row blend.
   2405   "1:                                          \n"
   2406     MEMACCESS(1)
   2407     "ld1        {v0.16b}, [%1], #16            \n"
   2408     MEMACCESS(2)
   2409     "ld1        {v1.16b}, [%2], #16            \n"
   2410     "subs       %w3, %w3, #16                  \n"
   2411     "umull      v2.8h, v0.8b,  v4.8b           \n"  // src * y0, low 8 bytes.
   2412     "umull2     v3.8h, v0.16b, v4.16b          \n"  // src * y0, high 8 bytes.
   2413     "umlal      v2.8h, v1.8b,  v5.8b           \n"  // += src1 * y1, low 8 bytes.
   2414     "umlal2     v3.8h, v1.16b, v5.16b          \n"  // += src1 * y1, high 8 bytes.
   2415     "rshrn      v0.8b,  v2.8h, #8              \n"  // rounded >> 8, narrow to bytes.
   2416     "rshrn2     v0.16b, v3.8h, #8              \n"
   2417     MEMACCESS(0)
   2418     "st1        {v0.16b}, [%0], #16            \n"
   2419     "b.gt       1b                             \n"
   2420     "b          99f                            \n"
   2421 
   2422     // Blend 25 / 75.
   2423   "25:                                         \n"
   2424     MEMACCESS(1)
   2425     "ld1        {v0.16b}, [%1], #16            \n"
   2426     MEMACCESS(2)
   2427     "ld1        {v1.16b}, [%2], #16            \n"
   2428     "subs       %w3, %w3, #16                  \n"
   2429     "urhadd     v0.16b, v0.16b, v1.16b         \n"  // 50 / 50
   2430     "urhadd     v0.16b, v0.16b, v1.16b         \n"  // averaged with src1 again -> 25 / 75
   2431     MEMACCESS(0)
   2432     "st1        {v0.16b}, [%0], #16            \n"
   2433     "b.gt       25b                            \n"
   2434     "b          99f                            \n"
   2435 
   2436     // Blend 50 / 50.
   2437   "50:                                         \n"
   2438     MEMACCESS(1)
   2439     "ld1        {v0.16b}, [%1], #16            \n"
   2440     MEMACCESS(2)
   2441     "ld1        {v1.16b}, [%2], #16            \n"
   2442     "subs       %w3, %w3, #16                  \n"
   2443     "urhadd     v0.16b, v0.16b, v1.16b         \n"
   2444     MEMACCESS(0)
   2445     "st1        {v0.16b}, [%0], #16            \n"
   2446     "b.gt       50b                            \n"
   2447     "b          99f                            \n"
   2448 
   2449     // Blend 75 / 25.
   2450   "75:                                         \n"
   2451     MEMACCESS(1)
   2452     "ld1        {v1.16b}, [%1], #16            \n"  // src goes to v1 here (swapped).
   2453     MEMACCESS(2)
   2454     "ld1        {v0.16b}, [%2], #16            \n"
   2455     "subs       %w3, %w3, #16                  \n"
   2456     "urhadd     v0.16b, v0.16b, v1.16b         \n"  // 50 / 50
   2457     "urhadd     v0.16b, v0.16b, v1.16b         \n"  // averaged with src again -> 75 / 25
   2458     MEMACCESS(0)
   2459     "st1        {v0.16b}, [%0], #16            \n"
   2460     "b.gt       75b                            \n"
   2461     "b          99f                            \n"
   2462 
   2463     // Blend 100 / 0 - Copy row unchanged.
   2464   "100:                                        \n"
   2465     MEMACCESS(1)
   2466     "ld1        {v0.16b}, [%1], #16            \n"
   2467     "subs       %w3, %w3, #16                  \n"
   2468     MEMACCESS(0)
   2469     "st1        {v0.16b}, [%0], #16            \n"
   2470     "b.gt       100b                           \n"
   2471 
   2472   "99:                                         \n"
   2473   : "+r"(dst_ptr),          // %0
   2474     "+r"(src_ptr),          // %1
   2475     "+r"(src_ptr1),         // %2
   2476     "+r"(dst_width),        // %3
   2477     "+r"(y1_fraction),      // %4
   2478     "+r"(y0_fraction)       // %5
   2479   :
        // v2 is written by the general blend path, so it must be listed as clobbered.
   2480   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
   2481   );
   2482 }
   2483 #endif  // HAS_INTERPOLATEROW_NEON
   2484 
   2485 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
        // Blends a row of src_argb0 over src_argb1 using the alpha of src_argb0 and
        // writes the result to dst_argb with alpha forced to 255.  Per color channel:
        //   dst = sat(s + sat(d - ((d * sa) >> 8, rounded)))
        // where s and sa come from src_argb0 and d from src_argb1.
        //   src_argb0 - row supplying the added color and the alpha (v0-v3).
        //   src_argb1 - row that is scaled down by the alpha (v4-v6; its alpha in v7
        //               is loaded but unused).
        //   dst_argb  - destination row.
        //   width     - pixel count.  Groups of 8 are handled first, then the
        //               remaining 0..7 pixels one at a time, so any non-negative
        //               width is handled.
   2486 #ifdef HAS_ARGBBLENDROW_NEON
   2487 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
   2488                        uint8* dst_argb, int width) {
   2489   asm volatile (
   2490     "subs       %w3, %w3, #8                   \n"  // fewer than 8 pixels?
   2491     "b.lt       89f                            \n"  // then skip the 8-pixel loop.
   2492     // Blend 8 pixels.
   2493   "8:                                          \n"
   2494     MEMACCESS(0)
   2495     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
   2496     MEMACCESS(1)
   2497     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
   2498     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2499     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
   2500     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
   2501     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
   2502     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
   2503     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
   2504     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
   2505     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
   2506     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
   2507     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
   2508     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
   2509     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
   2510     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
   2511     "movi       v3.8b, #255                    \n"  // a = 255
   2512     MEMACCESS(2)
   2513     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2514     "b.ge       8b                             \n"  // loop while >= 8 pixels remain.
   2515 
   2516   "89:                                         \n"
   2517     "adds       %w3, %w3, #8-1                 \n"  // width is now (remaining - 1).
   2518     "b.lt       99f                            \n"  // nothing left.
   2519 
   2520     // Blend 1 pixels.
   2521   "1:                                          \n"
   2522     MEMACCESS(0)
   2523     "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
   2524     MEMACCESS(1)
   2525     "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
   2526     "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
   2527     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
   2528     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
   2529     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
   2530     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
   2531     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
   2532     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
   2533     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
   2534     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
   2535     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
   2536     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
   2537     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
   2538     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
   2539     "movi       v3.8b, #255                    \n"  // a = 255
   2540     MEMACCESS(2)
   2541     "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
   2542     "b.ge       1b                             \n"  // loop until the counter goes negative.
   2543 
   2544   "99:                                         \n"
   2545 
   2546   : "+r"(src_argb0),    // %0
   2547     "+r"(src_argb1),    // %1
   2548     "+r"(dst_argb),     // %2
   2549     "+r"(width)         // %3
   2550   :
   2551   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   2552     "v16", "v17", "v18"
   2553   );
   2554 }
   2555 #endif  // HAS_ARGBBLENDROW_NEON
   2556 
   2557 // Attenuate 8 pixels at a time.
        // Multiplies the B, G and R bytes of every pixel by that pixel's alpha with
        // a rounding shift, (c * a + 128) >> 8, and stores alpha unchanged.
        // The loop body runs before width is tested and handles 8 pixels (32 bytes)
        // per pass, so width is presumably a positive multiple of 8 - TODO confirm
        // against the callers.
   2558 #ifdef HAS_ARGBATTENUATEROW_NEON
   2559 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
   2560   asm volatile (
   2561     // Attenuate 8 pixels.
   2562   "1:                                          \n"
   2563     MEMACCESS(0)
   2564     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels, de-interleaved: v0=b v1=g v2=r v3=a
   2565     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2566     "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
   2567     "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
   2568     "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
   2569     "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8 (rounded, narrowed to 8 bit)
   2570     "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
   2571     "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
   2572     MEMACCESS(1)
   2573     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
   2574     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2575   : "+r"(src_argb),   // %0
   2576     "+r"(dst_argb),   // %1
   2577     "+r"(width)       // %2
   2578   :
   2579   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   2580   );
   2581 }
   2582 #endif  // HAS_ARGBATTENUATEROW_NEON
   2583 
   2584 // Quantize 8 ARGB pixels (32 bytes).
   2585 // dst = (dst * scale >> 16) * interval_size + interval_offset;
        // Operates in place on dst_argb: B, G and R are quantized, alpha is written
        // back unchanged.  scale, interval_size and interval_offset are broadcast
        // into 16 bit lanes, so only their low 16 bits take part in the math.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2586 #ifdef HAS_ARGBQUANTIZEROW_NEON
   2587 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
   2588                           int interval_offset, int width) {
   2589   asm volatile (
   2590     "dup        v4.8h, %w2                     \n"  // scale in every 16 bit lane.
   2591     "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1 (sqdmulh below doubles the product)
   2592     "dup        v5.8h, %w3                     \n"  // interval multiply.
   2593     "dup        v6.8h, %w4                     \n"  // interval add
   2594 
   2595     // 8 pixel loop.
   2596   "1:                                          \n"
   2597     MEMACCESS(0)
   2598     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB (no post-increment; the store advances %0).
   2599     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
   2600     "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
   2601     "uxtl       v1.8h, v1.8b                   \n"  // g widened to 16 bit
   2602     "uxtl       v2.8h, v2.8b                   \n"  // r widened to 16 bit
   2603     "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale (high half of doubled product)
   2604     "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
   2605     "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
   2606     "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
   2607     "mul        v1.8h, v1.8h, v5.8h            \n"  // g
   2608     "mul        v2.8h, v2.8h, v5.8h            \n"  // r
   2609     "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
   2610     "add        v1.8h, v1.8h, v6.8h            \n"  // g
   2611     "add        v2.8h, v2.8h, v6.8h            \n"  // r
   2612     "uqxtn      v0.8b, v0.8h                   \n"  // saturating narrow back to 8 bit b
   2613     "uqxtn      v1.8b, v1.8h                   \n"  // g
   2614     "uqxtn      v2.8b, v2.8h                   \n"  // r
   2615     MEMACCESS(0)
   2616     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
   2617     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2618   : "+r"(dst_argb),       // %0
   2619     "+r"(width)           // %1
   2620   : "r"(scale),           // %2
   2621     "r"(interval_size),   // %3
   2622     "r"(interval_offset)  // %4
   2623   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   2624   );
   2625 }
   2626 #endif  // HAS_ARGBQUANTIZEROW_NEON
   2627 
   2628 // Shade 8 pixels at a time by specified value.
   2629 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
   2630 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
        // (The two notes above quote ARMv7 mnemonics; the AArch64 code below uses
        // sqrdmulh with a by-element operand taken from v0.)
        // value packs one 8 bit scale per channel; its lowest byte scales the first
        // byte of each pixel in memory.  Each scale byte is duplicated into both
        // halves of a 16 bit lane and halved, then every channel is multiplied by
        // its lane with a rounding doubling multiply-high and narrowed back to 8 bit
        // with unsigned saturation.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2631 #ifdef HAS_ARGBSHADEROW_NEON
   2632 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
   2633                        uint32 value) {
   2634   asm volatile (
   2635     "dup        v0.4s, %w3                     \n"  // duplicate scale value.
   2636     "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
   2637     "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
   2638 
   2639     // 8 pixel loop.
   2640   "1:                                          \n"
   2641     MEMACCESS(0)
   2642     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2643     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2644     "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
   2645     "uxtl       v5.8h, v5.8b                   \n"  // g
   2646     "uxtl       v6.8h, v6.8b                   \n"  // r
   2647     "uxtl       v7.8h, v7.8b                   \n"  // a
   2648     "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
   2649     "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
   2650     "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
   2651     "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
   2652     "uqxtn      v4.8b, v4.8h                   \n"  // saturating narrow back to 8 bit b
   2653     "uqxtn      v5.8b, v5.8h                   \n"  // g
   2654     "uqxtn      v6.8b, v6.8h                   \n"  // r
   2655     "uqxtn      v7.8b, v7.8h                   \n"  // a
   2656     MEMACCESS(1)
   2657     "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
   2658     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2659   : "+r"(src_argb),       // %0
   2660     "+r"(dst_argb),       // %1
   2661     "+r"(width)           // %2
   2662   : "r"(value)            // %3
   2663   : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
   2664   );
   2665 }
   2666 #endif  // HAS_ARGBSHADEROW_NEON
   2667 
   2668 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
   2669 // Similar to ARGBToYJ but stores ARGB.
   2670 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
        // The gray value is written to B, G and R; alpha is stored unchanged.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2671 #ifdef HAS_ARGBGRAYROW_NEON
   2672 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
   2673   asm volatile (
   2674     "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
   2675     "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
   2676     "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
   2677   "1:                                          \n"
   2678     MEMACCESS(0)
   2679     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2680     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2681     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
   2682     "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
   2683     "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
   2684     "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B (rounding shift supplies the + 64)
   2685     "orr        v1.8b, v0.8b, v0.8b            \n"  // G
   2686     "orr        v2.8b, v0.8b, v0.8b            \n"  // R
   2687     MEMACCESS(1)
   2688     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
   2689     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2690   : "+r"(src_argb),  // %0
   2691     "+r"(dst_argb),  // %1
   2692     "+r"(width)      // %2
   2693   :
   2694   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
   2695   );
   2696 }
   2697 #endif  // HAS_ARGBGRAYROW_NEON
   2698 
   2699 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
   2700 //    b = (r * 35 + g * 68 + b * 17) >> 7
   2701 //    g = (r * 45 + g * 88 + b * 22) >> 7
   2702 //    r = (r * 50 + g * 98 + b * 24) >> 7
        // Operates in place on dst_argb.  The shift truncates (no rounding) and the
        // narrowing saturates to 255; alpha is written back unchanged.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2703 
   2704 #ifdef HAS_ARGBSEPIAROW_NEON
   2705 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
   2706   asm volatile (
   2707     "movi       v20.8b, #17                    \n"  // BB coefficient
   2708     "movi       v21.8b, #68                    \n"  // BG coefficient
   2709     "movi       v22.8b, #35                    \n"  // BR coefficient
   2710     "movi       v24.8b, #22                    \n"  // GB coefficient
   2711     "movi       v25.8b, #88                    \n"  // GG coefficient
   2712     "movi       v26.8b, #45                    \n"  // GR coefficient
   2713     "movi       v28.8b, #24                    \n"  // RB coefficient
   2714     "movi       v29.8b, #98                    \n"  // RG coefficient
   2715     "movi       v30.8b, #50                    \n"  // RR coefficient
   2716   "1:                                          \n"
   2717     MEMACCESS(0)
   2718     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels (no post-increment; the store advances %0).
   2719     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
   2720     "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
   2721     "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
   2722     "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
   2723     "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
   2724     "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
   2725     "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
   2726     "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
   2727     "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
   2728     "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
   2729     "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
   2730     "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
   2731     "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
   2732     MEMACCESS(0)
   2733     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
   2734     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2735   : "+r"(dst_argb),  // %0
   2736     "+r"(width)      // %1
   2737   :
   2738   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   2739     "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
   2740   );
   2741 }
   2742 #endif  // HAS_ARGBSEPIAROW_NEON
   2743 
   2744 // Transform 8 ARGB pixels (32 bytes) with color matrix.
   2745 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
   2746 // needs to saturate.  Consider doing a non-saturating version.
        // matrix_argb supplies 16 signed 8 bit coefficients, read as four groups of
        // four: entries 0..3 produce B, 4..7 produce G, 8..11 produce R and 12..15
        // produce A.  Within a group the coefficients multiply the source b, g, r
        // and a in that order.  The products are summed with saturating 16 bit adds,
        // shifted right by 6 (so a coefficient of 64 corresponds to 1.0) and
        // narrowed to 8 bit with unsigned saturation.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2747 #ifdef HAS_ARGBCOLORMATRIXROW_NEON
   2748 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
   2749                              const int8* matrix_argb, int width) {
   2750   asm volatile (
   2751     MEMACCESS(3)
   2752     "ld1        {v2.16b}, [%3]                 \n"  // load 16 coefficients (4 groups of 4).
   2753     "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
   2754     "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
   2755 
   2756   "1:                                          \n"
   2757     MEMACCESS(0)
   2758     "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
   2759     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2760     "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
   2761     "uxtl       v17.8h, v17.8b                 \n"  // g
   2762     "uxtl       v18.8h, v18.8b                 \n"  // r
   2763     "uxtl       v19.8h, v19.8b                 \n"  // a
   2764     "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
   2765     "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
   2766     "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
   2767     "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
   2768     "mul        v4.8h, v17.8h, v0.h[1]         \n"  // tmp = G * Matrix B
   2769     "mul        v5.8h, v17.8h, v0.h[5]         \n"  // tmp = G * Matrix G
   2770     "mul        v6.8h, v17.8h, v1.h[1]         \n"  // tmp = G * Matrix R
   2771     "mul        v7.8h, v17.8h, v1.h[5]         \n"  // tmp = G * Matrix A
   2772     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
   2773     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
   2774     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
   2775     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
   2776     "mul        v4.8h, v18.8h, v0.h[2]         \n"  // tmp = R * Matrix B
   2777     "mul        v5.8h, v18.8h, v0.h[6]         \n"  // tmp = R * Matrix G
   2778     "mul        v6.8h, v18.8h, v1.h[2]         \n"  // tmp = R * Matrix R
   2779     "mul        v7.8h, v18.8h, v1.h[6]         \n"  // tmp = R * Matrix A
   2780     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
   2781     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
   2782     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
   2783     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
   2784     "mul        v4.8h, v19.8h, v0.h[3]         \n"  // tmp = A * Matrix B
   2785     "mul        v5.8h, v19.8h, v0.h[7]         \n"  // tmp = A * Matrix G
   2786     "mul        v6.8h, v19.8h, v1.h[3]         \n"  // tmp = A * Matrix R
   2787     "mul        v7.8h, v19.8h, v1.h[7]         \n"  // tmp = A * Matrix A
   2788     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
   2789     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
   2790     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
   2791     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
   2792     "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
   2793     "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
   2794     "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
   2795     "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
   2796     MEMACCESS(1)
   2797     "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
   2798     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2799   : "+r"(src_argb),   // %0
   2800     "+r"(dst_argb),   // %1
   2801     "+r"(width)       // %2
   2802   : "r"(matrix_argb)  // %3
   2803   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
   2804     "v18", "v19", "v22", "v23", "v24", "v25"
   2805   );
   2806 }
   2807 #endif  // HAS_ARGBCOLORMATRIXROW_NEON
   2808 
   2809 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
   2810 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
        // Every channel, alpha included, becomes (src0 * src1 + 128) >> 8.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2811 #ifdef HAS_ARGBMULTIPLYROW_NEON
   2812 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
   2813                           uint8* dst_argb, int width) {
   2814   asm volatile (
   2815     // 8 pixel loop.
   2816   "1:                                          \n"
   2817     MEMACCESS(0)
   2818     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2819     MEMACCESS(1)
   2820     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
   2821     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2822     "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
   2823     "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
   2824     "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
   2825     "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
   2826     "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B (rounded)
   2827     "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
   2828     "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
   2829     "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
   2830     MEMACCESS(2)
   2831     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2832     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2833 
   2834   : "+r"(src_argb0),  // %0
   2835     "+r"(src_argb1),  // %1
   2836     "+r"(dst_argb),   // %2
   2837     "+r"(width)       // %3
   2838   :
   2839   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   2840   );
   2841 }
   2842 #endif  // HAS_ARGBMULTIPLYROW_NEON
   2843 
   2844 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
        // Every channel, alpha included, is added with unsigned saturation, so sums
        // above 255 clamp to 255.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2845 #ifdef HAS_ARGBADDROW_NEON
   2846 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
   2847                      uint8* dst_argb, int width) {
   2848   asm volatile (
   2849     // 8 pixel loop.
   2850   "1:                                          \n"
   2851     MEMACCESS(0)
   2852     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2853     MEMACCESS(1)
   2854     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
   2855     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2856     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // first channel, saturating add
   2857     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // second channel
   2858     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // third channel
   2859     "uqadd      v3.8b, v3.8b, v7.8b            \n"  // fourth channel
   2860     MEMACCESS(2)
   2861     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2862     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2863 
   2864   : "+r"(src_argb0),  // %0
   2865     "+r"(src_argb1),  // %1
   2866     "+r"(dst_argb),   // %2
   2867     "+r"(width)       // %3
   2868   :
   2869   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   2870   );
   2871 }
   2872 #endif  // HAS_ARGBADDROW_NEON
   2873 
   2874 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
        // Computes src_argb0 - src_argb1 for every channel, alpha included, with
        // unsigned saturation, so negative differences clamp to 0.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2875 #ifdef HAS_ARGBSUBTRACTROW_NEON
   2876 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
   2877                           uint8* dst_argb, int width) {
   2878   asm volatile (
   2879     // 8 pixel loop.
   2880   "1:                                          \n"
   2881     MEMACCESS(0)
   2882     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2883     MEMACCESS(1)
   2884     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
   2885     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2886     "uqsub      v0.8b, v0.8b, v4.8b            \n"  // first channel, saturating subtract
   2887     "uqsub      v1.8b, v1.8b, v5.8b            \n"  // second channel
   2888     "uqsub      v2.8b, v2.8b, v6.8b            \n"  // third channel
   2889     "uqsub      v3.8b, v3.8b, v7.8b            \n"  // fourth channel
   2890     MEMACCESS(2)
   2891     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2892     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2893 
   2894   : "+r"(src_argb0),  // %0
   2895     "+r"(src_argb1),  // %1
   2896     "+r"(dst_argb),   // %2
   2897     "+r"(width)       // %3
   2898   :
   2899   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   2900   );
   2901 }
   2902 #endif  // HAS_ARGBSUBTRACTROW_NEON
   2903 
   2904 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
   2905 // A = 255
   2906 // R = Sobel
   2907 // G = Sobel
   2908 // B = Sobel
        // Sobel is the saturating 8 bit sum of the two inputs.  Each pass reads 8
        // bytes from each source and writes 8 ARGB pixels (32 bytes).
        // The loop body runs before width is tested, so width is presumably a
        // positive multiple of 8 - TODO confirm.
   2909 #ifdef HAS_SOBELROW_NEON
   2910 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
   2911                      uint8* dst_argb, int width) {
   2912   asm volatile (
   2913     "movi       v3.8b, #255                    \n"  // alpha
   2914     // 8 pixel loop.
   2915   "1:                                          \n"
   2916     MEMACCESS(0)
   2917     "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
   2918     MEMACCESS(1)
   2919     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
   2920     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2921     "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
   2922     "orr        v1.8b, v0.8b, v0.8b            \n"  // copy sum to G
   2923     "orr        v2.8b, v0.8b, v0.8b            \n"  // copy sum to R
   2924     MEMACCESS(2)
   2925     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2926     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2927   : "+r"(src_sobelx),  // %0
   2928     "+r"(src_sobely),  // %1
   2929     "+r"(dst_argb),    // %2
   2930     "+r"(width)        // %3
   2931   :
   2932   : "cc", "memory", "v0", "v1", "v2", "v3"
   2933   );
   2934 }
   2935 #endif  // HAS_SOBELROW_NEON
   2936 
   2937 // Adds Sobel X and Sobel Y and stores Sobel into plane.
        // dst_y receives the saturating 8 bit sum of the two inputs, one byte per
        // pixel.  The loop body runs before width is tested and handles 16 pixels
        // per pass, so width is presumably a positive multiple of 16 - TODO confirm.
   2938 #ifdef HAS_SOBELTOPLANEROW_NEON
   2939 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
   2940                           uint8* dst_y, int width) {
   2941   asm volatile (
   2942     // 16 pixel loop.
   2943   "1:                                          \n"
   2944     MEMACCESS(0)
   2945     "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
   2946     MEMACCESS(1)
   2947     "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
   2948     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
   2949     "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
   2950     MEMACCESS(2)
   2951     "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
   2952     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2953   : "+r"(src_sobelx),  // %0
   2954     "+r"(src_sobely),  // %1
   2955     "+r"(dst_y),       // %2
   2956     "+r"(width)        // %3
   2957   :
   2958   : "cc", "memory", "v0", "v1"
   2959   );
   2960 }
   2961 #endif  // HAS_SOBELTOPLANEROW_NEON
   2962 
   2963 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
   2964 // A = 255
   2965 // R = Sobel X
   2966 // G = Sobel
   2967 // B = Sobel Y
        // Sobel is the saturating 8 bit sum of the two inputs.  The inputs are
        // loaded straight into the registers that st4 stores as R (v2) and B (v0),
        // so only G needs to be computed.
        // The loop body runs before width is tested and handles 8 pixels per pass,
        // so width is presumably a positive multiple of 8 - TODO confirm.
   2968 #ifdef HAS_SOBELXYROW_NEON
   2969 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
   2970                      uint8* dst_argb, int width) {
   2971   asm volatile (
   2972     "movi       v3.8b, #255                    \n"  // alpha
   2973     // 8 pixel loop.
   2974   "1:                                          \n"
   2975     MEMACCESS(0)
   2976     "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
   2977     MEMACCESS(1)
   2978     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
   2979     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2980     "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
   2981     MEMACCESS(2)
   2982     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2983     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   2984   : "+r"(src_sobelx),  // %0
   2985     "+r"(src_sobely),  // %1
   2986     "+r"(dst_argb),    // %2
   2987     "+r"(width)        // %3
   2988   :
   2989   : "cc", "memory", "v0", "v1", "v2", "v3"
   2990   );
   2991 }
   2992 #endif  // HAS_SOBELXYROW_NEON
   2993 
   2994 // SobelX as a matrix is
   2995 // -1  0  1
   2996 // -2  0  2
   2997 // -1  0  1
        // src_y0, src_y1 and src_y2 are the three input rows.  For each row the
        // code takes pixel[i] - pixel[i + 2], weights the rows 1, 2, 1, and stores
        // the absolute value of the sum saturated to 8 bit, so the sign convention
        // of the matrix does not affect the result.
        // Each row pointer advances by 2 and then by 6, i.e. 8 pixels per pass.
        // The second load of a pass reads 8 bytes starting 2 past the first, so
        // each row is read up to 2 bytes beyond the 8 pixels being produced.
        // The loop body runs before width is tested, so width is presumably a
        // positive multiple of 8 - TODO confirm.
   2998 #ifdef HAS_SOBELXROW_NEON
   2999 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
   3000                     const uint8* src_y2, uint8* dst_sobelx, int width) {
   3001   asm volatile (
   3002   "1:                                          \n"
   3003     MEMACCESS(0)
   3004     "ld1        {v0.8b}, [%0],%5               \n"  // top
   3005     MEMACCESS(0)
   3006     "ld1        {v1.8b}, [%0],%6               \n"  // top, 2 pixels to the right
   3007     "usubl      v0.8h, v0.8b, v1.8b            \n"  // widening difference, 16 bit
   3008     MEMACCESS(1)
   3009     "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
   3010     MEMACCESS(1)
   3011     "ld1        {v3.8b}, [%1],%6               \n"  // center, 2 pixels to the right
   3012     "usubl      v1.8h, v2.8b, v3.8b            \n"  // center difference
   3013     "add        v0.8h, v0.8h, v1.8h            \n"  // add center difference once
   3014     "add        v0.8h, v0.8h, v1.8h            \n"  // and again for the weight of 2
   3015     MEMACCESS(2)
   3016     "ld1        {v2.8b}, [%2],%5               \n"  // bottom
   3017     MEMACCESS(2)
   3018     "ld1        {v3.8b}, [%2],%6               \n"  // bottom, 2 pixels to the right
   3019     "subs       %w4, %w4, #8                   \n"  // 8 pixels
   3020     "usubl      v1.8h, v2.8b, v3.8b            \n"  // bottom difference
   3021     "add        v0.8h, v0.8h, v1.8h            \n"  // total of the three rows
   3022     "abs        v0.8h, v0.8h                   \n"  // magnitude
   3023     "uqxtn      v0.8b, v0.8h                   \n"  // saturate to 8 bit
   3024     MEMACCESS(3)
   3025     "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
   3026     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   3027   : "+r"(src_y0),      // %0
   3028     "+r"(src_y1),      // %1
   3029     "+r"(src_y2),      // %2
   3030     "+r"(dst_sobelx),  // %3
   3031     "+r"(width)        // %4
   3032   : "r"(2LL),          // %5
   3033     "r"(6LL)           // %6
   3034   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   3035   );
   3036 }
   3037 #endif  // HAS_SOBELXROW_NEON
   3038 
   3039 // SobelY as a matrix is
   3040 // -1 -2 -1
   3041 //  0  0  0
   3042 //  1  2  1
        // src_y0 and src_y1 are the two input rows that are differenced.  For
        // column offsets 0, 1 and 2 the code takes src_y0[i + k] - src_y1[i + k],
        // weights them 1, 2, 1, and stores the absolute value of the sum saturated
        // to 8 bit, so the sign convention of the matrix does not affect the result.
        // Each row pointer advances by 1, 1 and then 6, i.e. 8 pixels per pass.
        // The third load of a pass reads 8 bytes starting 2 past the first, so
        // each row is read up to 2 bytes beyond the 8 pixels being produced.
        // The loop body runs before width is tested, so width is presumably a
        // positive multiple of 8 - TODO confirm.
   3043 #ifdef HAS_SOBELYROW_NEON
   3044 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
   3045                     uint8* dst_sobely, int width) {
   3046   asm volatile (
   3047   "1:                                          \n"
   3048     MEMACCESS(0)
   3049     "ld1        {v0.8b}, [%0],%4               \n"  // left
   3050     MEMACCESS(1)
   3051     "ld1        {v1.8b}, [%1],%4               \n"  // left, second row
   3052     "usubl      v0.8h, v0.8b, v1.8b            \n"  // widening difference, 16 bit
   3053     MEMACCESS(0)
   3054     "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
   3055     MEMACCESS(1)
   3056     "ld1        {v3.8b}, [%1],%4               \n"  // center, second row
   3057     "usubl      v1.8h, v2.8b, v3.8b            \n"  // center difference
   3058     "add        v0.8h, v0.8h, v1.8h            \n"  // add center difference once
   3059     "add        v0.8h, v0.8h, v1.8h            \n"  // and again for the weight of 2
   3060     MEMACCESS(0)
   3061     "ld1        {v2.8b}, [%0],%5               \n"  // right
   3062     MEMACCESS(1)
   3063     "ld1        {v3.8b}, [%1],%5               \n"  // right, second row
   3064     "subs       %w3, %w3, #8                   \n"  // 8 pixels
   3065     "usubl      v1.8h, v2.8b, v3.8b            \n"  // right difference
   3066     "add        v0.8h, v0.8h, v1.8h            \n"  // total of the three columns
   3067     "abs        v0.8h, v0.8h                   \n"  // magnitude
   3068     "uqxtn      v0.8b, v0.8h                   \n"  // saturate to 8 bit
   3069     MEMACCESS(2)
   3070     "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
   3071     "b.gt       1b                             \n"  // loop while width > 0 (flags from subs).
   3072   : "+r"(src_y0),      // %0
   3073     "+r"(src_y1),      // %1
   3074     "+r"(dst_sobely),  // %2
   3075     "+r"(width)        // %3
   3076   : "r"(1LL),          // %4
   3077     "r"(6LL)           // %5
   3078   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   3079   );
   3080 }
   3081 #endif  // HAS_SOBELYROW_NEON
   3082 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
   3083 
   3084 #ifdef __cplusplus
   3085 }  // extern "C"
   3086 }  // namespace libyuv
   3087 #endif
   3088