Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 
     13 #ifdef __cplusplus
     14 namespace libyuv {
     15 extern "C" {
     16 #endif
     17 
     18 // This module is for GCC Neon armv8 64 bit.
     19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     20 
// Read 8 Y, 4 U and 4 V from 422.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each pointer is
// post-incremented by the number of bytes read (8, 4 and 4).
// Result: v0.8b = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V.
#define READYUV422 \
  MEMACCESS(0)     \
  "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v1.s}[0], [%1], #4            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v1.s}[1], [%2], #4            \n"
     29 
// Read 8 Y, 8 U and 8 V from 444.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each pointer is
// post-incremented by 8.
// The 8 U bytes land in v1.d[0] and the 8 V bytes in v1.d[1]. Adjacent byte
// pairs are then summed (uaddlp) and halved with rounding (rshrn #1), so the
// result is v0.8b = 8 Y, v1.s[0] = 4 averaged U, v1.s[1] = 4 averaged V.
#define READYUV444 \
  MEMACCESS(0)     \
  "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v1.d}[0], [%1], #8            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v1.d}[1], [%2], #8            \n"                             \
    "uaddlp     v1.8h, v1.16b                  \n"                             \
    "rshrn      v1.8b, v1.8h, #1               \n"
     40 
// Read 8 Y, and set 4 U and 4 V to 128.
// Operand: %0 = Y pointer, post-incremented by 8.
// Result: v0.8b = 8 Y, every byte of v1.8b = 128.
#define READYUV400                               \
  MEMACCESS(0)                                   \
  "ld1        {v0.8b}, [%0], #8              \n" \
  "movi       v1.8b , #128                   \n"
     46 
// Read 8 Y and 4 UV from NV12.
// Operands: %0 = Y pointer, %1 = interleaved UV pointer; both are
// post-incremented by 8.
// v2 receives the 8 interleaved bytes; uzp1 gathers the even bytes (U) into
// v1 and uzp2 gathers the odd bytes (V) into v3; the final ins copies the V
// word into the upper half of v1.
// Result: v0.8b = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V. v2 and v3 are scratch.
#define READNV12 \
  MEMACCESS(0)   \
  "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.8b}, [%1], #8              \n"                             \
    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
     56 
// Read 8 Y and 4 VU from NV21.
// Operands: %0 = Y pointer, %1 = interleaved VU pointer; both are
// post-incremented by 8.
// v2 receives the 8 interleaved bytes; uzp1 gathers the even bytes (V) into
// v3 and uzp2 gathers the odd bytes (U) into v1; the final ins copies the V
// word into the upper half of v1.
// Result: v0.8b = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V. v2 and v3 are scratch.
#define READNV21 \
  MEMACCESS(0)   \
  "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.8b}, [%1], #8              \n"                             \
    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
     66 
// Read 8 YUY2 pixels (16 bytes).
// Operand: %0 = source pointer, post-incremented by 16.
// ld2 de-interleaves the bytes: even bytes (Y) go to v0, odd bytes
// (alternating U, V) go to v1. uzp2/uzp1 then separate V (into v3) and U
// (into v1), and ins copies the V word into the upper half of v1.
// Result: v0.8b = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V. v3 is scratch.
#define READYUY2                                 \
  MEMACCESS(0)                                   \
  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" \
  "uzp2       v3.8b, v1.8b, v1.8b            \n" \
  "uzp1       v1.8b, v1.8b, v1.8b            \n" \
  "ins        v1.s[1], v3.s[0]               \n"
     74 
// Read 8 UYVY pixels (16 bytes).
// Operand: %0 = source pointer, post-incremented by 16.
// ld2 de-interleaves the bytes: even bytes (alternating U, V) go to v2, odd
// bytes (Y) go to v3. Y is copied to v0 (orr with itself is a register move),
// uzp1/uzp2 separate U (into v1) and V (into v3), and ins copies the V word
// into the upper half of v1.
// Result: v0.8b = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V. v2 and v3 are scratch.
#define READUYVY                                 \
  MEMACCESS(0)                                   \
  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" \
  "orr        v0.8b, v3.8b, v3.8b            \n" \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
  "ins        v1.s[1], v3.s[0]               \n"
     83 
     84 #define YUVTORGB_SETUP                           \
     85   "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n" \
     86   "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n" \
     87   "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n" \
     88   "ld1r       {v31.4s}, [%[kYToRgb]]         \n" \
     89   "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
     90   "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
     91 
// Convert 8 pixels of YUV to 8-bit R, G and B.
// Inputs:
//   v0.8b        8 Y values.
//   v1.s[0]      4 U values, v1.s[1] 4 V values (each chroma sample is
//                replicated below so that it covers two adjacent pixels).
//   v24..v31     constants loaded by YUVTORGB_SETUP.
// Outputs: vR.8b, vG.8b, vB.8b (the macro arguments name the registers).
// Scratch: v0, v1, v2, v3, v5, v6, v7.
// Steps:
//   1. Y is widened to 32 bits, multiplied by v31 and narrowed back to 16
//      bits with a saturating shift right by 16.
//   2. shll #8 followed by uaddw duplicates every chroma byte, giving 8 U
//      bytes in v1.d[0] and 8 V bytes in v1.d[1]; both are widened to 16 bits.
//   3. B = v24 + Y + U * v27
//      G = v25 + Y - (U * v29 + V * v30)
//      R = v26 + Y + V * v28        (all with saturating 16-bit arithmetic)
//   4. Each channel is shifted right by 6 and narrowed to 8 bits with
//      unsigned saturation.
#define YUVTORGB(vR, vG, vB)                                        \
  "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */ \
  "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */ \
  "ushll2     v3.4s, v0.8h, #0               \n" /* Y */            \
  "ushll      v0.4s, v0.4h, #0               \n"                    \
  "mul        v3.4s, v3.4s, v31.4s           \n"                    \
  "mul        v0.4s, v0.4s, v31.4s           \n"                    \
  "sqshrun    v0.4h, v0.4s, #16              \n"                    \
  "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */            \
  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */ \
  "mov        v2.d[0], v1.d[1]               \n" /* Extract V */    \
  "uxtl       v2.8h, v2.8b                   \n"                    \
  "uxtl       v1.8h, v1.8b                   \n" /* Extract U */    \
  "mul        v3.8h, v1.8h, v27.8h           \n"                    \
  "mul        v5.8h, v1.8h, v29.8h           \n"                    \
  "mul        v6.8h, v2.8h, v30.8h           \n"                    \
  "mul        v7.8h, v2.8h, v28.8h           \n"                    \
  "sqadd      v6.8h, v6.8h, v5.8h            \n"                    \
  "sqadd      " #vB                                                 \
  ".8h, v24.8h, v0.8h      \n" /* B */                              \
  "sqadd      " #vG                                                 \
  ".8h, v25.8h, v0.8h      \n" /* G */                              \
  "sqadd      " #vR                                                 \
  ".8h, v26.8h, v0.8h      \n" /* R */                              \
  "sqadd      " #vB ".8h, " #vB                                     \
  ".8h, v3.8h  \n" /* B */                                          \
  "sqsub      " #vG ".8h, " #vG                                     \
  ".8h, v6.8h  \n" /* G */                                          \
  "sqadd      " #vR ".8h, " #vR                                     \
  ".8h, v7.8h  \n" /* R */                                          \
  "sqshrun    " #vB ".8b, " #vB                                     \
  ".8h, #6     \n" /* B */                                          \
  "sqshrun    " #vG ".8b, " #vG                                     \
  ".8h, #6     \n"                               /* G */            \
  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */
    127 
    128 void I444ToARGBRow_NEON(const uint8* src_y,
    129                         const uint8* src_u,
    130                         const uint8* src_v,
    131                         uint8* dst_argb,
    132                         const struct YuvConstants* yuvconstants,
    133                         int width) {
    134   asm volatile (
    135     YUVTORGB_SETUP
    136     "movi       v23.8b, #255                   \n" /* A */
    137   "1:                                          \n"
    138     READYUV444
    139     YUVTORGB(v22, v21, v20)
    140     "subs       %w4, %w4, #8                   \n"
    141     MEMACCESS(3)
    142     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    143     "b.gt       1b                             \n"
    144     : "+r"(src_y),     // %0
    145       "+r"(src_u),     // %1
    146       "+r"(src_v),     // %2
    147       "+r"(dst_argb),  // %3
    148       "+r"(width)      // %4
    149     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    150       [kUVToG]"r"(&yuvconstants->kUVToG),
    151       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    152       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    153     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    154       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    155   );
    156 }
    157 
    158 void I422ToARGBRow_NEON(const uint8* src_y,
    159                         const uint8* src_u,
    160                         const uint8* src_v,
    161                         uint8* dst_argb,
    162                         const struct YuvConstants* yuvconstants,
    163                         int width) {
    164   asm volatile (
    165     YUVTORGB_SETUP
    166     "movi       v23.8b, #255                   \n" /* A */
    167   "1:                                          \n"
    168     READYUV422
    169     YUVTORGB(v22, v21, v20)
    170     "subs       %w4, %w4, #8                   \n"
    171     MEMACCESS(3)
    172     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    173     "b.gt       1b                             \n"
    174     : "+r"(src_y),     // %0
    175       "+r"(src_u),     // %1
    176       "+r"(src_v),     // %2
    177       "+r"(dst_argb),  // %3
    178       "+r"(width)      // %4
    179     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    180       [kUVToG]"r"(&yuvconstants->kUVToG),
    181       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    182       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    183     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    184       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    185   );
    186 }
    187 
    188 void I422AlphaToARGBRow_NEON(const uint8* src_y,
    189                              const uint8* src_u,
    190                              const uint8* src_v,
    191                              const uint8* src_a,
    192                              uint8* dst_argb,
    193                              const struct YuvConstants* yuvconstants,
    194                              int width) {
    195   asm volatile (
    196     YUVTORGB_SETUP
    197   "1:                                          \n"
    198     READYUV422
    199     YUVTORGB(v22, v21, v20)
    200     MEMACCESS(3)
    201     "ld1        {v23.8b}, [%3], #8             \n"
    202     "subs       %w5, %w5, #8                   \n"
    203     MEMACCESS(4)
    204     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
    205     "b.gt       1b                             \n"
    206     : "+r"(src_y),     // %0
    207       "+r"(src_u),     // %1
    208       "+r"(src_v),     // %2
    209       "+r"(src_a),     // %3
    210       "+r"(dst_argb),  // %4
    211       "+r"(width)      // %5
    212     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    213       [kUVToG]"r"(&yuvconstants->kUVToG),
    214       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    215       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    216     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    217       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    218   );
    219 }
    220 
    221 void I422ToRGBARow_NEON(const uint8* src_y,
    222                         const uint8* src_u,
    223                         const uint8* src_v,
    224                         uint8* dst_rgba,
    225                         const struct YuvConstants* yuvconstants,
    226                         int width) {
    227   asm volatile (
    228     YUVTORGB_SETUP
    229     "movi       v20.8b, #255                   \n" /* A */
    230   "1:                                          \n"
    231     READYUV422
    232     YUVTORGB(v23, v22, v21)
    233     "subs       %w4, %w4, #8                   \n"
    234     MEMACCESS(3)
    235     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    236     "b.gt       1b                             \n"
    237     : "+r"(src_y),     // %0
    238       "+r"(src_u),     // %1
    239       "+r"(src_v),     // %2
    240       "+r"(dst_rgba),  // %3
    241       "+r"(width)      // %4
    242     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    243       [kUVToG]"r"(&yuvconstants->kUVToG),
    244       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    245       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    246     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    247       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    248   );
    249 }
    250 
    251 void I422ToRGB24Row_NEON(const uint8* src_y,
    252                          const uint8* src_u,
    253                          const uint8* src_v,
    254                          uint8* dst_rgb24,
    255                          const struct YuvConstants* yuvconstants,
    256                          int width) {
    257   asm volatile (
    258     YUVTORGB_SETUP
    259   "1:                                          \n"
    260     READYUV422
    261     YUVTORGB(v22, v21, v20)
    262     "subs       %w4, %w4, #8                   \n"
    263     MEMACCESS(3)
    264     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
    265     "b.gt       1b                             \n"
    266     : "+r"(src_y),     // %0
    267       "+r"(src_u),     // %1
    268       "+r"(src_v),     // %2
    269       "+r"(dst_rgb24), // %3
    270       "+r"(width)      // %4
    271     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    272       [kUVToG]"r"(&yuvconstants->kUVToG),
    273       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    274       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    275     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    276       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    277   );
    278 }
    279 
// Pack 8 pixels into RGB565.
// Input:  v20.8b = B, v21.8b = G, v22.8b = R.
// Output: v0.8h, one 16-bit pixel per lane with R in bits 15..11, G in bits
//         10..5 and B in bits 4..0.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// sri then shifts G and B right and inserts them below the bits already in
// place. v20 and v21 are overwritten with their widened values.
#define ARGBTORGB565                                                        \
  "shll       v0.8h,  v22.8b, #8             \n" /* R                    */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
  "sri        v0.8h,  v21.8h, #5             \n" /* RG                   */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* RGB                  */
    286 
    287 void I422ToRGB565Row_NEON(const uint8* src_y,
    288                           const uint8* src_u,
    289                           const uint8* src_v,
    290                           uint8* dst_rgb565,
    291                           const struct YuvConstants* yuvconstants,
    292                           int width) {
    293   asm volatile (
    294     YUVTORGB_SETUP
    295   "1:                                          \n"
    296     READYUV422
    297     YUVTORGB(v22, v21, v20)
    298     "subs       %w4, %w4, #8                   \n"
    299     ARGBTORGB565
    300     MEMACCESS(3)
    301     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
    302     "b.gt       1b                             \n"
    303     : "+r"(src_y),    // %0
    304       "+r"(src_u),    // %1
    305       "+r"(src_v),    // %2
    306       "+r"(dst_rgb565),  // %3
    307       "+r"(width)     // %4
    308     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    309       [kUVToG]"r"(&yuvconstants->kUVToG),
    310       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    311       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    312     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    313       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    314   );
    315 }
    316 
// Pack 8 pixels into ARGB1555.
// Input:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
// Output: v0.8h, one 16-bit pixel per lane with A in bit 15, R in bits
//         14..10, G in bits 9..5 and B in bits 4..0.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// sri then shifts R, G and B right and inserts them below the bits already in
// place. v20, v21 and v22 are overwritten with their widened values; v23 is
// only read.
#define ARGBTOARGB1555                                                      \
  "shll       v0.8h,  v23.8b, #8             \n" /* A                    */ \
  "shll       v22.8h, v22.8b, #8             \n" /* R                    */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
  "sri        v0.8h,  v22.8h, #1             \n" /* AR                   */ \
  "sri        v0.8h,  v21.8h, #6             \n" /* ARG                  */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* ARGB                 */
    325 
    326 void I422ToARGB1555Row_NEON(const uint8* src_y,
    327                             const uint8* src_u,
    328                             const uint8* src_v,
    329                             uint8* dst_argb1555,
    330                             const struct YuvConstants* yuvconstants,
    331                             int width) {
    332   asm volatile (
    333     YUVTORGB_SETUP
    334     "movi       v23.8b, #255                   \n"
    335   "1:                                          \n"
    336     READYUV422
    337     YUVTORGB(v22, v21, v20)
    338     "subs       %w4, %w4, #8                   \n"
    339     ARGBTOARGB1555
    340     MEMACCESS(3)
    341     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
    342     "b.gt       1b                             \n"
    343     : "+r"(src_y),    // %0
    344       "+r"(src_u),    // %1
    345       "+r"(src_v),    // %2
    346       "+r"(dst_argb1555),  // %3
    347       "+r"(width)     // %4
    348     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    349       [kUVToG]"r"(&yuvconstants->kUVToG),
    350       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    351       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    352     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    353       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    354   );
    355 }
    356 
// Pack 8 pixels into ARGB4444.
// Output: v0.16b holds 8 two-byte pixels. The first byte of each pixel has G
// in its high nibble and B in its low nibble; the second byte has A in its
// high nibble and R in its low nibble.
// B and R are shifted down into the low nibble (ushr #4); G and A keep only
// their high nibble (bic with the 0x0f mask in v4). zip1 interleaves the BG
// bytes in v0 with the RA bytes in v1.
// v20..v23 are overwritten, so a constant alpha in v23 has to be reloaded
// before every use of this macro. v1 is scratch.
#define ARGBTOARGB4444                                                       \
  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
  "ushr       v20.8b, v20.8b, #4             \n" /* B                    */  \
  "bic        v21.8b, v21.8b, v4.8b          \n" /* G                    */  \
  "ushr       v22.8b, v22.8b, #4             \n" /* R                    */  \
  "bic        v23.8b, v23.8b, v4.8b          \n" /* A                    */  \
  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG                   */  \
  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA                   */  \
  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */
    366 
    367 void I422ToARGB4444Row_NEON(const uint8* src_y,
    368                             const uint8* src_u,
    369                             const uint8* src_v,
    370                             uint8* dst_argb4444,
    371                             const struct YuvConstants* yuvconstants,
    372                             int width) {
    373   asm volatile (
    374     YUVTORGB_SETUP
    375     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
    376   "1:                                          \n"
    377     READYUV422
    378     YUVTORGB(v22, v21, v20)
    379     "subs       %w4, %w4, #8                   \n"
    380     "movi       v23.8b, #255                   \n"
    381     ARGBTOARGB4444
    382     MEMACCESS(3)
    383     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
    384     "b.gt       1b                             \n"
    385     : "+r"(src_y),    // %0
    386       "+r"(src_u),    // %1
    387       "+r"(src_v),    // %2
    388       "+r"(dst_argb4444),  // %3
    389       "+r"(width)     // %4
    390     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    391       [kUVToG]"r"(&yuvconstants->kUVToG),
    392       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    393       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    394     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    395       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    396   );
    397 }
    398 
    399 void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
    400   asm volatile (
    401     YUVTORGB_SETUP
    402     "movi       v23.8b, #255                   \n"
    403   "1:                                          \n"
    404     READYUV400
    405     YUVTORGB(v22, v21, v20)
    406     "subs       %w2, %w2, #8                   \n"
    407     MEMACCESS(1)
    408     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
    409     "b.gt       1b                             \n"
    410     : "+r"(src_y),     // %0
    411       "+r"(dst_argb),  // %1
    412       "+r"(width)      // %2
    413     : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
    414       [kUVToG]"r"(&kYuvI601Constants.kUVToG),
    415       [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
    416       [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
    417     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    418       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    419   );
    420 }
    421 
    422 void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
    423   asm volatile (
    424     "movi       v23.8b, #255                   \n"
    425   "1:                                          \n"
    426     MEMACCESS(0)
    427     "ld1        {v20.8b}, [%0], #8             \n"
    428     "orr        v21.8b, v20.8b, v20.8b         \n"
    429     "orr        v22.8b, v20.8b, v20.8b         \n"
    430     "subs       %w2, %w2, #8                   \n"
    431     MEMACCESS(1)
    432     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
    433     "b.gt       1b                             \n"
    434     : "+r"(src_y),     // %0
    435       "+r"(dst_argb),  // %1
    436       "+r"(width)      // %2
    437     :
    438     : "cc", "memory", "v20", "v21", "v22", "v23"
    439   );
    440 }
    441 
    442 void NV12ToARGBRow_NEON(const uint8* src_y,
    443                         const uint8* src_uv,
    444                         uint8* dst_argb,
    445                         const struct YuvConstants* yuvconstants,
    446                         int width) {
    447   asm volatile (
    448     YUVTORGB_SETUP
    449     "movi       v23.8b, #255                   \n"
    450   "1:                                          \n"
    451     READNV12
    452     YUVTORGB(v22, v21, v20)
    453     "subs       %w3, %w3, #8                   \n"
    454     MEMACCESS(2)
    455     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
    456     "b.gt       1b                             \n"
    457     : "+r"(src_y),     // %0
    458       "+r"(src_uv),    // %1
    459       "+r"(dst_argb),  // %2
    460       "+r"(width)      // %3
    461     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    462       [kUVToG]"r"(&yuvconstants->kUVToG),
    463       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    464       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    465     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    466       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    467   );
    468 }
    469 
    470 void NV21ToARGBRow_NEON(const uint8* src_y,
    471                         const uint8* src_vu,
    472                         uint8* dst_argb,
    473                         const struct YuvConstants* yuvconstants,
    474                         int width) {
    475   asm volatile (
    476     YUVTORGB_SETUP
    477     "movi       v23.8b, #255                   \n"
    478   "1:                                          \n"
    479     READNV21
    480     YUVTORGB(v22, v21, v20)
    481     "subs       %w3, %w3, #8                   \n"
    482     MEMACCESS(2)
    483     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
    484     "b.gt       1b                             \n"
    485     : "+r"(src_y),     // %0
    486       "+r"(src_vu),    // %1
    487       "+r"(dst_argb),  // %2
    488       "+r"(width)      // %3
    489     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    490       [kUVToG]"r"(&yuvconstants->kUVToG),
    491       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    492       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    493     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    494       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    495   );
    496 }
    497 
    498 void NV12ToRGB565Row_NEON(const uint8* src_y,
    499                           const uint8* src_uv,
    500                           uint8* dst_rgb565,
    501                           const struct YuvConstants* yuvconstants,
    502                           int width) {
    503   asm volatile (
    504     YUVTORGB_SETUP
    505   "1:                                          \n"
    506     READNV12
    507     YUVTORGB(v22, v21, v20)
    508     "subs       %w3, %w3, #8                   \n"
    509     ARGBTORGB565
    510     MEMACCESS(2)
    511     "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
    512     "b.gt       1b                             \n"
    513     : "+r"(src_y),     // %0
    514       "+r"(src_uv),    // %1
    515       "+r"(dst_rgb565),  // %2
    516       "+r"(width)      // %3
    517     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    518       [kUVToG]"r"(&yuvconstants->kUVToG),
    519       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    520       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    521     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    522       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    523   );
    524 }
    525 
    526 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
    527                         uint8* dst_argb,
    528                         const struct YuvConstants* yuvconstants,
    529                         int width) {
    530   asm volatile (
    531     YUVTORGB_SETUP
    532     "movi       v23.8b, #255                   \n"
    533   "1:                                          \n"
    534     READYUY2
    535     YUVTORGB(v22, v21, v20)
    536     "subs       %w2, %w2, #8                   \n"
    537     MEMACCESS(1)
    538     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
    539     "b.gt       1b                             \n"
    540     : "+r"(src_yuy2),  // %0
    541       "+r"(dst_argb),  // %1
    542       "+r"(width)      // %2
    543     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    544       [kUVToG]"r"(&yuvconstants->kUVToG),
    545       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    546       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    547     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    548       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    549   );
    550 }
    551 
    552 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
    553                         uint8* dst_argb,
    554                         const struct YuvConstants* yuvconstants,
    555                         int width) {
    556   asm volatile (
    557     YUVTORGB_SETUP
    558     "movi       v23.8b, #255                   \n"
    559   "1:                                          \n"
    560     READUYVY
    561     YUVTORGB(v22, v21, v20)
    562     "subs       %w2, %w2, #8                   \n"
    563     MEMACCESS(1)
    564     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
    565     "b.gt       1b                             \n"
    566     : "+r"(src_uyvy),  // %0
    567       "+r"(dst_argb),  // %1
    568       "+r"(width)      // %2
    569     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
    570       [kUVToG]"r"(&yuvconstants->kUVToG),
    571       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
    572       [kYToRgb]"r"(&yuvconstants->kYToRgb)
    573     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
    574       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
    575   );
    576 }
    577 
    578 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
    579 void SplitUVRow_NEON(const uint8* src_uv,
    580                      uint8* dst_u,
    581                      uint8* dst_v,
    582                      int width) {
    583   asm volatile (
    584   "1:                                          \n"
    585     MEMACCESS(0)
    586     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
    587     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    588     MEMACCESS(1)
    589     "st1        {v0.16b}, [%1], #16            \n"  // store U
    590     MEMACCESS(2)
    591     "st1        {v1.16b}, [%2], #16            \n"  // store V
    592     "b.gt       1b                             \n"
    593     : "+r"(src_uv),  // %0
    594       "+r"(dst_u),   // %1
    595       "+r"(dst_v),   // %2
    596       "+r"(width)    // %3  // Output registers
    597     :                       // Input registers
    598     : "cc", "memory", "v0", "v1"  // Clobber List
    599   );
    600 }
    601 
    602 // Reads 16 U's and V's and writes out 16 pairs of UV.
    603 void MergeUVRow_NEON(const uint8* src_u,
    604                      const uint8* src_v,
    605                      uint8* dst_uv,
    606                      int width) {
    607   asm volatile (
    608   "1:                                          \n"
    609     MEMACCESS(0)
    610     "ld1        {v0.16b}, [%0], #16            \n"  // load U
    611     MEMACCESS(1)
    612     "ld1        {v1.16b}, [%1], #16            \n"  // load V
    613     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    614     MEMACCESS(2)
    615     "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
    616     "b.gt       1b                             \n"
    617     :
    618       "+r"(src_u),   // %0
    619       "+r"(src_v),   // %1
    620       "+r"(dst_uv),  // %2
    621       "+r"(width)    // %3  // Output registers
    622     :                       // Input registers
    623     : "cc", "memory", "v0", "v1"  // Clobber List
    624   );
    625 }
    626 
    627 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
    628 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
    629   asm volatile (
    630   "1:                                          \n"
    631     MEMACCESS(0)
    632     "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
    633     "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
    634     MEMACCESS(1)
    635     "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
    636     "b.gt       1b                             \n"
    637   : "+r"(src),   // %0
    638     "+r"(dst),   // %1
    639     "+r"(count)  // %2  // Output registers
    640   :                     // Input registers
    641   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
    642   );
    643 }
    644 
    645 // SetRow fills 'count' bytes at dst with the 8 bit value v8.
    // The value is broadcast to all 16 byte lanes of v0 once, then 16 bytes are
    // stored per iteration.  The count is decremented before the store and tested
    // afterwards, so it is effectively rounded up to a multiple of 16.
    646 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
    647   asm volatile (
    648     "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
    649   "1:                                          \n"
    650     "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
    651     MEMACCESS(0)
    652     "st1        {v0.16b}, [%0], #16            \n"  // store
    653     "b.gt       1b                             \n"  // loop while count > 0
    654   : "+r"(dst),   // %0
    655     "+r"(count)  // %1
    656   : "r"(v8)      // %2
    657   : "cc", "memory", "v0"
    658   );
    659 }
    660 
    // Fills dst with 'count' copies of the 32 bit value v32.
    // v32 is broadcast to the four 32 bit lanes of v0; each iteration stores
    // 16 bytes (4 values) and subtracts 4 from count, so count is in units of
    // 32 bit values and is effectively rounded up to a multiple of 4.
    661 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
    662   asm volatile (
    663     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
    664   "1:                                          \n"
    665     "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
    666     MEMACCESS(0)
    667     "st1        {v0.16b}, [%0], #16            \n"  // store
    668     "b.gt       1b                             \n"  // loop while count > 0
    669   : "+r"(dst),   // %0
    670     "+r"(count)  // %1
    671   : "r"(v32)     // %2
    672   : "cc", "memory", "v0"
    673   );
    674 }
    675 
    // Writes the bytes of src to dst in reverse order, 16 bytes per iteration.
    // src is first moved to the last 16 bytes of the row (src + width - 16) and
    // then walks backwards via the -16 post-index in %3, while dst walks forwards.
    // rev64 reverses the bytes inside each 64 bit half; storing the upper half
    // before the lower half completes the 16 byte reversal.
    676 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    677   asm volatile (
    678     // Start at end of source row.
    679     "add        %0, %0, %w2, sxtw              \n"
    680     "sub        %0, %0, #16                    \n"
    681   "1:                                          \n"
    682     MEMACCESS(0)
    683     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
    684     "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
    685     "rev64      v0.16b, v0.16b                 \n"
    686     MEMACCESS(1)
    687     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
    688     MEMACCESS(1)
    689     "st1        {v0.D}[0], [%1], #8            \n"
    690     "b.gt       1b                             \n"  // loop while width > 0
    691   : "+r"(src),   // %0
    692     "+r"(dst),   // %1
    693     "+r"(width)  // %2
    694   : "r"((ptrdiff_t)-16)    // %3
    695   : "cc", "memory", "v0"
    696   );
    697 }
    698 
    // Splits an interleaved UV row into separate U and V rows, reversed.
    // src_uv is first moved to its last 16 bytes (src_uv + 2 * width - 16) and
    // then walks backwards via the -16 post-index in %4.  Each iteration
    // de-interleaves 8 UV pairs with ld2, reverses the 8 bytes of each plane with
    // rev64 and writes 8 bytes to dst_u and 8 bytes to dst_v.
    699 void MirrorUVRow_NEON(const uint8* src_uv,
    700                       uint8* dst_u,
    701                       uint8* dst_v,
    702                       int width) {
    703   asm volatile (
    704     // Start at end of source row.
    705     "add        %0, %0, %w3, sxtw #1           \n"
    706     "sub        %0, %0, #16                    \n"
    707   "1:                                          \n"
    708     MEMACCESS(0)
    709     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
    710     "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
    711     "rev64      v0.8b, v0.8b                   \n"
    712     "rev64      v1.8b, v1.8b                   \n"
    713     MEMACCESS(1)
    714     "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
    715     MEMACCESS(2)
    716     "st1        {v1.8b}, [%2], #8              \n"
    717     "b.gt       1b                             \n"  // loop while width > 0
    718   : "+r"(src_uv),  // %0
    719     "+r"(dst_u),   // %1
    720     "+r"(dst_v),   // %2
    721     "+r"(width)    // %3
    722   : "r"((ptrdiff_t)-16)      // %4
    723   : "cc", "memory", "v0", "v1"
    724   );
    725 }
    726 
    // Reverses the order of 4 byte pixels in a row, 4 pixels (16 bytes) per
    // iteration; the bytes inside each pixel keep their order.
    // src is first moved to its last 16 bytes (src + 4 * width - 16) and then
    // walks backwards via the -16 post-index in %3.  rev64 on .4s swaps the two
    // pixels in each 64 bit half; storing the upper half first swaps the halves.
    727 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    728   asm volatile (
    729   // Start at end of source row.
    730     "add        %0, %0, %w2, sxtw #2           \n"
    731     "sub        %0, %0, #16                    \n"
    732   "1:                                          \n"
    733     MEMACCESS(0)
    734     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
    735     "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
    736     "rev64      v0.4s, v0.4s                   \n"
    737     MEMACCESS(1)
    738     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
    739     MEMACCESS(1)
    740     "st1        {v0.D}[0], [%1], #8            \n"
    741     "b.gt       1b                             \n"  // loop while width > 0
    742   : "+r"(src),   // %0
    743     "+r"(dst),   // %1
    744     "+r"(width)  // %2
    745   : "r"((ptrdiff_t)-16)    // %3
    746   : "cc", "memory", "v0"
    747   );
    748 }
    749 
    // Expands 3 byte pixels to 4 byte pixels by appending a constant 255 byte.
    // Each iteration de-interleaves 8 pixels (24 bytes) into v1..v3 and stores
    // them with v4 (all 255) as the fourth byte, keeping the source byte order.
    // The count is tested after the body, so width is rounded up to a multiple of 8.
    750 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
    751   asm volatile (
    752     "movi       v4.8b, #255                    \n"  // Alpha
    753   "1:                                          \n"
    754     MEMACCESS(0)
    755     "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
    756     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    757     MEMACCESS(1)
    758     "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    759     "b.gt       1b                             \n"  // loop while width > 0
    760   : "+r"(src_rgb24),  // %0
    761     "+r"(dst_argb),   // %1
    762     "+r"(width)       // %2
    763   :
    764   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
    765   );
    766 }
    767 
    // Converts 3 byte pixels to 4 byte pixels, reversing the order of the three
    // source bytes and appending a constant 255 byte.  Each iteration
    // de-interleaves 8 pixels into v0 (r), v1 (g), v2 (b), copies g and r into
    // v3 and v4 so the store registers are consecutive, and stores b g r a.
    768 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
    769   asm volatile (
    770     "movi       v5.8b, #255                    \n"  // Alpha
    771   "1:                                          \n"
    772     MEMACCESS(0)
    773     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
    774     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    775     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
    776     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
    777     MEMACCESS(1)
    778     "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
    779     "b.gt       1b                             \n"  // loop while width > 0
    780   : "+r"(src_raw),   // %0
    781     "+r"(dst_argb),  // %1
    782     "+r"(width)      // %2
    783   :
    784   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
    785   );
    786 }
    787 
    // Reverses the byte order inside each 3 byte pixel (r g b -> b g r).
    // Each iteration de-interleaves 8 pixels into v0..v2, copies g and r into
    // v3 and v4 so the store registers are consecutive, and stores v2 v3 v4.
    // The count is tested after the body, so width is rounded up to a multiple of 8.
    788 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
    789   asm volatile (
    790   "1:                                          \n"
    791     MEMACCESS(0)
    792     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
    793     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    794     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
    795     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
    796     MEMACCESS(1)
    797     "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
    798     "b.gt       1b                             \n"  // loop while width > 0
    799   : "+r"(src_raw),    // %0
    800     "+r"(dst_rgb24),  // %1
    801     "+r"(width)       // %2
    802   :
    803   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
    804   );
    805 }
    806 
    // Asm fragment: expands 8 RGB565 pixels held in v0.8h to 8 bit channels.
    // On exit v0.8b = B, v1.8b = G, v2.8b = R.  Each channel is widened by
    // shifting it to the top of the byte and OR-ing its own high bits into the
    // vacated low bits.  Uses v4 and v6 as scratch; v3 is not touched.
    807 #define RGB565TOARGB                                                        \
    808   "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
    809   "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
    810   "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
    811   "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
    812   "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
    813   "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
    814   "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
    815   "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
    816   "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
    817   "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
    818   "dup        v2.2D, v0.D[1]                 \n" /* R                    */
    819 
    // Converts RGB565 pixels to 4 byte pixels stored as B, G, R, 255.
    // Each iteration loads 8 pixels (16 bytes), expands them with RGB565TOARGB
    // into v0..v2 and stores them with the constant alpha kept in v3.
    // The count is tested after the body, so width is rounded up to a multiple of 8.
    820 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
    821   asm volatile (
    822     "movi       v3.8b, #255                    \n"  // Alpha
    823   "1:                                          \n"
    824     MEMACCESS(0)
    825     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
    826     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    827     RGB565TOARGB
    828     MEMACCESS(1)
    829     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    830     "b.gt       1b                             \n"  // loop while width > 0
    831   : "+r"(src_rgb565),  // %0
    832     "+r"(dst_argb),    // %1
    833     "+r"(width)          // %2
    834   :
    835   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
    836   );
    837 }
    838 
    // Asm fragment: expands 8 ARGB1555 pixels held in v0.8h to 8 bit channels.
    // On exit v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.  The 5 bit channels
    // are widened by OR-ing their top 3 bits into the low bits; the alpha bit
    // is spread to a whole byte with an arithmetic shift right by 15.
    // Only v0..v3 are written.
    839 #define ARGB1555TOARGB                                                      \
    840   "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
    841   "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
    842   "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
    843                                                                             \
    844   "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
    845   "xtn2       v3.16b, v2.8h                  \n"                            \
    846                                                                             \
    847   "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
    848   "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
    849                                                                             \
    850   "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
    851   "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
    852   "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
    853                                                                             \
    854   "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
    855   "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
    856   "dup        v1.2D, v0.D[1]                 \n"                            \
    857   "dup        v3.2D, v2.D[1]                 \n"
    858 
    859 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
    // Input: 8 pixels in v0.8h.  On exit v0.8b = B, v1.8b = G, v2.8b = R.
    // v3 is used as scratch for the red channel and is overwritten.
    860 #define RGB555TOARGB                                                        \
    861   "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
    862   "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
    863   "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
    864                                                                             \
    865   "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
    866   "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
    867                                                                             \
    868   "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
    869   "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
    870   "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
    871                                                                             \
    872   "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
    873   "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
    874   "dup        v1.2D, v0.D[1]                 \n" /* G */
    875 
    // Converts ARGB1555 pixels to 4 byte pixels stored as B, G, R, A.
    // Each iteration loads 8 pixels (16 bytes), expands them with ARGB1555TOARGB
    // into v0..v3 and stores 32 bytes.  The count is tested after the body, so
    // width is rounded up to a multiple of 8.
    // NOTE(review): the initial "movi v3, #255" looks redundant, because
    // ARGB1555TOARGB rewrites v3 from the source alpha bit before every store.
    876 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
    877                             uint8* dst_argb,
    878                             int width) {
    879   asm volatile (
    880     "movi       v3.8b, #255                    \n"  // Alpha
    881   "1:                                          \n"
    882     MEMACCESS(0)
    883     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
    884     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    885     ARGB1555TOARGB
    886     MEMACCESS(1)
    887     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    888     "b.gt       1b                             \n"  // loop while width > 0
    889   : "+r"(src_argb1555),  // %0
    890     "+r"(dst_argb),    // %1
    891     "+r"(width)          // %2
    892   :
    893   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
    894   );
    895 }
    896 
    // Asm fragment: expands 8 ARGB4444 pixels held in v0.8h to 8 bit channels.
    // On exit v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.  Each 4 bit value is
    // replicated into both nibbles of its output byte.  Only v0..v3 are written.
    897 #define ARGB4444TOARGB                                                      \
    898   "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
    899   "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
    900   "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
    901   "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
    902   "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
    903   "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
    904   "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
    905   "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
    906   "dup        v0.2D, v2.D[1]                 \n"                            \
    907   "dup        v1.2D, v3.D[1]                 \n"
    908 
    // Converts ARGB4444 pixels to 4 byte pixels stored as B, G, R, A.
    // Each iteration loads 8 pixels (16 bytes), expands them with ARGB4444TOARGB
    // into v0..v3 and stores 32 bytes.  The count is tested after the body, so
    // width is rounded up to a multiple of 8.
    // NOTE(review): v4 is listed as clobbered but neither this function nor
    // ARGB4444TOARGB references it.
    909 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
    910                             uint8* dst_argb,
    911                             int width) {
    912   asm volatile (
    913   "1:                                          \n"
    914     MEMACCESS(0)
    915     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
    916     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    917     ARGB4444TOARGB
    918     MEMACCESS(1)
    919     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    920     "b.gt       1b                             \n"  // loop while width > 0
    921   : "+r"(src_argb4444),  // %0
    922     "+r"(dst_argb),    // %1
    923     "+r"(width)          // %2
    924   :
    925   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
    926   );
    927 }
    928 
    // Converts 4 byte pixels to 3 byte pixels by dropping the fourth byte.
    // Each iteration de-interleaves 8 pixels (32 bytes) into v1..v4 and stores
    // v1..v3 (24 bytes) in the same byte order.  The count is tested after the
    // body, so width is rounded up to a multiple of 8.
    929 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
    930   asm volatile (
    931   "1:                                          \n"
    932     MEMACCESS(0)
    933     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
    934     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    935     MEMACCESS(1)
    936     "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
    937     "b.gt       1b                             \n"  // loop while width > 0
    938   : "+r"(src_argb),   // %0
    939     "+r"(dst_rgb24),  // %1
    940     "+r"(width)         // %2
    941   :
    942   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
    943   );
    944 }
    945 
    // Converts 4 byte pixels (b g r a) to 3 byte pixels stored as r g b.
    // Each iteration de-interleaves 8 pixels into v1..v4, then copies g into v4
    // (overwriting the unused alpha) and b into v5 so that v3 v4 v5 form the
    // consecutive register list needed by st3.
    946 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
    947   asm volatile (
    948   "1:                                          \n"
    949     MEMACCESS(0)
    950     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
    951     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    952     "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
    953     "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
    954     MEMACCESS(1)
    955     "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
    956     "b.gt       1b                             \n"  // loop while width > 0
    957   : "+r"(src_argb),  // %0
    958     "+r"(dst_raw),   // %1
    959     "+r"(width)        // %2
    960   :
    961   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
    962   );
    963 }
    964 
    // Extracts the Y samples from a YUY2 row.  Each iteration de-interleaves
    // 32 source bytes with ld2 and stores the 16 even-indexed bytes (v0) to
    // dst_y; the odd-indexed bytes (v1) are discarded.  The count is tested
    // after the body, so width is rounded up to a multiple of 16.
    965 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
    966   asm volatile (
    967   "1:                                          \n"
    968     MEMACCESS(0)
    969     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
    970     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
    971     MEMACCESS(1)
    972     "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
    973     "b.gt       1b                             \n"  // loop while width > 0
    974   : "+r"(src_yuy2),  // %0
    975     "+r"(dst_y),     // %1
    976     "+r"(width)        // %2
    977   :
    978   : "cc", "memory", "v0", "v1"  // Clobber List
    979   );
    980 }
    981 
    // Extracts the Y samples from a UYVY row.  Each iteration de-interleaves
    // 32 source bytes with ld2 and stores the 16 odd-indexed bytes (v1) to
    // dst_y; the even-indexed bytes (v0) are discarded.  The count is tested
    // after the body, so width is rounded up to a multiple of 16.
    982 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
    983   asm volatile (
    984   "1:                                          \n"
    985     MEMACCESS(0)
    986     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
    987     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
    988     MEMACCESS(1)
    989     "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
    990     "b.gt       1b                             \n"  // loop while width > 0
    991   : "+r"(src_uyvy),  // %0
    992     "+r"(dst_y),     // %1
    993     "+r"(width)        // %2
    994   :
    995   : "cc", "memory", "v0", "v1"  // Clobber List
    996   );
    997 }
    998 
    // Extracts the U and V samples from a YUY2 row without vertical filtering.
    // Each iteration de-interleaves 32 source bytes (16 pixels) four ways with
    // ld4 and stores byte 1 of every group (v1) to dst_u and byte 3 (v3) to
    // dst_v, 8 bytes each.  width counts pixels, 16 consumed per iteration.
    999 void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
   1000                          uint8* dst_u,
   1001                          uint8* dst_v,
   1002                          int width) {
   1003   asm volatile (
   1004   "1:                                          \n"
   1005     MEMACCESS(0)
   1006     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
   1007     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
   1008     MEMACCESS(1)
   1009     "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
   1010     MEMACCESS(2)
   1011     "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
   1012     "b.gt       1b                             \n"  // loop while width > 0
   1013   : "+r"(src_yuy2),  // %0
   1014     "+r"(dst_u),     // %1
   1015     "+r"(dst_v),     // %2
   1016     "+r"(width)        // %3
   1017   :
   1018   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   1019   );
   1020 }
   1021 
   // Extracts the U and V samples from a UYVY row without vertical filtering.
   // Each iteration de-interleaves 32 source bytes (16 pixels) four ways with
   // ld4 and stores byte 0 of every group (v0) to dst_u and byte 2 (v2) to
   // dst_v, 8 bytes each.  width counts pixels, 16 consumed per iteration.
   1022 void UYVYToUV422Row_NEON(const uint8* src_uyvy,
   1023                          uint8* dst_u,
   1024                          uint8* dst_v,
   1025                          int width) {
   1026   asm volatile (
   1027   "1:                                          \n"
   1028     MEMACCESS(0)
   1029     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
   1030     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
   1031     MEMACCESS(1)
   1032     "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
   1033     MEMACCESS(2)
   1034     "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
   1035     "b.gt       1b                             \n"  // loop while width > 0
   1036   : "+r"(src_uyvy),  // %0
   1037     "+r"(dst_u),     // %1
   1038     "+r"(dst_v),     // %2
   1039     "+r"(width)        // %3
   1040   :
   1041   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   1042   );
   1043 }
   1044 
   // Extracts U and V from two vertically adjacent YUY2 rows, averaging them.
   // The second row starts at src_yuy2 + stride_yuy2 bytes.  Each iteration
   // de-interleaves 16 pixels from each row with ld4, combines the U bytes
   // (v1/v5) and V bytes (v3/v7) with urhadd (rounding halving add) and stores
   // 8 U and 8 V.  width counts pixels, 16 consumed per iteration.
   1045 void YUY2ToUVRow_NEON(const uint8* src_yuy2,
   1046                       int stride_yuy2,
   1047                       uint8* dst_u,
   1048                       uint8* dst_v,
   1049                       int width) {
   1050   const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
   1051   asm volatile (
   1052   "1:                                          \n"
   1053     MEMACCESS(0)
   1054     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
   1055     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
   1056     MEMACCESS(1)
   1057     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
   1058     "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
   1059     "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
   1060     MEMACCESS(2)
   1061     "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
   1062     MEMACCESS(3)
   1063     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
   1064     "b.gt       1b                             \n"  // loop while width > 0
   1065   : "+r"(src_yuy2),     // %0
   1066     "+r"(src_yuy2b),    // %1
   1067     "+r"(dst_u),        // %2
   1068     "+r"(dst_v),        // %3
   1069     "+r"(width)           // %4
   1070   :
   1071   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
   1072     "v5", "v6", "v7"  // Clobber List
   1073   );
   1074 }
   1075 
   // Extracts U and V from two vertically adjacent UYVY rows, averaging them.
   // The second row starts at src_uyvy + stride_uyvy bytes.  Each iteration
   // de-interleaves 16 pixels from each row with ld4, combines the U bytes
   // (v0/v4) and V bytes (v2/v6) with urhadd (rounding halving add) and stores
   // 8 U and 8 V.  width counts pixels, 16 consumed per iteration.
   1076 void UYVYToUVRow_NEON(const uint8* src_uyvy,
   1077                       int stride_uyvy,
   1078                       uint8* dst_u,
   1079                       uint8* dst_v,
   1080                       int width) {
   1081   const uint8* src_uyvyb = src_uyvy + stride_uyvy;
   1082   asm volatile (
   1083   "1:                                          \n"
   1084     MEMACCESS(0)
   1085     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
   1086     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
   1087     MEMACCESS(1)
   1088     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
   1089     "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
   1090     "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
   1091     MEMACCESS(2)
   1092     "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
   1093     MEMACCESS(3)
   1094     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
   1095     "b.gt       1b                             \n"  // loop while width > 0
   1096   : "+r"(src_uyvy),     // %0
   1097     "+r"(src_uyvyb),    // %1
   1098     "+r"(dst_u),        // %2
   1099     "+r"(dst_v),        // %3
   1100     "+r"(width)           // %4
   1101   :
   1102   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
   1103     "v5", "v6", "v7"  // Clobber List
   1104   );
   1105 }
   1106 
   1107 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
   // Reorders bytes within each group of 16 source bytes (4 pixels) using a
   // 16 entry index table.  The table at 'shuffler' is loaded once into v2; each
   // iteration then loads 16 bytes, selects output bytes with tbl and stores 16
   // bytes.  The count is tested after the body, so width is rounded up to a
   // multiple of 4.
   1108 void ARGBShuffleRow_NEON(const uint8* src_argb,
   1109                          uint8* dst_argb,
   1110                          const uint8* shuffler,
   1111                          int width) {
   1112   asm volatile (
   1113     MEMACCESS(3)
   1114     "ld1        {v2.16b}, [%3]                 \n"  // shuffler
   1115   "1:                                          \n"
   1116     MEMACCESS(0)
   1117     "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
   1118     "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
   1119     "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
   1120     MEMACCESS(1)
   1121     "st1        {v1.16b}, [%1], #16            \n"  // store 4.
   1122     "b.gt       1b                             \n"  // loop while width > 0
   1123   : "+r"(src_argb),  // %0
   1124     "+r"(dst_argb),  // %1
   1125     "+r"(width)        // %2
   1126   : "r"(shuffler)    // %3
   1127   : "cc", "memory", "v0", "v1", "v2"  // Clobber List
   1128   );
   1129 }
   1130 
   // Packs planar Y, U and V rows into one row of Y0 U Y1 V groups.
   // Each iteration reads 16 Y, 8 U and 8 V bytes.  ld2 splits the Y bytes into
   // even (v0) and odd (v1) samples; the odd samples are moved to v2 so that U
   // can be loaded into v1 and st4 can interleave v0 v1 v2 v3 as Y U Y V.
   // width counts pixels, 16 consumed per iteration.
   1131 void I422ToYUY2Row_NEON(const uint8* src_y,
   1132                         const uint8* src_u,
   1133                         const uint8* src_v,
   1134                         uint8* dst_yuy2,
   1135                         int width) {
   1136   asm volatile (
   1137   "1:                                          \n"
   1138     MEMACCESS(0)
   1139     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
   1140     "orr        v2.8b, v1.8b, v1.8b            \n"  // move odd Ys to v2
   1141     MEMACCESS(1)
   1142     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
   1143     MEMACCESS(2)
   1144     "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
   1145     "subs       %w4, %w4, #16                  \n"  // 16 pixels
   1146     MEMACCESS(3)
   1147     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
   1148     "b.gt       1b                             \n"  // loop while width > 0
   1149   : "+r"(src_y),     // %0
   1150     "+r"(src_u),     // %1
   1151     "+r"(src_v),     // %2
   1152     "+r"(dst_yuy2),  // %3
   1153     "+r"(width)      // %4
   1154   :
   1155   : "cc", "memory", "v0", "v1", "v2", "v3"
   1156   );
   1157 }
   1158 
   // Packs planar Y, U and V rows into one row of U Y0 V Y1 groups.
   // Each iteration reads 16 Y, 8 U and 8 V bytes.  ld2 splits the Y bytes into
   // even (v1) and odd (v2) samples; the odd samples are moved to v3 so that V
   // can be loaded into v2 and st4 can interleave v0 v1 v2 v3 as U Y V Y.
   // width counts pixels, 16 consumed per iteration.
   1159 void I422ToUYVYRow_NEON(const uint8* src_y,
   1160                         const uint8* src_u,
   1161                         const uint8* src_v,
   1162                         uint8* dst_uyvy,
   1163                         int width) {
   1164   asm volatile (
   1165   "1:                                          \n"
   1166     MEMACCESS(0)
   1167     "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
   1168     "orr        v3.8b, v2.8b, v2.8b            \n"  // move odd Ys to v3
   1169     MEMACCESS(1)
   1170     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
   1171     MEMACCESS(2)
   1172     "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
   1173     "subs       %w4, %w4, #16                  \n"  // 16 pixels
   1174     MEMACCESS(3)
   1175     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
   1176     "b.gt       1b                             \n"  // loop while width > 0
   1177   : "+r"(src_y),     // %0
   1178     "+r"(src_u),     // %1
   1179     "+r"(src_v),     // %2
   1180     "+r"(dst_uyvy),  // %3
   1181     "+r"(width)      // %4
   1182   :
   1183   : "cc", "memory", "v0", "v1", "v2", "v3"
   1184   );
   1185 }
   1186 
   // Converts 4 byte pixels to 16 bit RGB565 pixels, 8 pixels per iteration.
   // The four source channels are de-interleaved into v20..v23, the
   // ARGBTORGB565 fragment (defined outside this section; presumably packs
   // v20..v22 into v0 - confirm against its definition) runs, and the 16 bytes
   // of v0 are stored.  The count is tested after the body, so width is rounded
   // up to a multiple of 8.
   1187 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
   1188   asm volatile (
   1189   "1:                                          \n"
   1190     MEMACCESS(0)
   1191     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
   1192     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1193     ARGBTORGB565
   1194     MEMACCESS(1)
   1195     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
   1196     "b.gt       1b                             \n"  // loop while width > 0
   1197   : "+r"(src_argb),  // %0
   1198     "+r"(dst_rgb565),  // %1
   1199     "+r"(width)        // %2
   1200   :
   1201   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
   1202   );
   1203 }
   1204 
   // Converts 4 byte pixels to 16 bit RGB565 pixels with dithering, 8 pixels
   // per iteration.  dither4 holds four dither bytes; it is broadcast across v1
   // so pixel i of every group of 8 uses dither byte (i mod 4).  The dither
   // byte is added with unsigned saturation to the first three channels
   // (v20..v22) before the ARGBTORGB565 fragment (defined outside this section)
   // packs the result into v0.  The count is tested after the body, so width is
   // rounded up to a multiple of 8.
   //
   // src_argb and width are modified by the asm (post-indexed load and subs),
   // so they must be read-write ("+r") operands; declaring them as plain inputs
   // lets the compiler assume their registers are unchanged.
   1205 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
   1206                                 uint8* dst_rgb,
   1207                                 const uint32 dither4,
   1208                                 int width) {
   1209   asm volatile (
   1210     "dup        v1.4s, %w3                     \n"  // dither4
   1211   "1:                                          \n"
   1212     MEMACCESS(1)
   1213     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
   1214     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1215     "uqadd      v20.8b, v20.8b, v1.8b          \n"
   1216     "uqadd      v21.8b, v21.8b, v1.8b          \n"
   1217     "uqadd      v22.8b, v22.8b, v1.8b          \n"
   1218     ARGBTORGB565
   1219     MEMACCESS(0)
   1220     "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
   1221     "b.gt       1b                             \n"  // loop while width > 0
   1222   : "+r"(dst_rgb),   // %0
   1223     "+r"(src_argb),  // %1
   1224     "+r"(width)      // %2
   1225   : "r"(dither4)     // %3
   1226   : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
   1227   );
   1228 }
   1229 
   // Converts 4 byte pixels to 16 bit ARGB1555 pixels, 8 pixels per iteration.
   // The four source channels are de-interleaved into v20..v23, the
   // ARGBTOARGB1555 fragment (defined outside this section; presumably packs
   // them into v0 - confirm against its definition) runs, and the 16 bytes of
   // v0 are stored.  The count is tested after the body, so width is rounded up
   // to a multiple of 8.
   1230 void ARGBToARGB1555Row_NEON(const uint8* src_argb,
   1231                             uint8* dst_argb1555,
   1232                             int width) {
   1233   asm volatile (
   1234   "1:                                          \n"
   1235     MEMACCESS(0)
   1236     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
   1237     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1238     ARGBTOARGB1555
   1239     MEMACCESS(1)
   1240     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
   1241     "b.gt       1b                             \n"  // loop while width > 0
   1242   : "+r"(src_argb),  // %0
   1243     "+r"(dst_argb1555),  // %1
   1244     "+r"(width)        // %2
   1245   :
   1246   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
   1247   );
   1248 }
   1249 
   // Converts 4 byte pixels to 16 bit ARGB4444 pixels, 8 pixels per iteration.
   // v4 is preloaded with 0x0f in every byte for use by the ARGBTOARGB4444
   // fragment (defined outside this section; presumably it clears the low
   // nibbles and packs v20..v23 into v0 - confirm against its definition).
   // The 16 bytes of v0 are stored each iteration.  The count is tested after
   // the body, so width is rounded up to a multiple of 8.
   1250 void ARGBToARGB4444Row_NEON(const uint8* src_argb,
   1251                             uint8* dst_argb4444,
   1252                             int width) {
   1253   asm volatile (
   1254     "movi       v4.16b, #0x0f                  \n"  // bits to clear (bic mask).
   1255   "1:                                          \n"
   1256     MEMACCESS(0)
   1257     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
   1258     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1259     ARGBTOARGB4444
   1260     MEMACCESS(1)
   1261     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
   1262     "b.gt       1b                             \n"  // loop while width > 0
   1263   : "+r"(src_argb),      // %0
   1264     "+r"(dst_argb4444),  // %1
   1265     "+r"(width)            // %2
   1266   :
   1267   : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
   1268   );
   1269 }
   1270 
   // Computes one luma byte per 4 byte pixel, 8 pixels per iteration:
   //   Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
   // The products are accumulated in 16 bit lanes, narrowed with a rounding
   // right shift by 7 and then offset by 16 using a saturating add.  The fourth
   // source byte is loaded into v3 but ignored (v3 becomes the accumulator).
   // The count is tested after the body, so width is rounded up to a multiple
   // of 8.
   1271 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
   1272   asm volatile (
   1273     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   1274     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   1275     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   1276     "movi       v7.8b, #16                     \n"  // Add 16 constant
   1277   "1:                                          \n"
   1278     MEMACCESS(0)
   1279     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   1280     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1281     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
   1282     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
   1283     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
   1284     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
   1285     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating
   1286     MEMACCESS(1)
   1287     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   1288     "b.gt       1b                             \n"  // loop while width > 0
   1289   : "+r"(src_argb),  // %0
   1290     "+r"(dst_y),     // %1
   1291     "+r"(width)        // %2
   1292   :
   1293   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   1294   );
   1295 }
   1296 
   // Copies the fourth byte of every 4 byte pixel to dst_a.
   // Each iteration de-interleaves 16 pixels (64 bytes) with ld4 and stores the
   // 16 bytes of v3; v0..v2 are discarded.  The count is tested after the body,
   // so width is rounded up to a multiple of 16.
   1297 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
   1298   asm volatile (
   1299   "1:                                          \n"
   1300     MEMACCESS(0)
   1301     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels
   1302     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
   1303     MEMACCESS(1)
   1304     "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
   1305     "b.gt       1b                             \n"  // loop while width > 0
   1306   : "+r"(src_argb),   // %0
   1307     "+r"(dst_a),      // %1
   1308     "+r"(width)       // %2
   1309   :
   1310   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   1311   );
   1312 }
   1313 
   1314 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
          // Converts a row of 4-byte pixels to 8-bit luma with no offset added,
          // 8 pixels per iteration:
          //   Y = (15 * B + 75 * G + 38 * R + 64) >> 7
          // ld4 de-interleaves the pixels so v0/v1/v2/v3 hold bytes 0/1/2/3 of each
          // pixel; the fourth byte is loaded but ignored (v3 is reused as the
          // 16-bit accumulator).
          // The loop body runs before the remaining count is tested, so 8 pixels
          // are always processed; width is presumably a positive multiple of 8 --
          // TODO confirm against callers.
   1315   asm volatile (
   1316     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
   1317     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
   1318     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
   1319   "1:                                          \n"
   1320     MEMACCESS(0)
   1321     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   1322     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   1323     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
   1324     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
   1325     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
            // The coefficients sum to 128, so the largest sum is 128 * 255 = 32640,
            // which still fits in the 15 value bits of a signed 16-bit lane.
   1326     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
   1327     MEMACCESS(1)
   1328     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   1329     "b.gt       1b                             \n"  // loop while count > 0
   1330   : "+r"(src_argb),  // %0
   1331     "+r"(dst_y),     // %1
   1332     "+r"(width)        // %2
   1333   :
   1334   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   1335   );
   1336 }
   1337 
   1338 // 8x1 pixels: one U byte and one V byte are produced per source pixel.
        // Per pixel, using 16-bit lanes:
        //   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
        //   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
        // Reads 32 bytes from src_argb and writes 8 bytes to each of dst_u and
        // dst_v per iteration. The loop body runs before the remaining count is
        // tested; width is presumably a positive multiple of 8 -- TODO confirm
        // against callers.
   1339 void ARGBToUV444Row_NEON(const uint8* src_argb,
   1340                          uint8* dst_u,
   1341                          uint8* dst_v,
   1342                          int width) {
   1343   asm volatile (
   1344     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
   1345     "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
   1346     "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
   1347     "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
   1348     "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
   1349     "movi       v29.16b,#0x80                  \n"  // 128.5 (0x8080 per 16-bit lane)
   1350   "1:                                          \n"
   1351     MEMACCESS(0)
   1352     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   1353     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
            // The subtractions may wrap below zero in the unsigned 16-bit lanes
            // (minimum -112 * 255 = -28560); adding 0x8080 (32896) brings every
            // lane back into 0..65535 before the narrowing shift.
   1354     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
   1355     "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
   1356     "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
   1357     "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
   1358 
   1359     "umull      v3.8h, v2.8b, v24.8b           \n"  // R
   1360     "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
   1361     "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
   1362     "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
   1363 
   1364     "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
   1365     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
   1366 
   1367     MEMACCESS(1)
   1368     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
   1369     MEMACCESS(2)
   1370     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
   1371     "b.gt       1b                             \n"  // loop while count > 0
   1372   : "+r"(src_argb),  // %0
   1373     "+r"(dst_u),     // %1
   1374     "+r"(dst_v),     // %2
   1375     "+r"(width)        // %3
   1376   :
   1377   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
   1378     "v24", "v25", "v26", "v27", "v28", "v29"
   1379   );
   1380 }
   1381 
   // Loads the constants used by RGBTOUV into v20-v25: five 16-bit coefficients
   // (v20-v24) and the 0x8080 bias (v25). The coefficients are half of the
   // 112/74/38/18/94 set because the callers feed RGBTOUV channel values that
   // are twice the 2x2 average. Any function using this macro must list
   // v20-v25 in its clobber list.
   1382 #define RGBTOUV_SETUP_REG                                                  \
   1383   "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
   1384   "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
   1385   "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
   1386   "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
   1387   "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
   1388   "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
   1389 
   1390 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
        // RGBTOUV(QB, QG, QR): QB, QG and QR name the 8-lane 16-bit registers that
        // hold the blue, green and red inputs. With the constants in v20-v25
        // (see RGBTOUV_SETUP_REG) it computes, per lane:
        //   U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8  -> v0.8b
        //   V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8  -> v1.8b
        // v3 and v4 are used as the U and V accumulators, so they must not be
        // passed as arguments. v0 and v1 are written only after the last read of
        // the arguments, so the arguments may live in v0-v2.
   1391 #define RGBTOUV(QB, QG, QR)                                                 \
   1392   "mul        v3.8h, " #QB                                                  \
   1393   ",v20.8h          \n" /* B                    */                          \
   1394   "mul        v4.8h, " #QR                                                  \
   1395   ",v20.8h          \n" /* R                    */                          \
   1396   "mls        v3.8h, " #QG                                                  \
   1397   ",v21.8h          \n" /* G                    */                          \
   1398   "mls        v4.8h, " #QG                                                  \
   1399   ",v24.8h          \n" /* G                    */                          \
   1400   "mls        v3.8h, " #QR                                                  \
   1401   ",v22.8h          \n" /* R                    */                          \
   1402   "mls        v4.8h, " #QB                                                  \
   1403   ",v23.8h          \n"                          /* B                    */ \
   1404   "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
   1405   "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
   1406   "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
   1407   "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
   1408 
   1409 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
   1410 // TODO(fbarchard): consider ptrdiff_t for all strides.
   1411 
   1412 void ARGBToUVRow_NEON(const uint8* src_argb,
   1413                       int src_stride_argb,
   1414                       uint8* dst_u,
   1415                       uint8* dst_v,
   1416                       int width) {
          // Produces 2x2-subsampled U and V from two rows of 4-byte pixels: the row
          // at src_argb and the row src_stride_argb bytes after it. Each iteration
          // reads 16 pixels from each row, sums every 2x2 block per channel, and
          // writes 8 bytes to each of dst_u and dst_v. Bytes 0/1/2 of each pixel
          // are treated as B/G/R; byte 3 (v3/v7) is loaded but not used.
          // The loop body runs before the remaining count is tested; width is
          // presumably a positive multiple of 16 -- TODO confirm against callers.
   1417   const uint8* src_argb_1 = src_argb + src_stride_argb;
   1418   asm volatile (
   1419     RGBTOUV_SETUP_REG
   1420   "1:                                          \n"
   1421     MEMACCESS(0)
   1422     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1423     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1424     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1425     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1426 
   1427     MEMACCESS(1)
   1428     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
   1429     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
   1430     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1431     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
   1432 
            // Sum of 4 samples, halved with rounding = twice the 2x2 average; the
            // halved coefficients from RGBTOUV_SETUP_REG compensate.
   1433     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1434     "urshr      v1.8h, v1.8h, #1               \n"
   1435     "urshr      v2.8h, v2.8h, #1               \n"
   1436 
   1437     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows per loop.
   1438     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1439     MEMACCESS(2)
   1440     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1441     MEMACCESS(3)
   1442     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1443     "b.gt       1b                             \n"  // loop while count > 0
   1444   : "+r"(src_argb),  // %0
   1445     "+r"(src_argb_1),  // %1
   1446     "+r"(dst_u),     // %2
   1447     "+r"(dst_v),     // %3
   1448     "+r"(width)        // %4
   1449   :
   1450   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1451     "v20", "v21", "v22", "v23", "v24", "v25"
   1452   );
   1453 }
   1454 
   1455 // TODO(fbarchard): Subsample match C code.
        // Same structure as ARGBToUVRow_NEON (two rows in, 2x2-subsampled U and V
        // out, 16 columns per iteration) but loads its own coefficient set
        // (63/42/21/10/53) into v20-v24 instead of using RGBTOUV_SETUP_REG.
        // The loop body runs before the remaining count is tested; width is
        // presumably a positive multiple of 16 -- TODO confirm against callers.
   1456 void ARGBToUVJRow_NEON(const uint8* src_argb,
   1457                        int src_stride_argb,
   1458                        uint8* dst_u,
   1459                        uint8* dst_v,
   1460                        int width) {
   1461   const uint8* src_argb_1 = src_argb + src_stride_argb;
   1462   asm volatile (
   1463     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
   1464     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
   1465     "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
   1466     "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
   1467     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
   1468     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   1469   "1:                                          \n"
   1470     MEMACCESS(0)
   1471     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1472     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1473     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1474     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1475     MEMACCESS(1)
   1476     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
   1477     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
   1478     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1479     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
   1480 
   1481     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1482     "urshr      v1.8h, v1.8h, #1               \n"
   1483     "urshr      v2.8h, v2.8h, #1               \n"
   1484 
   1485     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows per loop.
   1486     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1487     MEMACCESS(2)
   1488     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1489     MEMACCESS(3)
   1490     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1491     "b.gt       1b                             \n"  // loop while count > 0
   1492   : "+r"(src_argb),  // %0
   1493     "+r"(src_argb_1),  // %1
   1494     "+r"(dst_u),     // %2
   1495     "+r"(dst_v),     // %3
   1496     "+r"(width)        // %4
   1497   :
   1498   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1499     "v20", "v21", "v22", "v23", "v24", "v25"
   1500   );
   1501 }
   1502 
   1503 void BGRAToUVRow_NEON(const uint8* src_bgra,
   1504                       int src_stride_bgra,
   1505                       uint8* dst_u,
   1506                       uint8* dst_v,
   1507                       int width) {
          // Produces 2x2-subsampled U and V from two rows of 4-byte pixels (the row
          // at src_bgra and the row src_stride_bgra bytes after it), 16 columns per
          // iteration, writing 8 bytes to each of dst_u and dst_v.
          // Channel order: bytes 3/2/1 of each pixel are used as B/G/R; byte 0 is
          // loaded but not used (v0 is overwritten by the B sum).
          // The loop body runs before the remaining count is tested; width is
          // presumably a positive multiple of 16 -- TODO confirm against callers.
   1508   const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
   1509   asm volatile (
   1510     RGBTOUV_SETUP_REG
   1511   "1:                                          \n"
   1512     MEMACCESS(0)
   1513     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1514     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
   1515     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
   1516     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
   1517     MEMACCESS(1)
   1518     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
   1519     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
   1520     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
   1521     "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
   1522 
   1523     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1524     "urshr      v1.8h, v3.8h, #1               \n"  // G moves from v3 to v1
   1525     "urshr      v2.8h, v2.8h, #1               \n"
   1526 
   1527     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows per loop.
   1528     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1529     MEMACCESS(2)
   1530     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1531     MEMACCESS(3)
   1532     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1533     "b.gt       1b                             \n"  // loop while count > 0
   1534   : "+r"(src_bgra),  // %0
   1535     "+r"(src_bgra_1),  // %1
   1536     "+r"(dst_u),     // %2
   1537     "+r"(dst_v),     // %3
   1538     "+r"(width)        // %4
   1539   :
   1540   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1541     "v20", "v21", "v22", "v23", "v24", "v25"
   1542   );
   1543 }
   1544 
   1545 void ABGRToUVRow_NEON(const uint8* src_abgr,
   1546                       int src_stride_abgr,
   1547                       uint8* dst_u,
   1548                       uint8* dst_v,
   1549                       int width) {
          // Produces 2x2-subsampled U and V from two rows of 4-byte pixels (the row
          // at src_abgr and the row src_stride_abgr bytes after it), 16 columns per
          // iteration, writing 8 bytes to each of dst_u and dst_v.
          // Channel order: bytes 2/1/0 of each pixel are used as B/G/R; byte 3 is
          // loaded but not used (v3 is overwritten by the B sum).
          // The loop body runs before the remaining count is tested; width is
          // presumably a positive multiple of 16 -- TODO confirm against callers.
   1550   const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
   1551   asm volatile (
   1552     RGBTOUV_SETUP_REG
   1553   "1:                                          \n"
   1554     MEMACCESS(0)
   1555     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1556     "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
   1557     "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1558     "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
   1559     MEMACCESS(1)
   1560     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
   1561     "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
   1562     "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1563     "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
   1564 
            // B is moved out of v3 here because RGBTOUV uses v3 as an accumulator.
   1565     "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
   1566     "urshr      v2.8h, v2.8h, #1               \n"
   1567     "urshr      v1.8h, v1.8h, #1               \n"
   1568 
   1569     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows per loop.
   1570     RGBTOUV(v0.8h, v2.8h, v1.8h)
   1571     MEMACCESS(2)
   1572     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1573     MEMACCESS(3)
   1574     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1575     "b.gt       1b                             \n"  // loop while count > 0
   1576   : "+r"(src_abgr),  // %0
   1577     "+r"(src_abgr_1),  // %1
   1578     "+r"(dst_u),     // %2
   1579     "+r"(dst_v),     // %3
   1580     "+r"(width)        // %4
   1581   :
   1582   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1583     "v20", "v21", "v22", "v23", "v24", "v25"
   1584   );
   1585 }
   1586 
   1587 void RGBAToUVRow_NEON(const uint8* src_rgba,
   1588                       int src_stride_rgba,
   1589                       uint8* dst_u,
   1590                       uint8* dst_v,
   1591                       int width) {
          // Produces 2x2-subsampled U and V from two rows of 4-byte pixels (the row
          // at src_rgba and the row src_stride_rgba bytes after it), 16 columns per
          // iteration, writing 8 bytes to each of dst_u and dst_v.
          // Channel order: bytes 1/2/3 of each pixel are used as B/G/R; byte 0 is
          // loaded but not used (v0 is overwritten by the B sum).
          // The loop body runs before the remaining count is tested; width is
          // presumably a positive multiple of 16 -- TODO confirm against callers.
   1592   const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
   1593   asm volatile (
   1594     RGBTOUV_SETUP_REG
   1595   "1:                                          \n"
   1596     MEMACCESS(0)
   1597     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
   1598     "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
   1599     "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
   1600     "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
   1601     MEMACCESS(1)
   1602     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
   1603     "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
   1604     "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
   1605     "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
   1606 
   1607     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1608     "urshr      v1.8h, v1.8h, #1               \n"
   1609     "urshr      v2.8h, v2.8h, #1               \n"
   1610 
   1611     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows per loop.
   1612     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1613     MEMACCESS(2)
   1614     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1615     MEMACCESS(3)
   1616     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1617     "b.gt       1b                             \n"  // loop while count > 0
   1618   : "+r"(src_rgba),  // %0
   1619     "+r"(src_rgba_1),  // %1
   1620     "+r"(dst_u),     // %2
   1621     "+r"(dst_v),     // %3
   1622     "+r"(width)        // %4
   1623   :
   1624   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1625     "v20", "v21", "v22", "v23", "v24", "v25"
   1626   );
   1627 }
   1628 
   1629 void RGB24ToUVRow_NEON(const uint8* src_rgb24,
   1630                        int src_stride_rgb24,
   1631                        uint8* dst_u,
   1632                        uint8* dst_v,
   1633                        int width) {
          // Produces 2x2-subsampled U and V from two rows of 3-byte pixels (the row
          // at src_rgb24 and the row src_stride_rgb24 bytes after it). Each
          // iteration reads 48 bytes (16 pixels) from each row and writes 8 bytes
          // to each of dst_u and dst_v.
          // Channel order: bytes 0/1/2 of each pixel are used as B/G/R.
          // The loop body runs before the remaining count is tested; width is
          // presumably a positive multiple of 16 -- TODO confirm against callers.
   1634   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   1635   asm volatile (
   1636     RGBTOUV_SETUP_REG
   1637   "1:                                          \n"
   1638     MEMACCESS(0)
   1639     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
   1640     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
   1641     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1642     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
   1643     MEMACCESS(1)
   1644     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
   1645     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
   1646     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1647     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
   1648 
   1649     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
   1650     "urshr      v1.8h, v1.8h, #1               \n"
   1651     "urshr      v2.8h, v2.8h, #1               \n"
   1652 
   1653     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows per loop.
   1654     RGBTOUV(v0.8h, v1.8h, v2.8h)
   1655     MEMACCESS(2)
   1656     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1657     MEMACCESS(3)
   1658     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1659     "b.gt       1b                             \n"  // loop while count > 0
   1660   : "+r"(src_rgb24),  // %0
   1661     "+r"(src_rgb24_1),  // %1
   1662     "+r"(dst_u),     // %2
   1663     "+r"(dst_v),     // %3
   1664     "+r"(width)        // %4
   1665   :
   1666   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1667     "v20", "v21", "v22", "v23", "v24", "v25"
   1668   );
   1669 }
   1670 
   1671 void RAWToUVRow_NEON(const uint8* src_raw,
   1672                      int src_stride_raw,
   1673                      uint8* dst_u,
   1674                      uint8* dst_v,
   1675                      int width) {
          // Produces 2x2-subsampled U and V from two rows of 3-byte pixels (the row
          // at src_raw and the row src_stride_raw bytes after it). Each iteration
          // reads 48 bytes (16 pixels) from each row and writes 8 bytes to each of
          // dst_u and dst_v.
          // Channel order: bytes 2/1/0 of each pixel are used as B/G/R, i.e. the
          // reverse of RGB24ToUVRow_NEON.
          // The loop body runs before the remaining count is tested; width is
          // presumably a positive multiple of 16 -- TODO confirm against callers.
   1676   const uint8* src_raw_1 = src_raw + src_stride_raw;
   1677   asm volatile (
   1678     RGBTOUV_SETUP_REG
   1679   "1:                                          \n"
   1680     MEMACCESS(0)
   1681     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
   1682     "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
   1683     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
   1684     "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
   1685     MEMACCESS(1)
   1686     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
   1687     "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
   1688     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
   1689     "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
   1690 
   1691     "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
   1692     "urshr      v1.8h, v1.8h, #1               \n"
   1693     "urshr      v0.8h, v0.8h, #1               \n"
   1694 
   1695     "subs       %w4, %w4, #16                  \n"  // 16 columns x 2 rows per loop.
   1696     RGBTOUV(v2.8h, v1.8h, v0.8h)
   1697     MEMACCESS(2)
   1698     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1699     MEMACCESS(3)
   1700     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1701     "b.gt       1b                             \n"  // loop while count > 0
   1702   : "+r"(src_raw),  // %0
   1703     "+r"(src_raw_1),  // %1
   1704     "+r"(dst_u),     // %2
   1705     "+r"(dst_v),     // %3
   1706     "+r"(width)        // %4
   1707   :
   1708   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1709     "v20", "v21", "v22", "v23", "v24", "v25"
   1710   );
   1711 }
   1712 
   1713 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
        // Produces 2x2-subsampled U and V from two rows of RGB565 data (the row at
        // src_rgb565 and the row src_stride_rgb565 bytes after it). Each iteration
        // does two 16-byte loads per row, expands each load with RGB565TOARGB
        // (defined outside this section), and writes 8 bytes to each of dst_u and
        // dst_v; width is decremented by 16 per iteration.
        // v20/v21 serve as R accumulators here, so the constants are kept in
        // v22-v27 rather than the v20-v25 used by RGBTOUV_SETUP_REG, and the U/V
        // math is written out inline instead of using RGBTOUV.
        // The loop body runs before the remaining count is tested; width is
        // presumably a positive multiple of 16 -- TODO confirm against callers.
   1714 void RGB565ToUVRow_NEON(const uint8* src_rgb565,
   1715                         int src_stride_rgb565,
   1716                         uint8* dst_u,
   1717                         uint8* dst_v,
   1718                         int width) {
   1719   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
   1720   asm volatile (
   1721     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
   1722     "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
   1723     "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
   1724     "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
   1725     "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
   1726     "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   1727   "1:                                          \n"
   1728     MEMACCESS(0)
   1729     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
   1730     RGB565TOARGB
   1731     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1732     "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1733     "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1734     MEMACCESS(0)
   1735     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
   1736     RGB565TOARGB
   1737     "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1738     "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1739     "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1740 
   1741     MEMACCESS(1)
   1742     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
   1743     RGB565TOARGB
   1744     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1745     "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1746     "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1747     MEMACCESS(1)
   1748     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
   1749     RGB565TOARGB
   1750     "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1751     "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1752     "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1753 
            // Join the two 4-lane halves of each channel into one 8-lane vector.
   1754     "ins        v16.D[1], v17.D[0]             \n"  // B
   1755     "ins        v18.D[1], v19.D[0]             \n"  // G
   1756     "ins        v20.D[1], v21.D[0]             \n"  // R
   1757 
   1758     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
   1759     "urshr      v5.8h, v18.8h, #1              \n"
   1760     "urshr      v6.8h, v20.8h, #1              \n"
   1761 
   1762     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
   1763     "mul        v16.8h, v4.8h, v22.8h          \n"  // B
   1764     "mls        v16.8h, v5.8h, v23.8h          \n"  // G
   1765     "mls        v16.8h, v6.8h, v24.8h          \n"  // R
   1766     "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
   1767     "mul        v17.8h, v6.8h, v22.8h          \n"  // R
   1768     "mls        v17.8h, v5.8h, v26.8h          \n"  // G
   1769     "mls        v17.8h, v4.8h, v25.8h          \n"  // B
   1770     "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
   1771     "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
   1772     "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
   1773     MEMACCESS(2)
   1774     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1775     MEMACCESS(3)
   1776     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1777     "b.gt       1b                             \n"  // loop while count > 0
   1778   : "+r"(src_rgb565),  // %0
   1779     "+r"(src_rgb565_1),  // %1
   1780     "+r"(dst_u),     // %2
   1781     "+r"(dst_v),     // %3
   1782     "+r"(width)        // %4
   1783   :
   1784   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   1785     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
   1786     "v25", "v26", "v27"
   1787   );
   1788 }
   1789 
   1790 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
        // Produces 2x2-subsampled U and V from two rows of ARGB1555 data (the row
        // at src_argb1555 and the row src_stride_argb1555 bytes after it). Each
        // iteration does two 16-byte loads per row, expands each load with
        // RGB555TOARGB (defined outside this section), and writes 8 bytes to each
        // of dst_u and dst_v; width is decremented by 16 per iteration.
        // v16-v18 accumulate B/G/R for the first 8 columns and v26-v28 for the
        // next 8; the U/V math is written out inline with the constants from
        // RGBTOUV_SETUP_REG.
        // The loop body runs before the remaining count is tested; width is
        // presumably a positive multiple of 16 -- TODO confirm against callers.
        // NOTE(review): the clobber list omits v7 (RGB565ToUVRow_NEON lists it);
        // confirm that RGB555TOARGB does not write v7.
   1791 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
   1792                           int src_stride_argb1555,
   1793                           uint8* dst_u,
   1794                           uint8* dst_v,
   1795                           int width) {
   1796   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
   1797   asm volatile (
   1798     RGBTOUV_SETUP_REG
   1799   "1:                                          \n"
   1800     MEMACCESS(0)
   1801     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
   1802     RGB555TOARGB
   1803     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1804     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1805     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1806     MEMACCESS(0)
   1807     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
   1808     RGB555TOARGB
   1809     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1810     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1811     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1812 
   1813     MEMACCESS(1)
   1814     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
   1815     RGB555TOARGB
   1816     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1817     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1818     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1819     MEMACCESS(1)
   1820     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
   1821     RGB555TOARGB
   1822     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1823     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1824     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1825 
            // Join the two 4-lane halves of each channel into one 8-lane vector.
   1826     "ins        v16.D[1], v26.D[0]             \n"  // B
   1827     "ins        v17.D[1], v27.D[0]             \n"  // G
   1828     "ins        v18.D[1], v28.D[0]             \n"  // R
   1829 
   1830     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
   1831     "urshr      v5.8h, v17.8h, #1              \n"
   1832     "urshr      v6.8h, v18.8h, #1              \n"
   1833 
   1834     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
   1835     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
   1836     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
   1837     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
   1838     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
   1839     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
   1840     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
   1841     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
   1842     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
   1843     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
   1844     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
   1845     MEMACCESS(2)
   1846     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1847     MEMACCESS(3)
   1848     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1849     "b.gt       1b                             \n"  // loop while count > 0
   1850   : "+r"(src_argb1555),  // %0
   1851     "+r"(src_argb1555_1),  // %1
   1852     "+r"(dst_u),     // %2
   1853     "+r"(dst_v),     // %3
   1854     "+r"(width)        // %4
   1855   :
   1856   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
   1857     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
   1858     "v26", "v27", "v28"
   1859   );
   1860 }
   1861 
   1862 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
        // Produces 2x2-subsampled U and V from two rows of ARGB4444 data (the row
        // at src_argb4444 and the row src_stride_argb4444 bytes after it). Each
        // iteration does two 16-byte loads per row, expands each load with
        // ARGB4444TOARGB (defined outside this section), and writes 8 bytes to
        // each of dst_u and dst_v; width is decremented by 16 per iteration.
        // v16-v18 accumulate B/G/R for the first 8 columns and v26-v28 for the
        // next 8; the U/V math is written out inline with the constants from
        // RGBTOUV_SETUP_REG.
        // The loop body runs before the remaining count is tested; width is
        // presumably a positive multiple of 16 -- TODO confirm against callers.
        // NOTE(review): the clobber list omits v7 (RGB565ToUVRow_NEON lists it);
        // confirm that ARGB4444TOARGB does not write v7.
   1863 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
   1864                           int src_stride_argb4444,
   1865                           uint8* dst_u,
   1866                           uint8* dst_v,
   1867                           int width) {
   1868   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
   1869   asm volatile (
   1870     RGBTOUV_SETUP_REG
   1871   "1:                                          \n"
   1872     MEMACCESS(0)
   1873     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
   1874     ARGB4444TOARGB
   1875     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1876     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1877     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1878     MEMACCESS(0)
   1879     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
   1880     ARGB4444TOARGB
   1881     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1882     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1883     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1884 
   1885     MEMACCESS(1)
   1886     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
   1887     ARGB4444TOARGB
   1888     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1889     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1890     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1891     MEMACCESS(1)
   1892     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
   1893     ARGB4444TOARGB
   1894     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
   1895     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
   1896     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
   1897 
            // Join the two 4-lane halves of each channel into one 8-lane vector.
   1898     "ins        v16.D[1], v26.D[0]             \n"  // B
   1899     "ins        v17.D[1], v27.D[0]             \n"  // G
   1900     "ins        v18.D[1], v28.D[0]             \n"  // R
   1901 
   1902     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
   1903     "urshr      v5.8h, v17.8h, #1              \n"
   1904     "urshr      v6.8h, v18.8h, #1              \n"
   1905 
   1906     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
   1907     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
   1908     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
   1909     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
   1910     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
   1911     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
   1912     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
   1913     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
   1914     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
   1915     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
   1916     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
   1917     MEMACCESS(2)
   1918     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
   1919     MEMACCESS(3)
   1920     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
   1921     "b.gt       1b                             \n"  // loop while count > 0
   1922   : "+r"(src_argb4444),  // %0
   1923     "+r"(src_argb4444_1),  // %1
   1924     "+r"(dst_u),     // %2
   1925     "+r"(dst_v),     // %3
   1926     "+r"(width)        // %4
   1927   :
   1928   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
   1929     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
   1930     "v26", "v27", "v28"
   1931 
   1932   );
   1933 }
   1934 
   1935 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 16-bit pixel read from src_rgb565,
          // 8 pixels (16 source bytes) per iteration:
          //   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, saturated to 8 bits.
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed and width is effectively rounded up to a
          // multiple of 8.
   1936   asm volatile (
   1937     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
   1938     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
   1939     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
   1940     "movi       v27.8b, #16                    \n"  // Add 16 constant
   1941   "1:                                          \n"
   1942     MEMACCESS(0)
   1943     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
   1944     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
            // RGB565TOARGB is defined elsewhere in this file. The multiplies below
            // read B from v0, G from v1 and R from v2, so the macro presumably
            // leaves the expanded 8-bit channels there - confirm against the macro.
   1945     RGB565TOARGB
   1946     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
   1947     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
   1948     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
            // The sum is at most 255 * (13 + 65 + 33) = 28305, so it is still a
            // positive value when sqrshrun interprets it as signed.
   1949     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
   1950     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
   1951     MEMACCESS(1)
   1952     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   1953     "b.gt       1b                             \n"  // uses flags from subs
   1954   : "+r"(src_rgb565),  // %0
   1955     "+r"(dst_y),       // %1
   1956     "+r"(width)          // %2
   1957   :
            // v4 and v6 are not referenced by the instructions visible here;
            // presumably they are scratch registers of RGB565TOARGB - TODO confirm.
   1958   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
   1959     "v24", "v25", "v26", "v27"
   1960   );
   1961 }
   1962 
   1963 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 16-bit pixel read from
          // src_argb1555, 8 pixels (16 source bytes) per iteration:
          //   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, saturated to 8 bits.
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed.
   1964   asm volatile (
   1965     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   1966     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   1967     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   1968     "movi       v7.8b, #16                     \n"  // Add 16 constant
   1969   "1:                                          \n"
   1970     MEMACCESS(0)
   1971     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
   1972     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
            // ARGB1555TOARGB is defined elsewhere in this file. The code below
            // reads B from v0, G from v1 and R from v2 after it, and relies on the
            // constants in v4-v7 surviving it - confirm against the macro.
   1973     ARGB1555TOARGB
   1974     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
   1975     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
   1976     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
   1977     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
   1978     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
   1979     MEMACCESS(1)
   1980     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   1981     "b.gt       1b                             \n"  // uses flags from subs
   1982   : "+r"(src_argb1555),  // %0
   1983     "+r"(dst_y),         // %1
   1984     "+r"(width)            // %2
   1985   :
   1986   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   1987   );
   1988 }
   1989 
   1990 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 16-bit pixel read from
          // src_argb4444, 8 pixels (16 source bytes) per iteration:
          //   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, saturated to 8 bits.
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed.
   1991   asm volatile (
   1992     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
   1993     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
   1994     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
   1995     "movi       v27.8b, #16                    \n"  // Add 16 constant
   1996   "1:                                          \n"
   1997     MEMACCESS(0)
   1998     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
   1999     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
            // ARGB4444TOARGB is defined elsewhere in this file. The multiplies
            // below read B from v0, G from v1 and R from v2 after it - confirm the
            // register usage against the macro.
   2000     ARGB4444TOARGB
   2001     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
   2002     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
   2003     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
   2004     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
   2005     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
   2006     MEMACCESS(1)
   2007     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2008     "b.gt       1b                             \n"  // uses flags from subs
   2009   : "+r"(src_argb4444),  // %0
   2010     "+r"(dst_y),         // %1
   2011     "+r"(width)            // %2
   2012   :
   2013   : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
   2014   );
   2015 }
   2016 
   2017 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 4-byte pixel read from src_bgra,
          // 8 pixels (32 source bytes) per iteration:
          //   Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16, saturated to 8 bits,
          // where R, G and B are bytes 1, 2 and 3 of each pixel in memory.
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed.
   2018   asm volatile (
   2019     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
   2020     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2021     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
   2022     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2023   "1:                                          \n"
   2024     MEMACCESS(0)
            // ld4 de-interleaves: byte k of every pixel lands in register vk.
            // v0 (byte 0, presumably alpha) is loaded but not used in the sum.
   2025     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
   2026     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2027     "umull      v16.8h, v1.8b, v4.8b           \n"  // R
   2028     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
   2029     "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
   2030     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2031     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
   2032     MEMACCESS(1)
   2033     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2034     "b.gt       1b                             \n"  // uses flags from subs
   2035   : "+r"(src_bgra),  // %0
   2036     "+r"(dst_y),     // %1
   2037     "+r"(width)        // %2
   2038   :
   2039   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2040   );
   2041 }
   2042 
   2043 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 4-byte pixel read from src_abgr,
          // 8 pixels (32 source bytes) per iteration:
          //   Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16, saturated to 8 bits,
          // where R, G and B are bytes 0, 1 and 2 of each pixel in memory.
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed.
   2044   asm volatile (
   2045     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
   2046     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2047     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
   2048     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2049   "1:                                          \n"
   2050     MEMACCESS(0)
            // ld4 de-interleaves: byte k of every pixel lands in register vk.
            // v3 (byte 3, presumably alpha) is loaded but not used in the sum.
   2051     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
   2052     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2053     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
   2054     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
   2055     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
   2056     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2057     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
   2058     MEMACCESS(1)
   2059     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2060     "b.gt       1b                             \n"  // uses flags from subs
   2061   : "+r"(src_abgr),  // %0
   2062     "+r"(dst_y),     // %1
   2063     "+r"(width)        // %2
   2064   :
   2065   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2066   );
   2067 }
   2068 
   2069 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 4-byte pixel read from src_rgba,
          // 8 pixels (32 source bytes) per iteration:
          //   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, saturated to 8 bits,
          // where B, G and R are bytes 1, 2 and 3 of each pixel in memory.
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed.
   2070   asm volatile (
   2071     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   2072     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2073     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   2074     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2075   "1:                                          \n"
   2076     MEMACCESS(0)
            // ld4 de-interleaves: byte k of every pixel lands in register vk.
            // v0 (byte 0, presumably alpha) is loaded but not used in the sum.
   2077     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
   2078     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2079     "umull      v16.8h, v1.8b, v4.8b           \n"  // B
   2080     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
   2081     "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
   2082     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2083     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
   2084     MEMACCESS(1)
   2085     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2086     "b.gt       1b                             \n"  // uses flags from subs
   2087   : "+r"(src_rgba),  // %0
   2088     "+r"(dst_y),     // %1
   2089     "+r"(width)        // %2
   2090   :
   2091   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2092   );
   2093 }
   2094 
   2095 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 3-byte pixel read from src_rgb24,
          // 8 pixels (24 source bytes) per iteration:
          //   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, saturated to 8 bits,
          // where B, G and R are bytes 0, 1 and 2 of each pixel in memory.
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed.
   2096   asm volatile (
   2097     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
   2098     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2099     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
   2100     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2101   "1:                                          \n"
   2102     MEMACCESS(0)
            // ld3 de-interleaves: byte k of every pixel lands in register vk.
   2103     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
   2104     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2105     "umull      v16.8h, v0.8b, v4.8b           \n"  // B
   2106     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
   2107     "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
   2108     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2109     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
   2110     MEMACCESS(1)
   2111     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2112     "b.gt       1b                             \n"  // uses flags from subs
   2113   : "+r"(src_rgb24),  // %0
   2114     "+r"(dst_y),      // %1
   2115     "+r"(width)         // %2
   2116   :
            // v3 is listed although no instruction above references it.
   2117   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2118   );
   2119 }
   2120 
   2121 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
          // Writes one Y byte to dst_y for every 3-byte pixel read from src_raw,
          // 8 pixels (24 source bytes) per iteration:
          //   Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16, saturated to 8 bits,
          // where R, G and B are bytes 0, 1 and 2 of each pixel in memory (the
          // channel order is taken from which coefficient each byte is
          // multiplied by).
          // The loop body runs before the counter is tested, so at least 8 pixels
          // are always processed.
   2122   asm volatile (
   2123     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
   2124     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
   2125     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
   2126     "movi       v7.8b, #16                     \n"  // Add 16 constant
   2127   "1:                                          \n"
   2128     MEMACCESS(0)
            // ld3 de-interleaves: byte k of every pixel lands in register vk.
   2129     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
   2130     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2131     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
   2132     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
   2133     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
   2134     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
   2135     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
   2136     MEMACCESS(1)
   2137     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
   2138     "b.gt       1b                             \n"  // uses flags from subs
   2139   : "+r"(src_raw),  // %0
   2140     "+r"(dst_y),    // %1
   2141     "+r"(width)       // %2
   2142   :
            // v3 is listed although no instruction above references it.
   2143   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   2144   );
   2145 }
   2146 
   2147 // Bilinear filter 16x2 -> 16x1
        // Blends the row at src_ptr with the row at src_ptr + src_stride into
        // dst_ptr, 16 bytes per iteration:
        //   dst = (src0 * (256 - f) + src1 * f + 128) >> 8,  f = source_y_fraction.
        // Two fractions take dedicated loops: f == 0 copies the first row
        // unchanged and f == 128 uses a rounded average of the two rows.
        // Each loop body runs before the counter is tested, so at least 16 bytes
        // are always processed and dst_width is effectively rounded up to a
        // multiple of 16.
        // The fractions are broadcast as bytes, so only their low 8 bits are used;
        // assumes source_y_fraction is in [0, 255] - TODO confirm with callers.
   2148 void InterpolateRow_NEON(uint8* dst_ptr,
   2149                          const uint8* src_ptr,
   2150                          ptrdiff_t src_stride,
   2151                          int dst_width,
   2152                          int source_y_fraction) {
   2153   int y1_fraction = source_y_fraction;
   2154   int y0_fraction = 256 - y1_fraction;
   2155   const uint8* src_ptr1 = src_ptr + src_stride;
   2156   asm volatile (
   2157     "cmp        %w4, #0                        \n"
   2158     "b.eq       100f                           \n"  // f == 0: plain copy
   2159     "cmp        %w4, #128                      \n"
   2160     "b.eq       50f                            \n"  // f == 128: average
   2161 
   2162     "dup        v5.16b, %w4                    \n"  // weight of second row
   2163     "dup        v4.16b, %w5                    \n"  // weight of first row
   2164     // General purpose row blend.
   2165   "1:                                          \n"
   2166     MEMACCESS(1)
   2167     "ld1        {v0.16b}, [%1], #16            \n"
   2168     MEMACCESS(2)
   2169     "ld1        {v1.16b}, [%2], #16            \n"
   2170     "subs       %w3, %w3, #16                  \n"
            // The weights sum to 256, so each 16-bit accumulator holds at most
            // 255 * 256 and cannot overflow.
   2171     "umull      v2.8h, v0.8b,  v4.8b           \n"
   2172     "umull2     v3.8h, v0.16b, v4.16b          \n"
   2173     "umlal      v2.8h, v1.8b,  v5.8b           \n"
   2174     "umlal2     v3.8h, v1.16b, v5.16b          \n"
   2175     "rshrn      v0.8b,  v2.8h, #8              \n"  // rounded >> 8, low half
   2176     "rshrn2     v0.16b, v3.8h, #8              \n"  // rounded >> 8, high half
   2177     MEMACCESS(0)
   2178     "st1        {v0.16b}, [%0], #16            \n"
   2179     "b.gt       1b                             \n"
   2180     "b          99f                            \n"
   2181 
   2182     // Blend 50 / 50.
   2183   "50:                                         \n"
   2184     MEMACCESS(1)
   2185     "ld1        {v0.16b}, [%1], #16            \n"
   2186     MEMACCESS(2)
   2187     "ld1        {v1.16b}, [%2], #16            \n"
   2188     "subs       %w3, %w3, #16                  \n"
   2189     "urhadd     v0.16b, v0.16b, v1.16b         \n"  // (a + b + 1) >> 1
   2190     MEMACCESS(0)
   2191     "st1        {v0.16b}, [%0], #16            \n"
   2192     "b.gt       50b                            \n"
   2193     "b          99f                            \n"
   2194 
   2195     // Blend 100 / 0 - Copy row unchanged.
   2196   "100:                                        \n"
   2197     MEMACCESS(1)
   2198     "ld1        {v0.16b}, [%1], #16            \n"
   2199     "subs       %w3, %w3, #16                  \n"
   2200     MEMACCESS(0)
   2201     "st1        {v0.16b}, [%0], #16            \n"
   2202     "b.gt       100b                           \n"
   2203 
   2204   "99:                                         \n"
   2205   : "+r"(dst_ptr),          // %0
   2206     "+r"(src_ptr),          // %1
   2207     "+r"(src_ptr1),         // %2
   2208     "+r"(dst_width),        // %3
   2209     "+r"(y1_fraction),      // %4
   2210     "+r"(y0_fraction)       // %5
   2211   :
            // v2 is written by the umull/umlal pair in the general blend loop, so
            // it has to be declared here; otherwise the compiler may keep a live
            // value in it across this asm statement.
   2212   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
   2213   );
   2214 }
   2215 
   2216 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
        // Blends src_argb0 over src_argb1 and writes the result to dst_argb.
        // In the formula above s is a channel of src_argb0, d the same channel of
        // src_argb1 and sa the alpha byte (byte 3) of src_argb0. The output alpha
        // is always written as 255.
        // Any width is handled: full groups of 8 pixels go through the first loop
        // and the remaining 0..7 pixels through the one-pixel loop. Nothing is
        // read or written when width <= 0.
   2217 void ARGBBlendRow_NEON(const uint8* src_argb0,
   2218                        const uint8* src_argb1,
   2219                        uint8* dst_argb,
   2220                        int width) {
   2221   asm volatile (
            // The counter is biased by -8 so the 8-pixel loop can test for >= 0.
   2222     "subs       %w3, %w3, #8                   \n"
   2223     "b.lt       89f                            \n"
   2224     // Blend 8 pixels.
   2225   "8:                                          \n"
   2226     MEMACCESS(0)
   2227     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
   2228     MEMACCESS(1)
   2229     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
   2230     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2231     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
   2232     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
   2233     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
   2234     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
   2235     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
   2236     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
   2237     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
   2238     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
   2239     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
   2240     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
   2241     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
   2242     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
   2243     "movi       v3.8b, #255                    \n"  // a = 255
   2244     MEMACCESS(2)
   2245     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2246     "b.ge       8b                             \n"  // uses flags from subs
   2247 
   2248   "89:                                         \n"
            // Re-bias the counter to (remaining pixels - 1) so the one-pixel loop
            // can also test for >= 0.
   2249     "adds       %w3, %w3, #8-1                 \n"
   2250     "b.lt       99f                            \n"
   2251 
   2252     // Blend 1 pixels.
            // Only lane 0 is loaded and stored here; the arithmetic still runs on
            // all 8 lanes and the results of the other lanes are ignored.
   2253   "1:                                          \n"
   2254     MEMACCESS(0)
   2255     "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
   2256     MEMACCESS(1)
   2257     "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
   2258     "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
   2259     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
   2260     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
   2261     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
   2262     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
   2263     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
   2264     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
   2265     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
   2266     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
   2267     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
   2268     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
   2269     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
   2270     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
   2271     "movi       v3.8b, #255                    \n"  // a = 255
   2272     MEMACCESS(2)
   2273     "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
   2274     "b.ge       1b                             \n"  // uses flags from subs
   2275 
   2276   "99:                                         \n"
   2277 
   2278   : "+r"(src_argb0),    // %0
   2279     "+r"(src_argb1),    // %1
   2280     "+r"(dst_argb),     // %2
   2281     "+r"(width)         // %3
   2282   :
   2283   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   2284     "v16", "v17", "v18"
   2285   );
   2286 }
   2287 
   2288 // Attenuate 8 pixels at a time.
        // Multiplies bytes 0..2 of every 4-byte pixel (B, G, R per the comments
        // below) by the alpha byte of the same pixel (byte 3):
        //   c = (c * a + 128) >> 8, saturated to 8 bits.
        // The alpha byte is stored unchanged. The loop body runs before the
        // counter is tested, so at least 8 pixels are always processed.
   2289 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
   2290   asm volatile (
   2291     // Attenuate 8 pixels.
   2292   "1:                                          \n"
   2293     MEMACCESS(0)
   2294     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
   2295     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2296     "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
   2297     "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
   2298     "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
   2299     "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
   2300     "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
   2301     "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
   2302     MEMACCESS(1)
   2303     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
   2304     "b.gt       1b                             \n"  // uses flags from subs
   2305   : "+r"(src_argb),   // %0
   2306     "+r"(dst_argb),   // %1
   2307     "+r"(width)       // %2
   2308   :
   2309   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   2310   );
   2311 }
   2312 
   2313 // Quantize 8 ARGB pixels (32 bytes).
   2314 // dst = (dst * scale >> 16) * interval_size + interval_offset;
        // Works in place on dst_argb: bytes 0..2 of every pixel are quantized and
        // byte 3 (alpha) is written back unchanged. Results are saturated to
        // 8 bits when narrowed.
        // scale, interval_size and interval_offset are broadcast into 16-bit
        // lanes, so only their low 16 bits are used.
        // The loop body runs before the counter is tested, so at least 8 pixels
        // are always processed.
   2315 void ARGBQuantizeRow_NEON(uint8* dst_argb,
   2316                           int scale,
   2317                           int interval_size,
   2318                           int interval_offset,
   2319                           int width) {
   2320   asm volatile (
   2321     "dup        v4.8h, %w2                     \n"
            // sqdmulh returns the high half of the doubled product, so the scale
            // is halved first to get (value * scale) >> 16 overall. The shift also
            // keeps the multiplier non-negative for the signed multiply.
   2322     "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
   2323     "dup        v5.8h, %w3                     \n"  // interval multiply.
   2324     "dup        v6.8h, %w4                     \n"  // interval add
   2325 
   2326     // 8 pixel loop.
   2327   "1:                                          \n"
   2328     MEMACCESS(0)
            // No post-increment on the load: the store below writes the same
            // 32 bytes back and then advances the pointer.
   2329     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
   2330     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
   2331     "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
   2332     "uxtl       v1.8h, v1.8b                   \n"
   2333     "uxtl       v2.8h, v2.8b                   \n"
   2334     "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
   2335     "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
   2336     "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
   2337     "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
   2338     "mul        v1.8h, v1.8h, v5.8h            \n"  // g
   2339     "mul        v2.8h, v2.8h, v5.8h            \n"  // r
   2340     "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
   2341     "add        v1.8h, v1.8h, v6.8h            \n"  // g
   2342     "add        v2.8h, v2.8h, v6.8h            \n"  // r
   2343     "uqxtn      v0.8b, v0.8h                   \n"  // saturate to 8 bits
   2344     "uqxtn      v1.8b, v1.8h                   \n"
   2345     "uqxtn      v2.8b, v2.8h                   \n"
   2346     MEMACCESS(0)
   2347     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
   2348     "b.gt       1b                             \n"  // uses flags from subs
   2349   : "+r"(dst_argb),       // %0
   2350     "+r"(width)           // %1
   2351   : "r"(scale),           // %2
   2352     "r"(interval_size),   // %3
   2353     "r"(interval_offset)  // %4
   2354   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   2355   );
   2356 }
   2357 
   2358 // Shade 8 pixels at a time by specified value.
   2359 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
   2360 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
        // NOTE(review): the two notes above use 32-bit NEON mnemonics; the code
        // below uses the AArch64 sqrdmulh by-element form with v0 as the scalar
        // source.
        // Each byte of value scales the byte at the same position in every pixel:
        // the lowest byte of value scales byte 0, the highest scales byte 3.
        // Results are saturated to 8 bits when narrowed. The loop body runs before
        // the counter is tested, so at least 8 pixels are always processed.
   2361 void ARGBShadeRow_NEON(const uint8* src_argb,
   2362                        uint8* dst_argb,
   2363                        int width,
   2364                        uint32 value) {
   2365   asm volatile (
   2366     "dup        v0.4s, %w3                     \n"  // duplicate scale value.
            // Interleaving v0 with itself doubles every byte, so 16-bit lane k
            // holds byte k of value repeated twice (byte * 0x0101).
   2367     "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
            // Halved because sqrdmulh doubles the product.
   2368     "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
   2369 
   2370     // 8 pixel loop.
   2371   "1:                                          \n"
   2372     MEMACCESS(0)
   2373     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2374     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2375     "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
   2376     "uxtl       v5.8h, v5.8b                   \n"
   2377     "uxtl       v6.8h, v6.8b                   \n"
   2378     "uxtl       v7.8h, v7.8b                   \n"
   2379     "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
   2380     "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
   2381     "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
   2382     "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
   2383     "uqxtn      v4.8b, v4.8h                   \n"  // saturate to 8 bits
   2384     "uqxtn      v5.8b, v5.8h                   \n"
   2385     "uqxtn      v6.8b, v6.8h                   \n"
   2386     "uqxtn      v7.8b, v7.8h                   \n"
   2387     MEMACCESS(1)
   2388     "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
   2389     "b.gt       1b                             \n"  // uses flags from subs
   2390   : "+r"(src_argb),       // %0
   2391     "+r"(dst_argb),       // %1
   2392     "+r"(width)           // %2
   2393   : "r"(value)            // %3
   2394   : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
   2395   );
   2396 }
   2397 
   2398 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
   2399 // Similar to ARGBToYJ but stores ARGB.
   2400 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
        // The gray value is written to bytes 0..2 of every output pixel and byte 3
        // (alpha) is copied from the source. The loop body runs before the counter
        // is tested, so at least 8 pixels are always processed.
   2401 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
   2402   asm volatile (
   2403     "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
   2404     "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
   2405     "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
   2406   "1:                                          \n"
   2407     MEMACCESS(0)
   2408     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2409     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2410     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
   2411     "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
   2412     "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
            // The coefficients sum to 128, so the sum is at most 255 * 128 and
            // fits in 15 bits; the rounded shift by 7 brings it back to 0..255.
   2413     "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
   2414     "orr        v1.8b, v0.8b, v0.8b            \n"  // G
   2415     "orr        v2.8b, v0.8b, v0.8b            \n"  // R
   2416     MEMACCESS(1)
   2417     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
   2418     "b.gt       1b                             \n"  // uses flags from subs
   2419   : "+r"(src_argb),  // %0
   2420     "+r"(dst_argb),  // %1
   2421     "+r"(width)      // %2
   2422   :
   2423   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
   2424   );
   2425 }
   2426 
   2427 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
   2428 //    b = (r * 35 + g * 68 + b * 17) >> 7
   2429 //    g = (r * 45 + g * 88 + b * 22) >> 7
   2430 //    r = (r * 50 + g * 98 + b * 24) >> 7
        // Works in place on dst_argb. The shifts truncate (no rounding) and the
        // results are saturated to 8 bits; byte 3 (alpha) is written back
        // unchanged. The loop body runs before the counter is tested, so at least
        // 8 pixels are always processed.
   2431 
   2432 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
   2433   asm volatile (
   2434     "movi       v20.8b, #17                    \n"  // BB coefficient
   2435     "movi       v21.8b, #68                    \n"  // BG coefficient
   2436     "movi       v22.8b, #35                    \n"  // BR coefficient
   2437     "movi       v24.8b, #22                    \n"  // GB coefficient
   2438     "movi       v25.8b, #88                    \n"  // GG coefficient
   2439     "movi       v26.8b, #45                    \n"  // GR coefficient
   2440     "movi       v28.8b, #24                    \n"  // RB coefficient
   2441     "movi       v29.8b, #98                    \n"  // RG coefficient
   2442     "movi       v30.8b, #50                    \n"  // RR coefficient
   2443   "1:                                          \n"
   2444     MEMACCESS(0)
            // No post-increment on the load: the store below writes the same
            // 32 bytes back and then advances the pointer.
   2445     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
   2446     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
   2447     "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
   2448     "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
   2449     "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
   2450     "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
   2451     "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
   2452     "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
   2453     "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
   2454     "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
   2455     "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
   2456     "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
   2457     "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
   2458     "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
   2459     MEMACCESS(0)
   2460     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
   2461     "b.gt       1b                             \n"  // uses flags from subs
   2462   : "+r"(dst_argb),  // %0
   2463     "+r"(width)      // %1
   2464   :
            // v7 is listed although no instruction above references it.
   2465   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
   2466     "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
   2467   );
   2468 }
   2469 
   2470 // Transform 8 ARGB pixels (32 bytes) with color matrix.
   2471 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
   2472 // needs to saturate.  Consider doing a non-saturating version.
   2473 void ARGBColorMatrixRow_NEON(const uint8* src_argb,
   2474                              uint8* dst_argb,
   2475                              const int8* matrix_argb,
   2476                              int width) {
   2477   asm volatile (
   2478     MEMACCESS(3)
   2479     "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
   2480     "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
   2481     "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
   2482 
   2483   "1:                                          \n"
   2484     MEMACCESS(0)
   2485     "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
   2486     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
   2487     "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
   2488     "uxtl       v17.8h, v17.8b                 \n"  // g
   2489     "uxtl       v18.8h, v18.8b                 \n"  // r
   2490     "uxtl       v19.8h, v19.8b                 \n"  // a
   2491     "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
   2492     "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
   2493     "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
   2494     "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
   2495     "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
   2496     "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
   2497     "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
   2498     "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
   2499     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
   2500     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
   2501     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
   2502     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
   2503     "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
   2504     "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
   2505     "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
   2506     "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
   2507     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
   2508     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
   2509     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
   2510     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
   2511     "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
   2512     "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
   2513     "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
   2514     "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
   2515     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
   2516     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
   2517     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
   2518     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
   2519     "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
   2520     "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
   2521     "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
   2522     "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
   2523     MEMACCESS(1)
   2524     "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
   2525     "b.gt       1b                             \n"
   2526   : "+r"(src_argb),   // %0
   2527     "+r"(dst_argb),   // %1
   2528     "+r"(width)       // %2
   2529   : "r"(matrix_argb)  // %3
   2530   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
   2531     "v18", "v19", "v22", "v23", "v24", "v25"
   2532   );
   2533 }
   2534 
   2535 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
   2536 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
        //
        // For every byte of every pixel: dst = (src0 * src1 + 128) >> 8.
        // The 8x8 bit product is widened to 16 bits (umull) and then narrowed with
        // a rounding shift right by 8 (rshrn).
        //
        // src_argb0, src_argb1: the two source rows; 32 bytes are read from each
        //                       per loop pass.
        // dst_argb:             destination row; 32 bytes are written per pass.
        // width:                pixel count.  The body runs once before the
        //                       count is tested and handles 8 pixels per pass
        //                       (presumably callers pass a positive multiple of
        //                       8 - TODO confirm).
   2537 void ARGBMultiplyRow_NEON(const uint8* src_argb0,
   2538                           const uint8* src_argb1,
   2539                           uint8* dst_argb,
   2540                           int width) {
   2541   asm volatile (
   2542     // 8 pixel loop.
   2543   "1:                                          \n"
   2544     MEMACCESS(0)
   2545     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2546     MEMACCESS(1)
   2547     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
   2548     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2549     "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
   2550     "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
   2551     "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
   2552     "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
   2553     "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
   2554     "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
   2555     "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
   2556     "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
   2557     MEMACCESS(2)
   2558     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2559     "b.gt       1b                             \n"  // loop while width > 0.
   2560
   2561   : "+r"(src_argb0),  // %0
   2562     "+r"(src_argb1),  // %1
   2563     "+r"(dst_argb),   // %2
   2564     "+r"(width)       // %3
   2565   :
   2566   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   2567   );
   2568 }
   2569 
   2570 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
        //
        // Every byte of every pixel is added with unsigned saturation (uqadd), so
        // sums above 255 are clamped to 255.
        //
        // src_argb0, src_argb1: the two source rows; 32 bytes are read from each
        //                       per loop pass.
        // dst_argb:             destination row; 32 bytes are written per pass.
        // width:                pixel count.  The body runs once before the
        //                       count is tested and handles 8 pixels per pass
        //                       (presumably callers pass a positive multiple of
        //                       8 - TODO confirm).
   2571 void ARGBAddRow_NEON(const uint8* src_argb0,
   2572                      const uint8* src_argb1,
   2573                      uint8* dst_argb,
   2574                      int width) {
   2575   asm volatile (
   2576     // 8 pixel loop.
   2577   "1:                                          \n"
   2578     MEMACCESS(0)
   2579     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2580     MEMACCESS(1)
   2581     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
   2582     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2583     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // saturating add, channel 0
   2584     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // channel 1
   2585     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // channel 2
   2586     "uqadd      v3.8b, v3.8b, v7.8b            \n"  // channel 3
   2587     MEMACCESS(2)
   2588     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2589     "b.gt       1b                             \n"  // loop while width > 0.
   2590
   2591   : "+r"(src_argb0),  // %0
   2592     "+r"(src_argb1),  // %1
   2593     "+r"(dst_argb),   // %2
   2594     "+r"(width)       // %3
   2595   :
   2596   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   2597   );
   2598 }
   2599 
   2600 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
        //
        // Every byte is computed as src_argb0 - src_argb1 with unsigned saturation
        // (uqsub), so results below 0 are clamped to 0.
        //
        // src_argb0: minuend row; 32 bytes are read per loop pass.
        // src_argb1: subtrahend row; 32 bytes are read per loop pass.
        // dst_argb:  destination row; 32 bytes are written per loop pass.
        // width:     pixel count.  The body runs once before the count is tested
        //            and handles 8 pixels per pass (presumably callers pass a
        //            positive multiple of 8 - TODO confirm).
   2601 void ARGBSubtractRow_NEON(const uint8* src_argb0,
   2602                           const uint8* src_argb1,
   2603                           uint8* dst_argb,
   2604                           int width) {
   2605   asm volatile (
   2606     // 8 pixel loop.
   2607   "1:                                          \n"
   2608     MEMACCESS(0)
   2609     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
   2610     MEMACCESS(1)
   2611     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
   2612     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2613     "uqsub      v0.8b, v0.8b, v4.8b            \n"  // saturating subtract, channel 0
   2614     "uqsub      v1.8b, v1.8b, v5.8b            \n"  // channel 1
   2615     "uqsub      v2.8b, v2.8b, v6.8b            \n"  // channel 2
   2616     "uqsub      v3.8b, v3.8b, v7.8b            \n"  // channel 3
   2617     MEMACCESS(2)
   2618     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2619     "b.gt       1b                             \n"  // loop while width > 0.
   2620
   2621   : "+r"(src_argb0),  // %0
   2622     "+r"(src_argb1),  // %1
   2623     "+r"(dst_argb),   // %2
   2624     "+r"(width)       // %3
   2625   :
   2626   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   2627   );
   2628 }
   2629 
   2630 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
   2631 // A = 255
   2632 // R = Sobel
   2633 // G = Sobel
   2634 // B = Sobel
        //
        // Sobel is the unsigned saturating sum of the two inputs (clamped to 255).
        //
        // src_sobelx, src_sobely: one byte per pixel; 8 bytes are read from each
        //                         per loop pass.
        // dst_argb:               4 bytes per pixel; 32 bytes are written per pass.
        // width:                  pixel count.  The body runs once before the
        //                         count is tested and handles 8 pixels per pass
        //                         (presumably callers pass a positive multiple
        //                         of 8 - TODO confirm).
   2635 void SobelRow_NEON(const uint8* src_sobelx,
   2636                    const uint8* src_sobely,
   2637                    uint8* dst_argb,
   2638                    int width) {
   2639   asm volatile (
   2640     "movi       v3.8b, #255                    \n"  // alpha
   2641     // 8 pixel loop.
   2642   "1:                                          \n"
   2643     MEMACCESS(0)
   2644     "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
   2645     MEMACCESS(1)
   2646     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
   2647     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2648     "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
   2649     "orr        v1.8b, v0.8b, v0.8b            \n"  // copy sum to 2nd channel
   2650     "orr        v2.8b, v0.8b, v0.8b            \n"  // copy sum to 3rd channel
   2651     MEMACCESS(2)
   2652     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2653     "b.gt       1b                             \n"  // loop while width > 0.
   2654   : "+r"(src_sobelx),  // %0
   2655     "+r"(src_sobely),  // %1
   2656     "+r"(dst_argb),    // %2
   2657     "+r"(width)        // %3
   2658   :
   2659   : "cc", "memory", "v0", "v1", "v2", "v3"
   2660   );
   2661 }
   2662 
   2663 // Adds Sobel X and Sobel Y and stores Sobel into plane.
        //
        // Each output byte is the unsigned saturating sum of the matching input
        // bytes (clamped to 255).
        //
        // src_sobelx, src_sobely: one byte per pixel; 16 bytes are read from each
        //                         per loop pass.
        // dst_y:                  one byte per pixel; 16 bytes are written per
        //                         loop pass.
        // width:                  pixel count.  The body runs once before the
        //                         count is tested and handles 16 pixels per pass
        //                         (presumably callers pass a positive multiple
        //                         of 16 - TODO confirm).
   2664 void SobelToPlaneRow_NEON(const uint8* src_sobelx,
   2665                           const uint8* src_sobely,
   2666                           uint8* dst_y,
   2667                           int width) {
   2668   asm volatile (
   2669     // 16 pixel loop.
   2670   "1:                                          \n"
   2671     MEMACCESS(0)
   2672     "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
   2673     MEMACCESS(1)
   2674     "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
   2675     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
   2676     "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
   2677     MEMACCESS(2)
   2678     "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
   2679     "b.gt       1b                             \n"  // loop while width > 0.
   2680   : "+r"(src_sobelx),  // %0
   2681     "+r"(src_sobely),  // %1
   2682     "+r"(dst_y),       // %2
   2683     "+r"(width)        // %3
   2684   :
   2685   : "cc", "memory", "v0", "v1"
   2686   );
   2687 }
   2688 
   2689 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
   2690 // A = 255
   2691 // R = Sobel X
   2692 // G = Sobel
   2693 // B = Sobel Y
        //
        // Sobel is the unsigned saturating sum of Sobel X and Sobel Y.  The store
        // interleaves v0..v3, so the bytes of each output pixel are, in memory
        // order: Sobel Y, Sobel, Sobel X, 255.
        //
        // src_sobelx, src_sobely: one byte per pixel; 8 bytes are read from each
        //                         per loop pass.
        // dst_argb:               4 bytes per pixel; 32 bytes are written per pass.
        // width:                  pixel count.  The body runs once before the
        //                         count is tested and handles 8 pixels per pass
        //                         (presumably callers pass a positive multiple
        //                         of 8 - TODO confirm).
   2694 void SobelXYRow_NEON(const uint8* src_sobelx,
   2695                      const uint8* src_sobely,
   2696                      uint8* dst_argb,
   2697                      int width) {
   2698   asm volatile (
   2699     "movi       v3.8b, #255                    \n"  // alpha
   2700     // 8 pixel loop.
   2701   "1:                                          \n"
   2702     MEMACCESS(0)
   2703     "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
   2704     MEMACCESS(1)
   2705     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
   2706     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
   2707     "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
   2708     MEMACCESS(2)
   2709     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
   2710     "b.gt       1b                             \n"  // loop while width > 0.
   2711   : "+r"(src_sobelx),  // %0
   2712     "+r"(src_sobely),  // %1
   2713     "+r"(dst_argb),    // %2
   2714     "+r"(width)        // %3
   2715   :
   2716   : "cc", "memory", "v0", "v1", "v2", "v3"
   2717   );
   2718 }
   2719 
   2720 // SobelX as a matrix is
   2721 // -1  0  1
   2722 // -2  0  2
   2723 // -1  0  1
        //
        // For each output i the code computes
        //   |(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|
        // in 16 bit lanes and saturates the result to 0..255.  The sign is the
        // opposite of the matrix above, which the absolute value makes irrelevant.
        //
        // src_y0, src_y1, src_y2: three input rows.  Each pass reads bytes
        //                         [i .. i+9] of every row, i.e. 2 bytes beyond
        //                         the 8 outputs it produces.
        // dst_sobelx:             output row; 8 bytes are written per loop pass.
        // width:                  output count.  The body runs once before the
        //                         count is tested and produces 8 outputs per pass
        //                         (presumably callers pass a positive multiple
        //                         of 8 - TODO confirm).
        //
        // Every row pointer is post-incremented by 2 (%5) after the first load
        // and by 6 (%6) after the second, for a net step of 8 per pass.
   2724 void SobelXRow_NEON(const uint8* src_y0,
   2725                     const uint8* src_y1,
   2726                     const uint8* src_y2,
   2727                     uint8* dst_sobelx,
   2728                     int width) {
   2729   asm volatile (
   2730   "1:                                          \n"
   2731     MEMACCESS(0)
   2732     "ld1        {v0.8b}, [%0],%5               \n"  // top
   2733     MEMACCESS(0)
   2734     "ld1        {v1.8b}, [%0],%6               \n"  // top, 2 columns right
   2735     "usubl      v0.8h, v0.8b, v1.8b            \n"  // widening difference
   2736     MEMACCESS(1)
   2737     "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
   2738     MEMACCESS(1)
   2739     "ld1        {v3.8b}, [%1],%6               \n"  // center, 2 columns right
   2740     "usubl      v1.8h, v2.8b, v3.8b            \n"
   2741     "add        v0.8h, v0.8h, v1.8h            \n"  // added twice for the
   2742     "add        v0.8h, v0.8h, v1.8h            \n"  // weight of 2
   2743     MEMACCESS(2)
   2744     "ld1        {v2.8b}, [%2],%5               \n"  // bottom
   2745     MEMACCESS(2)
   2746     "ld1        {v3.8b}, [%2],%6               \n"  // bottom, 2 columns right
   2747     "subs       %w4, %w4, #8                   \n"  // 8 pixels
   2748     "usubl      v1.8h, v2.8b, v3.8b            \n"
   2749     "add        v0.8h, v0.8h, v1.8h            \n"
   2750     "abs        v0.8h, v0.8h                   \n"  // magnitude
   2751     "uqxtn      v0.8b, v0.8h                   \n"  // saturate to 8 bit
   2752     MEMACCESS(3)
   2753     "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
   2754     "b.gt       1b                             \n"  // loop while width > 0.
   2755   : "+r"(src_y0),      // %0
   2756     "+r"(src_y1),      // %1
   2757     "+r"(src_y2),      // %2
   2758     "+r"(dst_sobelx),  // %3
   2759     "+r"(width)        // %4
   2760   : "r"(2LL),          // %5
   2761     "r"(6LL)           // %6
   2762   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   2763   );
   2764 }
   2765 
   2766 // SobelY as a matrix is
   2767 // -1 -2 -1
   2768 //  0  0  0
   2769 //  1  2  1
        //
        // For each output i the code computes
        //   |(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])|
        // in 16 bit lanes and saturates the result to 0..255.
        //
        // src_y0, src_y1: the two input rows.  Each pass reads bytes [i .. i+9] of
        //                 both rows, i.e. 2 bytes beyond the 8 outputs it produces.
        // dst_sobely:     output row; 8 bytes are written per loop pass.
        // width:          output count.  The body runs once before the count is
        //                 tested and produces 8 outputs per pass (presumably
        //                 callers pass a positive multiple of 8 - TODO confirm).
        //
        // Both row pointers are post-incremented by 1 (%4) after the first and
        // second loads and by 6 (%5) after the third, for a net step of 8 per pass.
   2770 void SobelYRow_NEON(const uint8* src_y0,
   2771                     const uint8* src_y1,
   2772                     uint8* dst_sobely,
   2773                     int width) {
   2774   asm volatile (
   2775   "1:                                          \n"
   2776     MEMACCESS(0)
   2777     "ld1        {v0.8b}, [%0],%4               \n"  // left
   2778     MEMACCESS(1)
   2779     "ld1        {v1.8b}, [%1],%4               \n"
   2780     "usubl      v0.8h, v0.8b, v1.8b            \n"  // widening difference
   2781     MEMACCESS(0)
   2782     "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
   2783     MEMACCESS(1)
   2784     "ld1        {v3.8b}, [%1],%4               \n"
   2785     "usubl      v1.8h, v2.8b, v3.8b            \n"
   2786     "add        v0.8h, v0.8h, v1.8h            \n"  // added twice for the
   2787     "add        v0.8h, v0.8h, v1.8h            \n"  // weight of 2
   2788     MEMACCESS(0)
   2789     "ld1        {v2.8b}, [%0],%5               \n"  // right
   2790     MEMACCESS(1)
   2791     "ld1        {v3.8b}, [%1],%5               \n"
   2792     "subs       %w3, %w3, #8                   \n"  // 8 pixels
   2793     "usubl      v1.8h, v2.8b, v3.8b            \n"
   2794     "add        v0.8h, v0.8h, v1.8h            \n"
   2795     "abs        v0.8h, v0.8h                   \n"  // magnitude
   2796     "uqxtn      v0.8b, v0.8h                   \n"  // saturate to 8 bit
   2797     MEMACCESS(2)
   2798     "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
   2799     "b.gt       1b                             \n"  // loop while width > 0.
   2800   : "+r"(src_y0),      // %0
   2801     "+r"(src_y1),      // %1
   2802     "+r"(dst_sobely),  // %2
   2803     "+r"(width)        // %3
   2804   : "r"(1LL),          // %4
   2805     "r"(6LL)           // %5
   2806   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   2807   );
   2808 }
   2809 
   2810 // Caveat - rounds float to half float whereas scaling version truncates.
        //
        // Converts 16 bit unsigned integers to half floats without scaling: each
        // value is widened to 32 bits, converted to float (scvtf) and narrowed to
        // half precision (fcvtn).
        //
        // src:   input values; 8 are read (16 bytes) per loop pass.
        // dst:   output half floats stored as uint16; 8 are written per loop pass.
        // float: unnamed and unused scale parameter.
        // width: element count.  The body runs once before the count is tested
        //        and handles 8 elements per pass (presumably callers pass a
        //        positive multiple of 8 - TODO confirm).
   2811 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
   2812   asm volatile (
   2813   "1:                                          \n"
   2814     MEMACCESS(0)
   2815     "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
   2816     "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
   2817     "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
   2818     "uxtl2      v3.4s, v1.8h                   \n"
   2819     "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
   2820     "scvtf      v3.4s, v3.4s                   \n"
   2821     "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
   2822     "fcvtn2     v1.8h, v3.4s                   \n"
   2823    MEMACCESS(1)
   2824     "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
   2825     "b.gt       1b                             \n"  // loop while width > 0.
   2826   : "+r"(src),    // %0
   2827     "+r"(dst),    // %1
   2828     "+r"(width)   // %2
   2829   :
   2830   : "cc", "memory", "v1", "v2", "v3"
   2831   );
   2832 }
   2833 
   2834 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
        // Converts 16 bit unsigned integers to half floats, multiplying by scale.
        //
        // The conversion is done on the float bit pattern instead of with fcvtn:
        // the value is multiplied by scale * 1.9259299444e-34f (the constant is
        // 2^-112, which appears to be the gap between the float exponent bias of
        // 127 and the half float bias of 15), and the 32 bit pattern is then
        // shifted right by 13 and narrowed with unsigned saturation to 16 bits.
        // The shift discards the low mantissa bits rather than rounding them.
        //
        // src:   input values; 8 are read (16 bytes) per loop pass.
        // dst:   output half floats stored as uint16; 8 are written per loop pass.
        // scale: multiplier applied to every value before conversion.
        // width: element count.  The body runs once before the count is tested
        //        and handles 8 elements per pass (presumably callers pass a
        //        positive multiple of 8 - TODO confirm).
   2835   asm volatile (
   2836   "1:                                          \n"
   2837     MEMACCESS(0)
   2838     "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
   2839     "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
   2840     "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
   2841     "uxtl2      v3.4s, v1.8h                   \n"
   2842     "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
   2843     "scvtf      v3.4s, v3.4s                   \n"
   2844     "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
   2845     "fmul       v3.4s, v3.4s, %3.s[0]          \n"
   2846     "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
   2847     "uqshrn2    v1.8h, v3.4s, #13              \n"
   2848    MEMACCESS(1)
   2849     "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
   2850     "b.gt       1b                             \n"  // loop while width > 0.
   2851   : "+r"(src),    // %0
   2852     "+r"(dst),    // %1
   2853     "+r"(width)   // %2
   2854   : "w"(scale * 1.9259299444e-34f)    // %3
   2855   : "cc", "memory", "v1", "v2", "v3"
   2856   );
   2857 }
   2858 
   2859 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
   2860 
   2861 #ifdef __cplusplus
   2862 }  // extern "C"
   2863 }  // namespace libyuv
   2864 #endif
   2865