Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/rotate_row.h"
     12 #include "libyuv/row.h"
     13 
     14 #include "libyuv/basic_types.h"
     15 
     16 #ifdef __cplusplus
     17 namespace libyuv {
     18 extern "C" {
     19 #endif
     20 
     21 // This module is for GCC Neon armv8 64 bit.
     22 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     23 
     24 static uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
     25                                   2, 6, 10, 14, 3, 7, 11, 15};
     26 
     27 void TransposeWx8_NEON(const uint8* src,
     28                        int src_stride,
     29                        uint8* dst,
     30                        int dst_stride,
     31                        int width) {
     32   const uint8* src_temp;
     33   int64 width64 = (int64)width;  // Work around clang 3.4 warning.
     34   asm volatile (
     35     // loops are on blocks of 8. loop will stop when
     36     // counter gets to or below 0. starting the counter
     37     // at w-8 allow for this
     38     "sub         %3, %3, #8                      \n"
     39 
     40     // handle 8x8 blocks. this should be the majority of the plane
     41     "1:                                          \n"
     42       "mov         %0, %1                        \n"
     43 
     44       MEMACCESS(0)
     45       "ld1        {v0.8b}, [%0], %5              \n"
     46       MEMACCESS(0)
     47       "ld1        {v1.8b}, [%0], %5              \n"
     48       MEMACCESS(0)
     49       "ld1        {v2.8b}, [%0], %5              \n"
     50       MEMACCESS(0)
     51       "ld1        {v3.8b}, [%0], %5              \n"
     52       MEMACCESS(0)
     53       "ld1        {v4.8b}, [%0], %5              \n"
     54       MEMACCESS(0)
     55       "ld1        {v5.8b}, [%0], %5              \n"
     56       MEMACCESS(0)
     57       "ld1        {v6.8b}, [%0], %5              \n"
     58       MEMACCESS(0)
     59       "ld1        {v7.8b}, [%0]                  \n"
     60 
     61       "trn2     v16.8b, v0.8b, v1.8b             \n"
     62       "trn1     v17.8b, v0.8b, v1.8b             \n"
     63       "trn2     v18.8b, v2.8b, v3.8b             \n"
     64       "trn1     v19.8b, v2.8b, v3.8b             \n"
     65       "trn2     v20.8b, v4.8b, v5.8b             \n"
     66       "trn1     v21.8b, v4.8b, v5.8b             \n"
     67       "trn2     v22.8b, v6.8b, v7.8b             \n"
     68       "trn1     v23.8b, v6.8b, v7.8b             \n"
     69 
     70       "trn2     v3.4h, v17.4h, v19.4h            \n"
     71       "trn1     v1.4h, v17.4h, v19.4h            \n"
     72       "trn2     v2.4h, v16.4h, v18.4h            \n"
     73       "trn1     v0.4h, v16.4h, v18.4h            \n"
     74       "trn2     v7.4h, v21.4h, v23.4h            \n"
     75       "trn1     v5.4h, v21.4h, v23.4h            \n"
     76       "trn2     v6.4h, v20.4h, v22.4h            \n"
     77       "trn1     v4.4h, v20.4h, v22.4h            \n"
     78 
     79       "trn2     v21.2s, v1.2s, v5.2s             \n"
     80       "trn1     v17.2s, v1.2s, v5.2s             \n"
     81       "trn2     v20.2s, v0.2s, v4.2s             \n"
     82       "trn1     v16.2s, v0.2s, v4.2s             \n"
     83       "trn2     v23.2s, v3.2s, v7.2s             \n"
     84       "trn1     v19.2s, v3.2s, v7.2s             \n"
     85       "trn2     v22.2s, v2.2s, v6.2s             \n"
     86       "trn1     v18.2s, v2.2s, v6.2s             \n"
     87 
     88       "mov         %0, %2                        \n"
     89 
     90     MEMACCESS(0)
     91       "st1      {v17.8b}, [%0], %6               \n"
     92     MEMACCESS(0)
     93       "st1      {v16.8b}, [%0], %6               \n"
     94     MEMACCESS(0)
     95       "st1      {v19.8b}, [%0], %6               \n"
     96     MEMACCESS(0)
     97       "st1      {v18.8b}, [%0], %6               \n"
     98     MEMACCESS(0)
     99       "st1      {v21.8b}, [%0], %6               \n"
    100     MEMACCESS(0)
    101       "st1      {v20.8b}, [%0], %6               \n"
    102     MEMACCESS(0)
    103       "st1      {v23.8b}, [%0], %6               \n"
    104     MEMACCESS(0)
    105       "st1      {v22.8b}, [%0]                   \n"
    106 
    107       "add         %1, %1, #8                    \n"  // src += 8
    108       "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
    109       "subs        %3, %3, #8                    \n"  // w   -= 8
    110       "b.ge        1b                            \n"
    111 
    112     // add 8 back to counter. if the result is 0 there are
    113     // no residuals.
    114     "adds        %3, %3, #8                      \n"
    115     "b.eq        4f                              \n"
    116 
    117     // some residual, so between 1 and 7 lines left to transpose
    118     "cmp         %3, #2                          \n"
    119     "b.lt        3f                              \n"
    120 
    121     "cmp         %3, #4                          \n"
    122     "b.lt        2f                              \n"
    123 
    124     // 4x8 block
    125     "mov         %0, %1                          \n"
    126     MEMACCESS(0)
    127     "ld1     {v0.s}[0], [%0], %5                 \n"
    128     MEMACCESS(0)
    129     "ld1     {v0.s}[1], [%0], %5                 \n"
    130     MEMACCESS(0)
    131     "ld1     {v0.s}[2], [%0], %5                 \n"
    132     MEMACCESS(0)
    133     "ld1     {v0.s}[3], [%0], %5                 \n"
    134     MEMACCESS(0)
    135     "ld1     {v1.s}[0], [%0], %5                 \n"
    136     MEMACCESS(0)
    137     "ld1     {v1.s}[1], [%0], %5                 \n"
    138     MEMACCESS(0)
    139     "ld1     {v1.s}[2], [%0], %5                 \n"
    140     MEMACCESS(0)
    141     "ld1     {v1.s}[3], [%0]                     \n"
    142 
    143     "mov         %0, %2                          \n"
    144 
    145     MEMACCESS(4)
    146     "ld1      {v2.16b}, [%4]                     \n"
    147 
    148     "tbl      v3.16b, {v0.16b}, v2.16b           \n"
    149     "tbl      v0.16b, {v1.16b}, v2.16b           \n"
    150 
    151     // TODO(frkoenig): Rework shuffle above to
    152     // write out with 4 instead of 8 writes.
    153     MEMACCESS(0)
    154     "st1 {v3.s}[0], [%0], %6                     \n"
    155     MEMACCESS(0)
    156     "st1 {v3.s}[1], [%0], %6                     \n"
    157     MEMACCESS(0)
    158     "st1 {v3.s}[2], [%0], %6                     \n"
    159     MEMACCESS(0)
    160     "st1 {v3.s}[3], [%0]                         \n"
    161 
    162     "add         %0, %2, #4                      \n"
    163     MEMACCESS(0)
    164     "st1 {v0.s}[0], [%0], %6                     \n"
    165     MEMACCESS(0)
    166     "st1 {v0.s}[1], [%0], %6                     \n"
    167     MEMACCESS(0)
    168     "st1 {v0.s}[2], [%0], %6                     \n"
    169     MEMACCESS(0)
    170     "st1 {v0.s}[3], [%0]                         \n"
    171 
    172     "add         %1, %1, #4                      \n"  // src += 4
    173     "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
    174     "subs        %3, %3, #4                      \n"  // w   -= 4
    175     "b.eq        4f                              \n"
    176 
    177     // some residual, check to see if it includes a 2x8 block,
    178     // or less
    179     "cmp         %3, #2                          \n"
    180     "b.lt        3f                              \n"
    181 
    182     // 2x8 block
    183     "2:                                          \n"
    184     "mov         %0, %1                          \n"
    185     MEMACCESS(0)
    186     "ld1     {v0.h}[0], [%0], %5                 \n"
    187     MEMACCESS(0)
    188     "ld1     {v1.h}[0], [%0], %5                 \n"
    189     MEMACCESS(0)
    190     "ld1     {v0.h}[1], [%0], %5                 \n"
    191     MEMACCESS(0)
    192     "ld1     {v1.h}[1], [%0], %5                 \n"
    193     MEMACCESS(0)
    194     "ld1     {v0.h}[2], [%0], %5                 \n"
    195     MEMACCESS(0)
    196     "ld1     {v1.h}[2], [%0], %5                 \n"
    197     MEMACCESS(0)
    198     "ld1     {v0.h}[3], [%0], %5                 \n"
    199     MEMACCESS(0)
    200     "ld1     {v1.h}[3], [%0]                     \n"
    201 
    202     "trn2    v2.8b, v0.8b, v1.8b                 \n"
    203     "trn1    v3.8b, v0.8b, v1.8b                 \n"
    204 
    205     "mov         %0, %2                          \n"
    206 
    207     MEMACCESS(0)
    208     "st1     {v3.8b}, [%0], %6                   \n"
    209     MEMACCESS(0)
    210     "st1     {v2.8b}, [%0]                       \n"
    211 
    212     "add         %1, %1, #2                      \n"  // src += 2
    213     "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
    214     "subs        %3, %3,  #2                     \n"  // w   -= 2
    215     "b.eq        4f                              \n"
    216 
    217     // 1x8 block
    218     "3:                                          \n"
    219     MEMACCESS(1)
    220     "ld1         {v0.b}[0], [%1], %5             \n"
    221     MEMACCESS(1)
    222     "ld1         {v0.b}[1], [%1], %5             \n"
    223     MEMACCESS(1)
    224     "ld1         {v0.b}[2], [%1], %5             \n"
    225     MEMACCESS(1)
    226     "ld1         {v0.b}[3], [%1], %5             \n"
    227     MEMACCESS(1)
    228     "ld1         {v0.b}[4], [%1], %5             \n"
    229     MEMACCESS(1)
    230     "ld1         {v0.b}[5], [%1], %5             \n"
    231     MEMACCESS(1)
    232     "ld1         {v0.b}[6], [%1], %5             \n"
    233     MEMACCESS(1)
    234     "ld1         {v0.b}[7], [%1]                 \n"
    235 
    236     MEMACCESS(2)
    237     "st1         {v0.8b}, [%2]                   \n"
    238 
    239     "4:                                          \n"
    240 
    241     : "=&r"(src_temp),                            // %0
    242       "+r"(src),                                  // %1
    243       "+r"(dst),                                  // %2
    244       "+r"(width64)                               // %3
    245     : "r"(&kVTbl4x4Transpose),                    // %4
    246       "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
    247       "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
    248     : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
    249       "v17", "v18", "v19", "v20", "v21", "v22", "v23"
    250   );
    251 }
    252 
    253 static uint8 kVTbl4x4TransposeDi[32] = {
    254     0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
    255     1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
    256 
    257 void TransposeUVWx8_NEON(const uint8* src,
    258                          int src_stride,
    259                          uint8* dst_a,
    260                          int dst_stride_a,
    261                          uint8* dst_b,
    262                          int dst_stride_b,
    263                          int width) {
    264   const uint8* src_temp;
    265   int64 width64 = (int64)width;  // Work around clang 3.4 warning.
    266   asm volatile (
    267     // loops are on blocks of 8. loop will stop when
    268     // counter gets to or below 0. starting the counter
    269     // at w-8 allow for this
    270     "sub       %4, %4, #8                      \n"
    271 
    272     // handle 8x8 blocks. this should be the majority of the plane
    273     "1:                                        \n"
    274     "mov       %0, %1                          \n"
    275 
    276     MEMACCESS(0)
    277     "ld1       {v0.16b}, [%0], %5              \n"
    278     MEMACCESS(0)
    279     "ld1       {v1.16b}, [%0], %5              \n"
    280     MEMACCESS(0)
    281     "ld1       {v2.16b}, [%0], %5              \n"
    282     MEMACCESS(0)
    283     "ld1       {v3.16b}, [%0], %5              \n"
    284     MEMACCESS(0)
    285     "ld1       {v4.16b}, [%0], %5              \n"
    286     MEMACCESS(0)
    287     "ld1       {v5.16b}, [%0], %5              \n"
    288     MEMACCESS(0)
    289     "ld1       {v6.16b}, [%0], %5              \n"
    290     MEMACCESS(0)
    291     "ld1       {v7.16b}, [%0]                  \n"
    292 
    293     "trn1      v16.16b, v0.16b, v1.16b         \n"
    294     "trn2      v17.16b, v0.16b, v1.16b         \n"
    295     "trn1      v18.16b, v2.16b, v3.16b         \n"
    296     "trn2      v19.16b, v2.16b, v3.16b         \n"
    297     "trn1      v20.16b, v4.16b, v5.16b         \n"
    298     "trn2      v21.16b, v4.16b, v5.16b         \n"
    299     "trn1      v22.16b, v6.16b, v7.16b         \n"
    300     "trn2      v23.16b, v6.16b, v7.16b         \n"
    301 
    302     "trn1      v0.8h, v16.8h, v18.8h           \n"
    303     "trn2      v1.8h, v16.8h, v18.8h           \n"
    304     "trn1      v2.8h, v20.8h, v22.8h           \n"
    305     "trn2      v3.8h, v20.8h, v22.8h           \n"
    306     "trn1      v4.8h, v17.8h, v19.8h           \n"
    307     "trn2      v5.8h, v17.8h, v19.8h           \n"
    308     "trn1      v6.8h, v21.8h, v23.8h           \n"
    309     "trn2      v7.8h, v21.8h, v23.8h           \n"
    310 
    311     "trn1      v16.4s, v0.4s, v2.4s            \n"
    312     "trn2      v17.4s, v0.4s, v2.4s            \n"
    313     "trn1      v18.4s, v1.4s, v3.4s            \n"
    314     "trn2      v19.4s, v1.4s, v3.4s            \n"
    315     "trn1      v20.4s, v4.4s, v6.4s            \n"
    316     "trn2      v21.4s, v4.4s, v6.4s            \n"
    317     "trn1      v22.4s, v5.4s, v7.4s            \n"
    318     "trn2      v23.4s, v5.4s, v7.4s            \n"
    319 
    320     "mov       %0, %2                          \n"
    321 
    322     MEMACCESS(0)
    323     "st1       {v16.d}[0], [%0], %6            \n"
    324     MEMACCESS(0)
    325     "st1       {v18.d}[0], [%0], %6            \n"
    326     MEMACCESS(0)
    327     "st1       {v17.d}[0], [%0], %6            \n"
    328     MEMACCESS(0)
    329     "st1       {v19.d}[0], [%0], %6            \n"
    330     MEMACCESS(0)
    331     "st1       {v16.d}[1], [%0], %6            \n"
    332     MEMACCESS(0)
    333     "st1       {v18.d}[1], [%0], %6            \n"
    334     MEMACCESS(0)
    335     "st1       {v17.d}[1], [%0], %6            \n"
    336     MEMACCESS(0)
    337     "st1       {v19.d}[1], [%0]                \n"
    338 
    339     "mov       %0, %3                          \n"
    340 
    341     MEMACCESS(0)
    342     "st1       {v20.d}[0], [%0], %7            \n"
    343     MEMACCESS(0)
    344     "st1       {v22.d}[0], [%0], %7            \n"
    345     MEMACCESS(0)
    346     "st1       {v21.d}[0], [%0], %7            \n"
    347     MEMACCESS(0)
    348     "st1       {v23.d}[0], [%0], %7            \n"
    349     MEMACCESS(0)
    350     "st1       {v20.d}[1], [%0], %7            \n"
    351     MEMACCESS(0)
    352     "st1       {v22.d}[1], [%0], %7            \n"
    353     MEMACCESS(0)
    354     "st1       {v21.d}[1], [%0], %7            \n"
    355     MEMACCESS(0)
    356     "st1       {v23.d}[1], [%0]                \n"
    357 
    358     "add       %1, %1, #16                     \n"  // src   += 8*2
    359     "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
    360     "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
    361     "subs      %4, %4,  #8                     \n"  // w     -= 8
    362     "b.ge      1b                              \n"
    363 
    364     // add 8 back to counter. if the result is 0 there are
    365     // no residuals.
    366     "adds      %4, %4, #8                      \n"
    367     "b.eq      4f                              \n"
    368 
    369     // some residual, so between 1 and 7 lines left to transpose
    370     "cmp       %4, #2                          \n"
    371     "b.lt      3f                              \n"
    372 
    373     "cmp       %4, #4                          \n"
    374     "b.lt      2f                              \n"
    375 
    376     // TODO(frkoenig): Clean this up
    377     // 4x8 block
    378     "mov       %0, %1                          \n"
    379     MEMACCESS(0)
    380     "ld1       {v0.8b}, [%0], %5               \n"
    381     MEMACCESS(0)
    382     "ld1       {v1.8b}, [%0], %5               \n"
    383     MEMACCESS(0)
    384     "ld1       {v2.8b}, [%0], %5               \n"
    385     MEMACCESS(0)
    386     "ld1       {v3.8b}, [%0], %5               \n"
    387     MEMACCESS(0)
    388     "ld1       {v4.8b}, [%0], %5               \n"
    389     MEMACCESS(0)
    390     "ld1       {v5.8b}, [%0], %5               \n"
    391     MEMACCESS(0)
    392     "ld1       {v6.8b}, [%0], %5               \n"
    393     MEMACCESS(0)
    394     "ld1       {v7.8b}, [%0]                   \n"
    395 
    396     MEMACCESS(8)
    397     "ld1       {v30.16b}, [%8], #16            \n"
    398     "ld1       {v31.16b}, [%8]                 \n"
    399 
    400     "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
    401     "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
    402     "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
    403     "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
    404 
    405     "mov       %0, %2                          \n"
    406 
    407     MEMACCESS(0)
    408     "st1       {v16.s}[0],  [%0], %6           \n"
    409     MEMACCESS(0)
    410     "st1       {v16.s}[1],  [%0], %6           \n"
    411     MEMACCESS(0)
    412     "st1       {v16.s}[2],  [%0], %6           \n"
    413     MEMACCESS(0)
    414     "st1       {v16.s}[3],  [%0], %6           \n"
    415 
    416     "add       %0, %2, #4                      \n"
    417     MEMACCESS(0)
    418     "st1       {v18.s}[0], [%0], %6            \n"
    419     MEMACCESS(0)
    420     "st1       {v18.s}[1], [%0], %6            \n"
    421     MEMACCESS(0)
    422     "st1       {v18.s}[2], [%0], %6            \n"
    423     MEMACCESS(0)
    424     "st1       {v18.s}[3], [%0]                \n"
    425 
    426     "mov       %0, %3                          \n"
    427 
    428     MEMACCESS(0)
    429     "st1       {v17.s}[0], [%0], %7            \n"
    430     MEMACCESS(0)
    431     "st1       {v17.s}[1], [%0], %7            \n"
    432     MEMACCESS(0)
    433     "st1       {v17.s}[2], [%0], %7            \n"
    434     MEMACCESS(0)
    435     "st1       {v17.s}[3], [%0], %7            \n"
    436 
    437     "add       %0, %3, #4                      \n"
    438     MEMACCESS(0)
    439     "st1       {v19.s}[0],  [%0], %7           \n"
    440     MEMACCESS(0)
    441     "st1       {v19.s}[1],  [%0], %7           \n"
    442     MEMACCESS(0)
    443     "st1       {v19.s}[2],  [%0], %7           \n"
    444     MEMACCESS(0)
    445     "st1       {v19.s}[3],  [%0]               \n"
    446 
    447     "add       %1, %1, #8                      \n"  // src   += 4 * 2
    448     "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
    449     "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
    450     "subs      %4,  %4,  #4                    \n"  // w     -= 4
    451     "b.eq      4f                              \n"
    452 
    453     // some residual, check to see if it includes a 2x8 block,
    454     // or less
    455     "cmp       %4, #2                          \n"
    456     "b.lt      3f                              \n"
    457 
    458     // 2x8 block
    459     "2:                                        \n"
    460     "mov       %0, %1                          \n"
    461     MEMACCESS(0)
    462     "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
    463     MEMACCESS(0)
    464     "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
    465     MEMACCESS(0)
    466     "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
    467     MEMACCESS(0)
    468     "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
    469     MEMACCESS(0)
    470     "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
    471     MEMACCESS(0)
    472     "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
    473     MEMACCESS(0)
    474     "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
    475     MEMACCESS(0)
    476     "ld2       {v2.h, v3.h}[3], [%0]           \n"
    477 
    478     "trn1      v4.8b, v0.8b, v2.8b             \n"
    479     "trn2      v5.8b, v0.8b, v2.8b             \n"
    480     "trn1      v6.8b, v1.8b, v3.8b             \n"
    481     "trn2      v7.8b, v1.8b, v3.8b             \n"
    482 
    483     "mov       %0, %2                          \n"
    484 
    485     MEMACCESS(0)
    486     "st1       {v4.d}[0], [%0], %6             \n"
    487     MEMACCESS(0)
    488     "st1       {v6.d}[0], [%0]                 \n"
    489 
    490     "mov       %0, %3                          \n"
    491 
    492     MEMACCESS(0)
    493     "st1       {v5.d}[0], [%0], %7             \n"
    494     MEMACCESS(0)
    495     "st1       {v7.d}[0], [%0]                 \n"
    496 
    497     "add       %1, %1, #4                      \n"  // src   += 2 * 2
    498     "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
    499     "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
    500     "subs      %4,  %4,  #2                    \n"  // w     -= 2
    501     "b.eq      4f                              \n"
    502 
    503     // 1x8 block
    504     "3:                                        \n"
    505     MEMACCESS(1)
    506     "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
    507     MEMACCESS(1)
    508     "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
    509     MEMACCESS(1)
    510     "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
    511     MEMACCESS(1)
    512     "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
    513     MEMACCESS(1)
    514     "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
    515     MEMACCESS(1)
    516     "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
    517     MEMACCESS(1)
    518     "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
    519     MEMACCESS(1)
    520     "ld2       {v0.b, v1.b}[7], [%1]           \n"
    521 
    522     MEMACCESS(2)
    523     "st1       {v0.d}[0], [%2]                 \n"
    524     MEMACCESS(3)
    525     "st1       {v1.d}[0], [%3]                 \n"
    526 
    527     "4:                                        \n"
    528 
    529     : "=&r"(src_temp),                            // %0
    530       "+r"(src),                                  // %1
    531       "+r"(dst_a),                                // %2
    532       "+r"(dst_b),                                // %3
    533       "+r"(width64)                               // %4
    534     : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
    535       "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
    536       "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
    537       "r"(&kVTbl4x4TransposeDi)                   // %8
    538     : "memory", "cc",
    539       "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    540       "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
    541       "v30", "v31"
    542   );
    543 }
    544 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
    545 
    546 #ifdef __cplusplus
    547 }  // extern "C"
    548 }  // namespace libyuv
    549 #endif
    550