Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 #include "libyuv/rotate_row.h"
     13 
     14 #include "libyuv/basic_types.h"
     15 
     16 #ifdef __cplusplus
     17 namespace libyuv {
     18 extern "C" {
     19 #endif
     20 
     21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     22     !defined(__aarch64__)
     23 
     24 static uvec8 kVTbl4x4Transpose =
     25   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
     26 
     27 void TransposeWx8_NEON(const uint8* src, int src_stride,
     28                        uint8* dst, int dst_stride,
     29                        int width) {
     30   const uint8* src_temp;
     31   asm volatile (
     32     // loops are on blocks of 8. loop will stop when
     33     // counter gets to or below 0. starting the counter
     34     // at w-8 allow for this
     35     "sub         %5, #8                        \n"
     36 
     37     // handle 8x8 blocks. this should be the majority of the plane
     38     "1:                                        \n"
     39       "mov         %0, %1                      \n"
     40 
     41       MEMACCESS(0)
     42       "vld1.8      {d0}, [%0], %2              \n"
     43       MEMACCESS(0)
     44       "vld1.8      {d1}, [%0], %2              \n"
     45       MEMACCESS(0)
     46       "vld1.8      {d2}, [%0], %2              \n"
     47       MEMACCESS(0)
     48       "vld1.8      {d3}, [%0], %2              \n"
     49       MEMACCESS(0)
     50       "vld1.8      {d4}, [%0], %2              \n"
     51       MEMACCESS(0)
     52       "vld1.8      {d5}, [%0], %2              \n"
     53       MEMACCESS(0)
     54       "vld1.8      {d6}, [%0], %2              \n"
     55       MEMACCESS(0)
     56       "vld1.8      {d7}, [%0]                  \n"
     57 
     58       "vtrn.8      d1, d0                      \n"
     59       "vtrn.8      d3, d2                      \n"
     60       "vtrn.8      d5, d4                      \n"
     61       "vtrn.8      d7, d6                      \n"
     62 
     63       "vtrn.16     d1, d3                      \n"
     64       "vtrn.16     d0, d2                      \n"
     65       "vtrn.16     d5, d7                      \n"
     66       "vtrn.16     d4, d6                      \n"
     67 
     68       "vtrn.32     d1, d5                      \n"
     69       "vtrn.32     d0, d4                      \n"
     70       "vtrn.32     d3, d7                      \n"
     71       "vtrn.32     d2, d6                      \n"
     72 
     73       "vrev16.8    q0, q0                      \n"
     74       "vrev16.8    q1, q1                      \n"
     75       "vrev16.8    q2, q2                      \n"
     76       "vrev16.8    q3, q3                      \n"
     77 
     78       "mov         %0, %3                      \n"
     79 
     80     MEMACCESS(0)
     81       "vst1.8      {d1}, [%0], %4              \n"
     82     MEMACCESS(0)
     83       "vst1.8      {d0}, [%0], %4              \n"
     84     MEMACCESS(0)
     85       "vst1.8      {d3}, [%0], %4              \n"
     86     MEMACCESS(0)
     87       "vst1.8      {d2}, [%0], %4              \n"
     88     MEMACCESS(0)
     89       "vst1.8      {d5}, [%0], %4              \n"
     90     MEMACCESS(0)
     91       "vst1.8      {d4}, [%0], %4              \n"
     92     MEMACCESS(0)
     93       "vst1.8      {d7}, [%0], %4              \n"
     94     MEMACCESS(0)
     95       "vst1.8      {d6}, [%0]                  \n"
     96 
     97       "add         %1, #8                      \n"  // src += 8
     98       "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
     99       "subs        %5,  #8                     \n"  // w   -= 8
    100       "bge         1b                          \n"
    101 
    102     // add 8 back to counter. if the result is 0 there are
    103     // no residuals.
    104     "adds        %5, #8                        \n"
    105     "beq         4f                            \n"
    106 
    107     // some residual, so between 1 and 7 lines left to transpose
    108     "cmp         %5, #2                        \n"
    109     "blt         3f                            \n"
    110 
    111     "cmp         %5, #4                        \n"
    112     "blt         2f                            \n"
    113 
    114     // 4x8 block
    115     "mov         %0, %1                        \n"
    116     MEMACCESS(0)
    117     "vld1.32     {d0[0]}, [%0], %2             \n"
    118     MEMACCESS(0)
    119     "vld1.32     {d0[1]}, [%0], %2             \n"
    120     MEMACCESS(0)
    121     "vld1.32     {d1[0]}, [%0], %2             \n"
    122     MEMACCESS(0)
    123     "vld1.32     {d1[1]}, [%0], %2             \n"
    124     MEMACCESS(0)
    125     "vld1.32     {d2[0]}, [%0], %2             \n"
    126     MEMACCESS(0)
    127     "vld1.32     {d2[1]}, [%0], %2             \n"
    128     MEMACCESS(0)
    129     "vld1.32     {d3[0]}, [%0], %2             \n"
    130     MEMACCESS(0)
    131     "vld1.32     {d3[1]}, [%0]                 \n"
    132 
    133     "mov         %0, %3                        \n"
    134 
    135     MEMACCESS(6)
    136     "vld1.8      {q3}, [%6]                    \n"
    137 
    138     "vtbl.8      d4, {d0, d1}, d6              \n"
    139     "vtbl.8      d5, {d0, d1}, d7              \n"
    140     "vtbl.8      d0, {d2, d3}, d6              \n"
    141     "vtbl.8      d1, {d2, d3}, d7              \n"
    142 
    143     // TODO(frkoenig): Rework shuffle above to
    144     // write out with 4 instead of 8 writes.
    145     MEMACCESS(0)
    146     "vst1.32     {d4[0]}, [%0], %4             \n"
    147     MEMACCESS(0)
    148     "vst1.32     {d4[1]}, [%0], %4             \n"
    149     MEMACCESS(0)
    150     "vst1.32     {d5[0]}, [%0], %4             \n"
    151     MEMACCESS(0)
    152     "vst1.32     {d5[1]}, [%0]                 \n"
    153 
    154     "add         %0, %3, #4                    \n"
    155     MEMACCESS(0)
    156     "vst1.32     {d0[0]}, [%0], %4             \n"
    157     MEMACCESS(0)
    158     "vst1.32     {d0[1]}, [%0], %4             \n"
    159     MEMACCESS(0)
    160     "vst1.32     {d1[0]}, [%0], %4             \n"
    161     MEMACCESS(0)
    162     "vst1.32     {d1[1]}, [%0]                 \n"
    163 
    164     "add         %1, #4                        \n"  // src += 4
    165     "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    166     "subs        %5,  #4                       \n"  // w   -= 4
    167     "beq         4f                            \n"
    168 
    169     // some residual, check to see if it includes a 2x8 block,
    170     // or less
    171     "cmp         %5, #2                        \n"
    172     "blt         3f                            \n"
    173 
    174     // 2x8 block
    175     "2:                                        \n"
    176     "mov         %0, %1                        \n"
    177     MEMACCESS(0)
    178     "vld1.16     {d0[0]}, [%0], %2             \n"
    179     MEMACCESS(0)
    180     "vld1.16     {d1[0]}, [%0], %2             \n"
    181     MEMACCESS(0)
    182     "vld1.16     {d0[1]}, [%0], %2             \n"
    183     MEMACCESS(0)
    184     "vld1.16     {d1[1]}, [%0], %2             \n"
    185     MEMACCESS(0)
    186     "vld1.16     {d0[2]}, [%0], %2             \n"
    187     MEMACCESS(0)
    188     "vld1.16     {d1[2]}, [%0], %2             \n"
    189     MEMACCESS(0)
    190     "vld1.16     {d0[3]}, [%0], %2             \n"
    191     MEMACCESS(0)
    192     "vld1.16     {d1[3]}, [%0]                 \n"
    193 
    194     "vtrn.8      d0, d1                        \n"
    195 
    196     "mov         %0, %3                        \n"
    197 
    198     MEMACCESS(0)
    199     "vst1.64     {d0}, [%0], %4                \n"
    200     MEMACCESS(0)
    201     "vst1.64     {d1}, [%0]                    \n"
    202 
    203     "add         %1, #2                        \n"  // src += 2
    204     "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    205     "subs        %5,  #2                       \n"  // w   -= 2
    206     "beq         4f                            \n"
    207 
    208     // 1x8 block
    209     "3:                                        \n"
    210     MEMACCESS(1)
    211     "vld1.8      {d0[0]}, [%1], %2             \n"
    212     MEMACCESS(1)
    213     "vld1.8      {d0[1]}, [%1], %2             \n"
    214     MEMACCESS(1)
    215     "vld1.8      {d0[2]}, [%1], %2             \n"
    216     MEMACCESS(1)
    217     "vld1.8      {d0[3]}, [%1], %2             \n"
    218     MEMACCESS(1)
    219     "vld1.8      {d0[4]}, [%1], %2             \n"
    220     MEMACCESS(1)
    221     "vld1.8      {d0[5]}, [%1], %2             \n"
    222     MEMACCESS(1)
    223     "vld1.8      {d0[6]}, [%1], %2             \n"
    224     MEMACCESS(1)
    225     "vld1.8      {d0[7]}, [%1]                 \n"
    226 
    227     MEMACCESS(3)
    228     "vst1.64     {d0}, [%3]                    \n"
    229 
    230     "4:                                        \n"
    231 
    232     : "=&r"(src_temp),         // %0
    233       "+r"(src),               // %1
    234       "+r"(src_stride),        // %2
    235       "+r"(dst),               // %3
    236       "+r"(dst_stride),        // %4
    237       "+r"(width)              // %5
    238     : "r"(&kVTbl4x4Transpose)  // %6
    239     : "memory", "cc", "q0", "q1", "q2", "q3"
    240   );
    241 }
    242 
    243 static uvec8 kVTbl4x4TransposeDi =
    244   { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
    245 
    246 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    247                          uint8* dst_a, int dst_stride_a,
    248                          uint8* dst_b, int dst_stride_b,
    249                          int width) {
    250   const uint8* src_temp;
    251   asm volatile (
    252     // loops are on blocks of 8. loop will stop when
    253     // counter gets to or below 0. starting the counter
    254     // at w-8 allow for this
    255     "sub         %7, #8                        \n"
    256 
    257     // handle 8x8 blocks. this should be the majority of the plane
    258     "1:                                        \n"
    259       "mov         %0, %1                      \n"
    260 
    261       MEMACCESS(0)
    262       "vld2.8      {d0,  d1},  [%0], %2        \n"
    263       MEMACCESS(0)
    264       "vld2.8      {d2,  d3},  [%0], %2        \n"
    265       MEMACCESS(0)
    266       "vld2.8      {d4,  d5},  [%0], %2        \n"
    267       MEMACCESS(0)
    268       "vld2.8      {d6,  d7},  [%0], %2        \n"
    269       MEMACCESS(0)
    270       "vld2.8      {d16, d17}, [%0], %2        \n"
    271       MEMACCESS(0)
    272       "vld2.8      {d18, d19}, [%0], %2        \n"
    273       MEMACCESS(0)
    274       "vld2.8      {d20, d21}, [%0], %2        \n"
    275       MEMACCESS(0)
    276       "vld2.8      {d22, d23}, [%0]            \n"
    277 
    278       "vtrn.8      q1, q0                      \n"
    279       "vtrn.8      q3, q2                      \n"
    280       "vtrn.8      q9, q8                      \n"
    281       "vtrn.8      q11, q10                    \n"
    282 
    283       "vtrn.16     q1, q3                      \n"
    284       "vtrn.16     q0, q2                      \n"
    285       "vtrn.16     q9, q11                     \n"
    286       "vtrn.16     q8, q10                     \n"
    287 
    288       "vtrn.32     q1, q9                      \n"
    289       "vtrn.32     q0, q8                      \n"
    290       "vtrn.32     q3, q11                     \n"
    291       "vtrn.32     q2, q10                     \n"
    292 
    293       "vrev16.8    q0, q0                      \n"
    294       "vrev16.8    q1, q1                      \n"
    295       "vrev16.8    q2, q2                      \n"
    296       "vrev16.8    q3, q3                      \n"
    297       "vrev16.8    q8, q8                      \n"
    298       "vrev16.8    q9, q9                      \n"
    299       "vrev16.8    q10, q10                    \n"
    300       "vrev16.8    q11, q11                    \n"
    301 
    302       "mov         %0, %3                      \n"
    303 
    304     MEMACCESS(0)
    305       "vst1.8      {d2},  [%0], %4             \n"
    306     MEMACCESS(0)
    307       "vst1.8      {d0},  [%0], %4             \n"
    308     MEMACCESS(0)
    309       "vst1.8      {d6},  [%0], %4             \n"
    310     MEMACCESS(0)
    311       "vst1.8      {d4},  [%0], %4             \n"
    312     MEMACCESS(0)
    313       "vst1.8      {d18}, [%0], %4             \n"
    314     MEMACCESS(0)
    315       "vst1.8      {d16}, [%0], %4             \n"
    316     MEMACCESS(0)
    317       "vst1.8      {d22}, [%0], %4             \n"
    318     MEMACCESS(0)
    319       "vst1.8      {d20}, [%0]                 \n"
    320 
    321       "mov         %0, %5                      \n"
    322 
    323     MEMACCESS(0)
    324       "vst1.8      {d3},  [%0], %6             \n"
    325     MEMACCESS(0)
    326       "vst1.8      {d1},  [%0], %6             \n"
    327     MEMACCESS(0)
    328       "vst1.8      {d7},  [%0], %6             \n"
    329     MEMACCESS(0)
    330       "vst1.8      {d5},  [%0], %6             \n"
    331     MEMACCESS(0)
    332       "vst1.8      {d19}, [%0], %6             \n"
    333     MEMACCESS(0)
    334       "vst1.8      {d17}, [%0], %6             \n"
    335     MEMACCESS(0)
    336       "vst1.8      {d23}, [%0], %6             \n"
    337     MEMACCESS(0)
    338       "vst1.8      {d21}, [%0]                 \n"
    339 
    340       "add         %1, #8*2                    \n"  // src   += 8*2
    341       "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
    342       "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
    343       "subs        %7,  #8                     \n"  // w     -= 8
    344       "bge         1b                          \n"
    345 
    346     // add 8 back to counter. if the result is 0 there are
    347     // no residuals.
    348     "adds        %7, #8                        \n"
    349     "beq         4f                            \n"
    350 
    351     // some residual, so between 1 and 7 lines left to transpose
    352     "cmp         %7, #2                        \n"
    353     "blt         3f                            \n"
    354 
    355     "cmp         %7, #4                        \n"
    356     "blt         2f                            \n"
    357 
    358     // TODO(frkoenig): Clean this up
    359     // 4x8 block
    360     "mov         %0, %1                        \n"
    361     MEMACCESS(0)
    362     "vld1.64     {d0}, [%0], %2                \n"
    363     MEMACCESS(0)
    364     "vld1.64     {d1}, [%0], %2                \n"
    365     MEMACCESS(0)
    366     "vld1.64     {d2}, [%0], %2                \n"
    367     MEMACCESS(0)
    368     "vld1.64     {d3}, [%0], %2                \n"
    369     MEMACCESS(0)
    370     "vld1.64     {d4}, [%0], %2                \n"
    371     MEMACCESS(0)
    372     "vld1.64     {d5}, [%0], %2                \n"
    373     MEMACCESS(0)
    374     "vld1.64     {d6}, [%0], %2                \n"
    375     MEMACCESS(0)
    376     "vld1.64     {d7}, [%0]                    \n"
    377 
    378     MEMACCESS(8)
    379     "vld1.8      {q15}, [%8]                   \n"
    380 
    381     "vtrn.8      q0, q1                        \n"
    382     "vtrn.8      q2, q3                        \n"
    383 
    384     "vtbl.8      d16, {d0, d1}, d30            \n"
    385     "vtbl.8      d17, {d0, d1}, d31            \n"
    386     "vtbl.8      d18, {d2, d3}, d30            \n"
    387     "vtbl.8      d19, {d2, d3}, d31            \n"
    388     "vtbl.8      d20, {d4, d5}, d30            \n"
    389     "vtbl.8      d21, {d4, d5}, d31            \n"
    390     "vtbl.8      d22, {d6, d7}, d30            \n"
    391     "vtbl.8      d23, {d6, d7}, d31            \n"
    392 
    393     "mov         %0, %3                        \n"
    394 
    395     MEMACCESS(0)
    396     "vst1.32     {d16[0]},  [%0], %4           \n"
    397     MEMACCESS(0)
    398     "vst1.32     {d16[1]},  [%0], %4           \n"
    399     MEMACCESS(0)
    400     "vst1.32     {d17[0]},  [%0], %4           \n"
    401     MEMACCESS(0)
    402     "vst1.32     {d17[1]},  [%0], %4           \n"
    403 
    404     "add         %0, %3, #4                    \n"
    405     MEMACCESS(0)
    406     "vst1.32     {d20[0]}, [%0], %4            \n"
    407     MEMACCESS(0)
    408     "vst1.32     {d20[1]}, [%0], %4            \n"
    409     MEMACCESS(0)
    410     "vst1.32     {d21[0]}, [%0], %4            \n"
    411     MEMACCESS(0)
    412     "vst1.32     {d21[1]}, [%0]                \n"
    413 
    414     "mov         %0, %5                        \n"
    415 
    416     MEMACCESS(0)
    417     "vst1.32     {d18[0]}, [%0], %6            \n"
    418     MEMACCESS(0)
    419     "vst1.32     {d18[1]}, [%0], %6            \n"
    420     MEMACCESS(0)
    421     "vst1.32     {d19[0]}, [%0], %6            \n"
    422     MEMACCESS(0)
    423     "vst1.32     {d19[1]}, [%0], %6            \n"
    424 
    425     "add         %0, %5, #4                    \n"
    426     MEMACCESS(0)
    427     "vst1.32     {d22[0]},  [%0], %6           \n"
    428     MEMACCESS(0)
    429     "vst1.32     {d22[1]},  [%0], %6           \n"
    430     MEMACCESS(0)
    431     "vst1.32     {d23[0]},  [%0], %6           \n"
    432     MEMACCESS(0)
    433     "vst1.32     {d23[1]},  [%0]               \n"
    434 
    435     "add         %1, #4*2                      \n"  // src   += 4 * 2
    436     "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    437     "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    438     "subs        %7,  #4                       \n"  // w     -= 4
    439     "beq         4f                            \n"
    440 
    441     // some residual, check to see if it includes a 2x8 block,
    442     // or less
    443     "cmp         %7, #2                        \n"
    444     "blt         3f                            \n"
    445 
    446     // 2x8 block
    447     "2:                                        \n"
    448     "mov         %0, %1                        \n"
    449     MEMACCESS(0)
    450     "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
    451     MEMACCESS(0)
    452     "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
    453     MEMACCESS(0)
    454     "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
    455     MEMACCESS(0)
    456     "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
    457     MEMACCESS(0)
    458     "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
    459     MEMACCESS(0)
    460     "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
    461     MEMACCESS(0)
    462     "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
    463     MEMACCESS(0)
    464     "vld2.16     {d1[3], d3[3]}, [%0]          \n"
    465 
    466     "vtrn.8      d0, d1                        \n"
    467     "vtrn.8      d2, d3                        \n"
    468 
    469     "mov         %0, %3                        \n"
    470 
    471     MEMACCESS(0)
    472     "vst1.64     {d0}, [%0], %4                \n"
    473     MEMACCESS(0)
    474     "vst1.64     {d2}, [%0]                    \n"
    475 
    476     "mov         %0, %5                        \n"
    477 
    478     MEMACCESS(0)
    479     "vst1.64     {d1}, [%0], %6                \n"
    480     MEMACCESS(0)
    481     "vst1.64     {d3}, [%0]                    \n"
    482 
    483     "add         %1, #2*2                      \n"  // src   += 2 * 2
    484     "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    485     "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    486     "subs        %7,  #2                       \n"  // w     -= 2
    487     "beq         4f                            \n"
    488 
    489     // 1x8 block
    490     "3:                                        \n"
    491     MEMACCESS(1)
    492     "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
    493     MEMACCESS(1)
    494     "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
    495     MEMACCESS(1)
    496     "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
    497     MEMACCESS(1)
    498     "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
    499     MEMACCESS(1)
    500     "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
    501     MEMACCESS(1)
    502     "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
    503     MEMACCESS(1)
    504     "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
    505     MEMACCESS(1)
    506     "vld2.8      {d0[7], d1[7]}, [%1]          \n"
    507 
    508     MEMACCESS(3)
    509     "vst1.64     {d0}, [%3]                    \n"
    510     MEMACCESS(5)
    511     "vst1.64     {d1}, [%5]                    \n"
    512 
    513     "4:                                        \n"
    514 
    515     : "=&r"(src_temp),           // %0
    516       "+r"(src),                 // %1
    517       "+r"(src_stride),          // %2
    518       "+r"(dst_a),               // %3
    519       "+r"(dst_stride_a),        // %4
    520       "+r"(dst_b),               // %5
    521       "+r"(dst_stride_b),        // %6
    522       "+r"(width)                // %7
    523     : "r"(&kVTbl4x4TransposeDi)  // %8
    524     : "memory", "cc",
    525       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
    526   );
    527 }
    528 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
    529 
    530 #ifdef __cplusplus
    531 }  // extern "C"
    532 }  // namespace libyuv
    533 #endif
    534