Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/rotate_row.h"
     12 #include "libyuv/row.h"
     13 
     14 #include "libyuv/basic_types.h"
     15 
     16 #ifdef __cplusplus
     17 namespace libyuv {
     18 extern "C" {
     19 #endif
     20 
     21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     22     !defined(__aarch64__)
     23 
     24 static uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
     25                                   2, 6, 10, 14, 3, 7, 11, 15};
     26 
     27 void TransposeWx8_NEON(const uint8* src,
     28                        int src_stride,
     29                        uint8* dst,
     30                        int dst_stride,
     31                        int width) {
     32   const uint8* src_temp;
     33   asm volatile (
     34     // loops are on blocks of 8. loop will stop when
     35     // counter gets to or below 0. starting the counter
     36     // at w-8 allow for this
     37     "sub         %5, #8                        \n"
     38 
     39     // handle 8x8 blocks. this should be the majority of the plane
     40     "1:                                        \n"
     41       "mov         %0, %1                      \n"
     42 
     43       MEMACCESS(0)
     44       "vld1.8      {d0}, [%0], %2              \n"
     45       MEMACCESS(0)
     46       "vld1.8      {d1}, [%0], %2              \n"
     47       MEMACCESS(0)
     48       "vld1.8      {d2}, [%0], %2              \n"
     49       MEMACCESS(0)
     50       "vld1.8      {d3}, [%0], %2              \n"
     51       MEMACCESS(0)
     52       "vld1.8      {d4}, [%0], %2              \n"
     53       MEMACCESS(0)
     54       "vld1.8      {d5}, [%0], %2              \n"
     55       MEMACCESS(0)
     56       "vld1.8      {d6}, [%0], %2              \n"
     57       MEMACCESS(0)
     58       "vld1.8      {d7}, [%0]                  \n"
     59 
     60       "vtrn.8      d1, d0                      \n"
     61       "vtrn.8      d3, d2                      \n"
     62       "vtrn.8      d5, d4                      \n"
     63       "vtrn.8      d7, d6                      \n"
     64 
     65       "vtrn.16     d1, d3                      \n"
     66       "vtrn.16     d0, d2                      \n"
     67       "vtrn.16     d5, d7                      \n"
     68       "vtrn.16     d4, d6                      \n"
     69 
     70       "vtrn.32     d1, d5                      \n"
     71       "vtrn.32     d0, d4                      \n"
     72       "vtrn.32     d3, d7                      \n"
     73       "vtrn.32     d2, d6                      \n"
     74 
     75       "vrev16.8    q0, q0                      \n"
     76       "vrev16.8    q1, q1                      \n"
     77       "vrev16.8    q2, q2                      \n"
     78       "vrev16.8    q3, q3                      \n"
     79 
     80       "mov         %0, %3                      \n"
     81 
     82     MEMACCESS(0)
     83       "vst1.8      {d1}, [%0], %4              \n"
     84     MEMACCESS(0)
     85       "vst1.8      {d0}, [%0], %4              \n"
     86     MEMACCESS(0)
     87       "vst1.8      {d3}, [%0], %4              \n"
     88     MEMACCESS(0)
     89       "vst1.8      {d2}, [%0], %4              \n"
     90     MEMACCESS(0)
     91       "vst1.8      {d5}, [%0], %4              \n"
     92     MEMACCESS(0)
     93       "vst1.8      {d4}, [%0], %4              \n"
     94     MEMACCESS(0)
     95       "vst1.8      {d7}, [%0], %4              \n"
     96     MEMACCESS(0)
     97       "vst1.8      {d6}, [%0]                  \n"
     98 
     99       "add         %1, #8                      \n"  // src += 8
    100       "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
    101       "subs        %5,  #8                     \n"  // w   -= 8
    102       "bge         1b                          \n"
    103 
    104     // add 8 back to counter. if the result is 0 there are
    105     // no residuals.
    106     "adds        %5, #8                        \n"
    107     "beq         4f                            \n"
    108 
    109     // some residual, so between 1 and 7 lines left to transpose
    110     "cmp         %5, #2                        \n"
    111     "blt         3f                            \n"
    112 
    113     "cmp         %5, #4                        \n"
    114     "blt         2f                            \n"
    115 
    116     // 4x8 block
    117     "mov         %0, %1                        \n"
    118     MEMACCESS(0)
    119     "vld1.32     {d0[0]}, [%0], %2             \n"
    120     MEMACCESS(0)
    121     "vld1.32     {d0[1]}, [%0], %2             \n"
    122     MEMACCESS(0)
    123     "vld1.32     {d1[0]}, [%0], %2             \n"
    124     MEMACCESS(0)
    125     "vld1.32     {d1[1]}, [%0], %2             \n"
    126     MEMACCESS(0)
    127     "vld1.32     {d2[0]}, [%0], %2             \n"
    128     MEMACCESS(0)
    129     "vld1.32     {d2[1]}, [%0], %2             \n"
    130     MEMACCESS(0)
    131     "vld1.32     {d3[0]}, [%0], %2             \n"
    132     MEMACCESS(0)
    133     "vld1.32     {d3[1]}, [%0]                 \n"
    134 
    135     "mov         %0, %3                        \n"
    136 
    137     MEMACCESS(6)
    138     "vld1.8      {q3}, [%6]                    \n"
    139 
    140     "vtbl.8      d4, {d0, d1}, d6              \n"
    141     "vtbl.8      d5, {d0, d1}, d7              \n"
    142     "vtbl.8      d0, {d2, d3}, d6              \n"
    143     "vtbl.8      d1, {d2, d3}, d7              \n"
    144 
    145     // TODO(frkoenig): Rework shuffle above to
    146     // write out with 4 instead of 8 writes.
    147     MEMACCESS(0)
    148     "vst1.32     {d4[0]}, [%0], %4             \n"
    149     MEMACCESS(0)
    150     "vst1.32     {d4[1]}, [%0], %4             \n"
    151     MEMACCESS(0)
    152     "vst1.32     {d5[0]}, [%0], %4             \n"
    153     MEMACCESS(0)
    154     "vst1.32     {d5[1]}, [%0]                 \n"
    155 
    156     "add         %0, %3, #4                    \n"
    157     MEMACCESS(0)
    158     "vst1.32     {d0[0]}, [%0], %4             \n"
    159     MEMACCESS(0)
    160     "vst1.32     {d0[1]}, [%0], %4             \n"
    161     MEMACCESS(0)
    162     "vst1.32     {d1[0]}, [%0], %4             \n"
    163     MEMACCESS(0)
    164     "vst1.32     {d1[1]}, [%0]                 \n"
    165 
    166     "add         %1, #4                        \n"  // src += 4
    167     "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    168     "subs        %5,  #4                       \n"  // w   -= 4
    169     "beq         4f                            \n"
    170 
    171     // some residual, check to see if it includes a 2x8 block,
    172     // or less
    173     "cmp         %5, #2                        \n"
    174     "blt         3f                            \n"
    175 
    176     // 2x8 block
    177     "2:                                        \n"
    178     "mov         %0, %1                        \n"
    179     MEMACCESS(0)
    180     "vld1.16     {d0[0]}, [%0], %2             \n"
    181     MEMACCESS(0)
    182     "vld1.16     {d1[0]}, [%0], %2             \n"
    183     MEMACCESS(0)
    184     "vld1.16     {d0[1]}, [%0], %2             \n"
    185     MEMACCESS(0)
    186     "vld1.16     {d1[1]}, [%0], %2             \n"
    187     MEMACCESS(0)
    188     "vld1.16     {d0[2]}, [%0], %2             \n"
    189     MEMACCESS(0)
    190     "vld1.16     {d1[2]}, [%0], %2             \n"
    191     MEMACCESS(0)
    192     "vld1.16     {d0[3]}, [%0], %2             \n"
    193     MEMACCESS(0)
    194     "vld1.16     {d1[3]}, [%0]                 \n"
    195 
    196     "vtrn.8      d0, d1                        \n"
    197 
    198     "mov         %0, %3                        \n"
    199 
    200     MEMACCESS(0)
    201     "vst1.64     {d0}, [%0], %4                \n"
    202     MEMACCESS(0)
    203     "vst1.64     {d1}, [%0]                    \n"
    204 
    205     "add         %1, #2                        \n"  // src += 2
    206     "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    207     "subs        %5,  #2                       \n"  // w   -= 2
    208     "beq         4f                            \n"
    209 
    210     // 1x8 block
    211     "3:                                        \n"
    212     MEMACCESS(1)
    213     "vld1.8      {d0[0]}, [%1], %2             \n"
    214     MEMACCESS(1)
    215     "vld1.8      {d0[1]}, [%1], %2             \n"
    216     MEMACCESS(1)
    217     "vld1.8      {d0[2]}, [%1], %2             \n"
    218     MEMACCESS(1)
    219     "vld1.8      {d0[3]}, [%1], %2             \n"
    220     MEMACCESS(1)
    221     "vld1.8      {d0[4]}, [%1], %2             \n"
    222     MEMACCESS(1)
    223     "vld1.8      {d0[5]}, [%1], %2             \n"
    224     MEMACCESS(1)
    225     "vld1.8      {d0[6]}, [%1], %2             \n"
    226     MEMACCESS(1)
    227     "vld1.8      {d0[7]}, [%1]                 \n"
    228 
    229     MEMACCESS(3)
    230     "vst1.64     {d0}, [%3]                    \n"
    231 
    232     "4:                                        \n"
    233 
    234     : "=&r"(src_temp),         // %0
    235       "+r"(src),               // %1
    236       "+r"(src_stride),        // %2
    237       "+r"(dst),               // %3
    238       "+r"(dst_stride),        // %4
    239       "+r"(width)              // %5
    240     : "r"(&kVTbl4x4Transpose)  // %6
    241     : "memory", "cc", "q0", "q1", "q2", "q3"
    242   );
    243 }
    244 
    245 static uvec8 kVTbl4x4TransposeDi = {0, 8,  1, 9,  2, 10, 3, 11,
    246                                     4, 12, 5, 13, 6, 14, 7, 15};
    247 
    248 void TransposeUVWx8_NEON(const uint8* src,
    249                          int src_stride,
    250                          uint8* dst_a,
    251                          int dst_stride_a,
    252                          uint8* dst_b,
    253                          int dst_stride_b,
    254                          int width) {
    255   const uint8* src_temp;
    256   asm volatile (
    257     // loops are on blocks of 8. loop will stop when
    258     // counter gets to or below 0. starting the counter
    259     // at w-8 allow for this
    260     "sub         %7, #8                        \n"
    261 
    262     // handle 8x8 blocks. this should be the majority of the plane
    263     "1:                                        \n"
    264       "mov         %0, %1                      \n"
    265 
    266       MEMACCESS(0)
    267       "vld2.8      {d0,  d1},  [%0], %2        \n"
    268       MEMACCESS(0)
    269       "vld2.8      {d2,  d3},  [%0], %2        \n"
    270       MEMACCESS(0)
    271       "vld2.8      {d4,  d5},  [%0], %2        \n"
    272       MEMACCESS(0)
    273       "vld2.8      {d6,  d7},  [%0], %2        \n"
    274       MEMACCESS(0)
    275       "vld2.8      {d16, d17}, [%0], %2        \n"
    276       MEMACCESS(0)
    277       "vld2.8      {d18, d19}, [%0], %2        \n"
    278       MEMACCESS(0)
    279       "vld2.8      {d20, d21}, [%0], %2        \n"
    280       MEMACCESS(0)
    281       "vld2.8      {d22, d23}, [%0]            \n"
    282 
    283       "vtrn.8      q1, q0                      \n"
    284       "vtrn.8      q3, q2                      \n"
    285       "vtrn.8      q9, q8                      \n"
    286       "vtrn.8      q11, q10                    \n"
    287 
    288       "vtrn.16     q1, q3                      \n"
    289       "vtrn.16     q0, q2                      \n"
    290       "vtrn.16     q9, q11                     \n"
    291       "vtrn.16     q8, q10                     \n"
    292 
    293       "vtrn.32     q1, q9                      \n"
    294       "vtrn.32     q0, q8                      \n"
    295       "vtrn.32     q3, q11                     \n"
    296       "vtrn.32     q2, q10                     \n"
    297 
    298       "vrev16.8    q0, q0                      \n"
    299       "vrev16.8    q1, q1                      \n"
    300       "vrev16.8    q2, q2                      \n"
    301       "vrev16.8    q3, q3                      \n"
    302       "vrev16.8    q8, q8                      \n"
    303       "vrev16.8    q9, q9                      \n"
    304       "vrev16.8    q10, q10                    \n"
    305       "vrev16.8    q11, q11                    \n"
    306 
    307       "mov         %0, %3                      \n"
    308 
    309     MEMACCESS(0)
    310       "vst1.8      {d2},  [%0], %4             \n"
    311     MEMACCESS(0)
    312       "vst1.8      {d0},  [%0], %4             \n"
    313     MEMACCESS(0)
    314       "vst1.8      {d6},  [%0], %4             \n"
    315     MEMACCESS(0)
    316       "vst1.8      {d4},  [%0], %4             \n"
    317     MEMACCESS(0)
    318       "vst1.8      {d18}, [%0], %4             \n"
    319     MEMACCESS(0)
    320       "vst1.8      {d16}, [%0], %4             \n"
    321     MEMACCESS(0)
    322       "vst1.8      {d22}, [%0], %4             \n"
    323     MEMACCESS(0)
    324       "vst1.8      {d20}, [%0]                 \n"
    325 
    326       "mov         %0, %5                      \n"
    327 
    328     MEMACCESS(0)
    329       "vst1.8      {d3},  [%0], %6             \n"
    330     MEMACCESS(0)
    331       "vst1.8      {d1},  [%0], %6             \n"
    332     MEMACCESS(0)
    333       "vst1.8      {d7},  [%0], %6             \n"
    334     MEMACCESS(0)
    335       "vst1.8      {d5},  [%0], %6             \n"
    336     MEMACCESS(0)
    337       "vst1.8      {d19}, [%0], %6             \n"
    338     MEMACCESS(0)
    339       "vst1.8      {d17}, [%0], %6             \n"
    340     MEMACCESS(0)
    341       "vst1.8      {d23}, [%0], %6             \n"
    342     MEMACCESS(0)
    343       "vst1.8      {d21}, [%0]                 \n"
    344 
    345       "add         %1, #8*2                    \n"  // src   += 8*2
    346       "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
    347       "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
    348       "subs        %7,  #8                     \n"  // w     -= 8
    349       "bge         1b                          \n"
    350 
    351     // add 8 back to counter. if the result is 0 there are
    352     // no residuals.
    353     "adds        %7, #8                        \n"
    354     "beq         4f                            \n"
    355 
    356     // some residual, so between 1 and 7 lines left to transpose
    357     "cmp         %7, #2                        \n"
    358     "blt         3f                            \n"
    359 
    360     "cmp         %7, #4                        \n"
    361     "blt         2f                            \n"
    362 
    363     // TODO(frkoenig): Clean this up
    364     // 4x8 block
    365     "mov         %0, %1                        \n"
    366     MEMACCESS(0)
    367     "vld1.64     {d0}, [%0], %2                \n"
    368     MEMACCESS(0)
    369     "vld1.64     {d1}, [%0], %2                \n"
    370     MEMACCESS(0)
    371     "vld1.64     {d2}, [%0], %2                \n"
    372     MEMACCESS(0)
    373     "vld1.64     {d3}, [%0], %2                \n"
    374     MEMACCESS(0)
    375     "vld1.64     {d4}, [%0], %2                \n"
    376     MEMACCESS(0)
    377     "vld1.64     {d5}, [%0], %2                \n"
    378     MEMACCESS(0)
    379     "vld1.64     {d6}, [%0], %2                \n"
    380     MEMACCESS(0)
    381     "vld1.64     {d7}, [%0]                    \n"
    382 
    383     MEMACCESS(8)
    384     "vld1.8      {q15}, [%8]                   \n"
    385 
    386     "vtrn.8      q0, q1                        \n"
    387     "vtrn.8      q2, q3                        \n"
    388 
    389     "vtbl.8      d16, {d0, d1}, d30            \n"
    390     "vtbl.8      d17, {d0, d1}, d31            \n"
    391     "vtbl.8      d18, {d2, d3}, d30            \n"
    392     "vtbl.8      d19, {d2, d3}, d31            \n"
    393     "vtbl.8      d20, {d4, d5}, d30            \n"
    394     "vtbl.8      d21, {d4, d5}, d31            \n"
    395     "vtbl.8      d22, {d6, d7}, d30            \n"
    396     "vtbl.8      d23, {d6, d7}, d31            \n"
    397 
    398     "mov         %0, %3                        \n"
    399 
    400     MEMACCESS(0)
    401     "vst1.32     {d16[0]},  [%0], %4           \n"
    402     MEMACCESS(0)
    403     "vst1.32     {d16[1]},  [%0], %4           \n"
    404     MEMACCESS(0)
    405     "vst1.32     {d17[0]},  [%0], %4           \n"
    406     MEMACCESS(0)
    407     "vst1.32     {d17[1]},  [%0], %4           \n"
    408 
    409     "add         %0, %3, #4                    \n"
    410     MEMACCESS(0)
    411     "vst1.32     {d20[0]}, [%0], %4            \n"
    412     MEMACCESS(0)
    413     "vst1.32     {d20[1]}, [%0], %4            \n"
    414     MEMACCESS(0)
    415     "vst1.32     {d21[0]}, [%0], %4            \n"
    416     MEMACCESS(0)
    417     "vst1.32     {d21[1]}, [%0]                \n"
    418 
    419     "mov         %0, %5                        \n"
    420 
    421     MEMACCESS(0)
    422     "vst1.32     {d18[0]}, [%0], %6            \n"
    423     MEMACCESS(0)
    424     "vst1.32     {d18[1]}, [%0], %6            \n"
    425     MEMACCESS(0)
    426     "vst1.32     {d19[0]}, [%0], %6            \n"
    427     MEMACCESS(0)
    428     "vst1.32     {d19[1]}, [%0], %6            \n"
    429 
    430     "add         %0, %5, #4                    \n"
    431     MEMACCESS(0)
    432     "vst1.32     {d22[0]},  [%0], %6           \n"
    433     MEMACCESS(0)
    434     "vst1.32     {d22[1]},  [%0], %6           \n"
    435     MEMACCESS(0)
    436     "vst1.32     {d23[0]},  [%0], %6           \n"
    437     MEMACCESS(0)
    438     "vst1.32     {d23[1]},  [%0]               \n"
    439 
    440     "add         %1, #4*2                      \n"  // src   += 4 * 2
    441     "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    442     "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    443     "subs        %7,  #4                       \n"  // w     -= 4
    444     "beq         4f                            \n"
    445 
    446     // some residual, check to see if it includes a 2x8 block,
    447     // or less
    448     "cmp         %7, #2                        \n"
    449     "blt         3f                            \n"
    450 
    451     // 2x8 block
    452     "2:                                        \n"
    453     "mov         %0, %1                        \n"
    454     MEMACCESS(0)
    455     "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
    456     MEMACCESS(0)
    457     "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
    458     MEMACCESS(0)
    459     "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
    460     MEMACCESS(0)
    461     "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
    462     MEMACCESS(0)
    463     "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
    464     MEMACCESS(0)
    465     "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
    466     MEMACCESS(0)
    467     "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
    468     MEMACCESS(0)
    469     "vld2.16     {d1[3], d3[3]}, [%0]          \n"
    470 
    471     "vtrn.8      d0, d1                        \n"
    472     "vtrn.8      d2, d3                        \n"
    473 
    474     "mov         %0, %3                        \n"
    475 
    476     MEMACCESS(0)
    477     "vst1.64     {d0}, [%0], %4                \n"
    478     MEMACCESS(0)
    479     "vst1.64     {d2}, [%0]                    \n"
    480 
    481     "mov         %0, %5                        \n"
    482 
    483     MEMACCESS(0)
    484     "vst1.64     {d1}, [%0], %6                \n"
    485     MEMACCESS(0)
    486     "vst1.64     {d3}, [%0]                    \n"
    487 
    488     "add         %1, #2*2                      \n"  // src   += 2 * 2
    489     "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    490     "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    491     "subs        %7,  #2                       \n"  // w     -= 2
    492     "beq         4f                            \n"
    493 
    494     // 1x8 block
    495     "3:                                        \n"
    496     MEMACCESS(1)
    497     "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
    498     MEMACCESS(1)
    499     "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
    500     MEMACCESS(1)
    501     "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
    502     MEMACCESS(1)
    503     "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
    504     MEMACCESS(1)
    505     "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
    506     MEMACCESS(1)
    507     "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
    508     MEMACCESS(1)
    509     "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
    510     MEMACCESS(1)
    511     "vld2.8      {d0[7], d1[7]}, [%1]          \n"
    512 
    513     MEMACCESS(3)
    514     "vst1.64     {d0}, [%3]                    \n"
    515     MEMACCESS(5)
    516     "vst1.64     {d1}, [%5]                    \n"
    517 
    518     "4:                                        \n"
    519 
    520     : "=&r"(src_temp),           // %0
    521       "+r"(src),                 // %1
    522       "+r"(src_stride),          // %2
    523       "+r"(dst_a),               // %3
    524       "+r"(dst_stride_a),        // %4
    525       "+r"(dst_b),               // %5
    526       "+r"(dst_stride_b),        // %6
    527       "+r"(width)                // %7
    528     : "r"(&kVTbl4x4TransposeDi)  // %8
    529     : "memory", "cc",
    530       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
    531   );
    532 }
    533 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
    534 
    535 #ifdef __cplusplus
    536 }  // extern "C"
    537 }  // namespace libyuv
    538 #endif
    539