Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 
     13 #include "libyuv/basic_types.h"
     14 
     15 #ifdef __cplusplus
     16 namespace libyuv {
     17 extern "C" {
     18 #endif
     19 
     20 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
     21 
     22 static uvec8 kVTbl4x4Transpose =
     23   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
     24 
     25 void TransposeWx8_NEON(const uint8* src, int src_stride,
     26                        uint8* dst, int dst_stride,
     27                        int width) {
     28   const uint8* src_temp = NULL;
     29   asm volatile (
     30     // loops are on blocks of 8. loop will stop when
     31     // counter gets to or below 0. starting the counter
     32     // at w-8 allow for this
     33     "sub         %5, #8                        \n"
     34 
     35     // handle 8x8 blocks. this should be the majority of the plane
     36     ".p2align  2                               \n"
     37     "1:                                        \n"
     38       "mov         %0, %1                      \n"
     39 
     40       MEMACCESS(0)
     41       "vld1.8      {d0}, [%0], %2              \n"
     42       MEMACCESS(0)
     43       "vld1.8      {d1}, [%0], %2              \n"
     44       MEMACCESS(0)
     45       "vld1.8      {d2}, [%0], %2              \n"
     46       MEMACCESS(0)
     47       "vld1.8      {d3}, [%0], %2              \n"
     48       MEMACCESS(0)
     49       "vld1.8      {d4}, [%0], %2              \n"
     50       MEMACCESS(0)
     51       "vld1.8      {d5}, [%0], %2              \n"
     52       MEMACCESS(0)
     53       "vld1.8      {d6}, [%0], %2              \n"
     54       MEMACCESS(0)
     55       "vld1.8      {d7}, [%0]                  \n"
     56 
     57       "vtrn.8      d1, d0                      \n"
     58       "vtrn.8      d3, d2                      \n"
     59       "vtrn.8      d5, d4                      \n"
     60       "vtrn.8      d7, d6                      \n"
     61 
     62       "vtrn.16     d1, d3                      \n"
     63       "vtrn.16     d0, d2                      \n"
     64       "vtrn.16     d5, d7                      \n"
     65       "vtrn.16     d4, d6                      \n"
     66 
     67       "vtrn.32     d1, d5                      \n"
     68       "vtrn.32     d0, d4                      \n"
     69       "vtrn.32     d3, d7                      \n"
     70       "vtrn.32     d2, d6                      \n"
     71 
     72       "vrev16.8    q0, q0                      \n"
     73       "vrev16.8    q1, q1                      \n"
     74       "vrev16.8    q2, q2                      \n"
     75       "vrev16.8    q3, q3                      \n"
     76 
     77       "mov         %0, %3                      \n"
     78 
     79     MEMACCESS(0)
     80       "vst1.8      {d1}, [%0], %4              \n"
     81     MEMACCESS(0)
     82       "vst1.8      {d0}, [%0], %4              \n"
     83     MEMACCESS(0)
     84       "vst1.8      {d3}, [%0], %4              \n"
     85     MEMACCESS(0)
     86       "vst1.8      {d2}, [%0], %4              \n"
     87     MEMACCESS(0)
     88       "vst1.8      {d5}, [%0], %4              \n"
     89     MEMACCESS(0)
     90       "vst1.8      {d4}, [%0], %4              \n"
     91     MEMACCESS(0)
     92       "vst1.8      {d7}, [%0], %4              \n"
     93     MEMACCESS(0)
     94       "vst1.8      {d6}, [%0]                  \n"
     95 
     96       "add         %1, #8                      \n"  // src += 8
     97       "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
     98       "subs        %5,  #8                     \n"  // w   -= 8
     99       "bge         1b                          \n"
    100 
    101     // add 8 back to counter. if the result is 0 there are
    102     // no residuals.
    103     "adds        %5, #8                        \n"
    104     "beq         4f                            \n"
    105 
    106     // some residual, so between 1 and 7 lines left to transpose
    107     "cmp         %5, #2                        \n"
    108     "blt         3f                            \n"
    109 
    110     "cmp         %5, #4                        \n"
    111     "blt         2f                            \n"
    112 
    113     // 4x8 block
    114     "mov         %0, %1                        \n"
    115     MEMACCESS(0)
    116     "vld1.32     {d0[0]}, [%0], %2             \n"
    117     MEMACCESS(0)
    118     "vld1.32     {d0[1]}, [%0], %2             \n"
    119     MEMACCESS(0)
    120     "vld1.32     {d1[0]}, [%0], %2             \n"
    121     MEMACCESS(0)
    122     "vld1.32     {d1[1]}, [%0], %2             \n"
    123     MEMACCESS(0)
    124     "vld1.32     {d2[0]}, [%0], %2             \n"
    125     MEMACCESS(0)
    126     "vld1.32     {d2[1]}, [%0], %2             \n"
    127     MEMACCESS(0)
    128     "vld1.32     {d3[0]}, [%0], %2             \n"
    129     MEMACCESS(0)
    130     "vld1.32     {d3[1]}, [%0]                 \n"
    131 
    132     "mov         %0, %3                        \n"
    133 
    134     MEMACCESS(6)
    135     "vld1.8      {q3}, [%6]                    \n"
    136 
    137     "vtbl.8      d4, {d0, d1}, d6              \n"
    138     "vtbl.8      d5, {d0, d1}, d7              \n"
    139     "vtbl.8      d0, {d2, d3}, d6              \n"
    140     "vtbl.8      d1, {d2, d3}, d7              \n"
    141 
    142     // TODO(frkoenig): Rework shuffle above to
    143     // write out with 4 instead of 8 writes.
    144     MEMACCESS(0)
    145     "vst1.32     {d4[0]}, [%0], %4             \n"
    146     MEMACCESS(0)
    147     "vst1.32     {d4[1]}, [%0], %4             \n"
    148     MEMACCESS(0)
    149     "vst1.32     {d5[0]}, [%0], %4             \n"
    150     MEMACCESS(0)
    151     "vst1.32     {d5[1]}, [%0]                 \n"
    152 
    153     "add         %0, %3, #4                    \n"
    154     MEMACCESS(0)
    155     "vst1.32     {d0[0]}, [%0], %4             \n"
    156     MEMACCESS(0)
    157     "vst1.32     {d0[1]}, [%0], %4             \n"
    158     MEMACCESS(0)
    159     "vst1.32     {d1[0]}, [%0], %4             \n"
    160     MEMACCESS(0)
    161     "vst1.32     {d1[1]}, [%0]                 \n"
    162 
    163     "add         %1, #4                        \n"  // src += 4
    164     "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    165     "subs        %5,  #4                       \n"  // w   -= 4
    166     "beq         4f                            \n"
    167 
    168     // some residual, check to see if it includes a 2x8 block,
    169     // or less
    170     "cmp         %5, #2                        \n"
    171     "blt         3f                            \n"
    172 
    173     // 2x8 block
    174     "2:                                        \n"
    175     "mov         %0, %1                        \n"
    176     MEMACCESS(0)
    177     "vld1.16     {d0[0]}, [%0], %2             \n"
    178     MEMACCESS(0)
    179     "vld1.16     {d1[0]}, [%0], %2             \n"
    180     MEMACCESS(0)
    181     "vld1.16     {d0[1]}, [%0], %2             \n"
    182     MEMACCESS(0)
    183     "vld1.16     {d1[1]}, [%0], %2             \n"
    184     MEMACCESS(0)
    185     "vld1.16     {d0[2]}, [%0], %2             \n"
    186     MEMACCESS(0)
    187     "vld1.16     {d1[2]}, [%0], %2             \n"
    188     MEMACCESS(0)
    189     "vld1.16     {d0[3]}, [%0], %2             \n"
    190     MEMACCESS(0)
    191     "vld1.16     {d1[3]}, [%0]                 \n"
    192 
    193     "vtrn.8      d0, d1                        \n"
    194 
    195     "mov         %0, %3                        \n"
    196 
    197     MEMACCESS(0)
    198     "vst1.64     {d0}, [%0], %4                \n"
    199     MEMACCESS(0)
    200     "vst1.64     {d1}, [%0]                    \n"
    201 
    202     "add         %1, #2                        \n"  // src += 2
    203     "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    204     "subs        %5,  #2                       \n"  // w   -= 2
    205     "beq         4f                            \n"
    206 
    207     // 1x8 block
    208     "3:                                        \n"
    209     MEMACCESS(1)
    210     "vld1.8      {d0[0]}, [%1], %2             \n"
    211     MEMACCESS(1)
    212     "vld1.8      {d0[1]}, [%1], %2             \n"
    213     MEMACCESS(1)
    214     "vld1.8      {d0[2]}, [%1], %2             \n"
    215     MEMACCESS(1)
    216     "vld1.8      {d0[3]}, [%1], %2             \n"
    217     MEMACCESS(1)
    218     "vld1.8      {d0[4]}, [%1], %2             \n"
    219     MEMACCESS(1)
    220     "vld1.8      {d0[5]}, [%1], %2             \n"
    221     MEMACCESS(1)
    222     "vld1.8      {d0[6]}, [%1], %2             \n"
    223     MEMACCESS(1)
    224     "vld1.8      {d0[7]}, [%1]                 \n"
    225 
    226     MEMACCESS(3)
    227     "vst1.64     {d0}, [%3]                    \n"
    228 
    229     "4:                                        \n"
    230 
    231     : "+r"(src_temp),          // %0
    232       "+r"(src),               // %1
    233       "+r"(src_stride),        // %2
    234       "+r"(dst),               // %3
    235       "+r"(dst_stride),        // %4
    236       "+r"(width)              // %5
    237     : "r"(&kVTbl4x4Transpose)  // %6
    238     : "memory", "cc", "q0", "q1", "q2", "q3"
    239   );
    240 }
    241 
    242 static uvec8 kVTbl4x4TransposeDi =
    243   { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
    244 
    245 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    246                          uint8* dst_a, int dst_stride_a,
    247                          uint8* dst_b, int dst_stride_b,
    248                          int width) {
    249   const uint8* src_temp = NULL;
    250   asm volatile (
    251     // loops are on blocks of 8. loop will stop when
    252     // counter gets to or below 0. starting the counter
    253     // at w-8 allow for this
    254     "sub         %7, #8                        \n"
    255 
    256     // handle 8x8 blocks. this should be the majority of the plane
    257     ".p2align  2                               \n"
    258     "1:                                        \n"
    259       "mov         %0, %1                      \n"
    260 
    261       MEMACCESS(0)
    262       "vld2.8      {d0,  d1},  [%0], %2        \n"
    263       MEMACCESS(0)
    264       "vld2.8      {d2,  d3},  [%0], %2        \n"
    265       MEMACCESS(0)
    266       "vld2.8      {d4,  d5},  [%0], %2        \n"
    267       MEMACCESS(0)
    268       "vld2.8      {d6,  d7},  [%0], %2        \n"
    269       MEMACCESS(0)
    270       "vld2.8      {d16, d17}, [%0], %2        \n"
    271       MEMACCESS(0)
    272       "vld2.8      {d18, d19}, [%0], %2        \n"
    273       MEMACCESS(0)
    274       "vld2.8      {d20, d21}, [%0], %2        \n"
    275       MEMACCESS(0)
    276       "vld2.8      {d22, d23}, [%0]            \n"
    277 
    278       "vtrn.8      q1, q0                      \n"
    279       "vtrn.8      q3, q2                      \n"
    280       "vtrn.8      q9, q8                      \n"
    281       "vtrn.8      q11, q10                    \n"
    282 
    283       "vtrn.16     q1, q3                      \n"
    284       "vtrn.16     q0, q2                      \n"
    285       "vtrn.16     q9, q11                     \n"
    286       "vtrn.16     q8, q10                     \n"
    287 
    288       "vtrn.32     q1, q9                      \n"
    289       "vtrn.32     q0, q8                      \n"
    290       "vtrn.32     q3, q11                     \n"
    291       "vtrn.32     q2, q10                     \n"
    292 
    293       "vrev16.8    q0, q0                      \n"
    294       "vrev16.8    q1, q1                      \n"
    295       "vrev16.8    q2, q2                      \n"
    296       "vrev16.8    q3, q3                      \n"
    297       "vrev16.8    q8, q8                      \n"
    298       "vrev16.8    q9, q9                      \n"
    299       "vrev16.8    q10, q10                    \n"
    300       "vrev16.8    q11, q11                    \n"
    301 
    302       "mov         %0, %3                      \n"
    303 
    304     MEMACCESS(0)
    305       "vst1.8      {d2},  [%0], %4             \n"
    306     MEMACCESS(0)
    307       "vst1.8      {d0},  [%0], %4             \n"
    308     MEMACCESS(0)
    309       "vst1.8      {d6},  [%0], %4             \n"
    310     MEMACCESS(0)
    311       "vst1.8      {d4},  [%0], %4             \n"
    312     MEMACCESS(0)
    313       "vst1.8      {d18}, [%0], %4             \n"
    314     MEMACCESS(0)
    315       "vst1.8      {d16}, [%0], %4             \n"
    316     MEMACCESS(0)
    317       "vst1.8      {d22}, [%0], %4             \n"
    318     MEMACCESS(0)
    319       "vst1.8      {d20}, [%0]                 \n"
    320 
    321       "mov         %0, %5                      \n"
    322 
    323     MEMACCESS(0)
    324       "vst1.8      {d3},  [%0], %6             \n"
    325     MEMACCESS(0)
    326       "vst1.8      {d1},  [%0], %6             \n"
    327     MEMACCESS(0)
    328       "vst1.8      {d7},  [%0], %6             \n"
    329     MEMACCESS(0)
    330       "vst1.8      {d5},  [%0], %6             \n"
    331     MEMACCESS(0)
    332       "vst1.8      {d19}, [%0], %6             \n"
    333     MEMACCESS(0)
    334       "vst1.8      {d17}, [%0], %6             \n"
    335     MEMACCESS(0)
    336       "vst1.8      {d23}, [%0], %6             \n"
    337     MEMACCESS(0)
    338       "vst1.8      {d21}, [%0]                 \n"
    339 
    340       "add         %1, #8*2                    \n"  // src   += 8*2
    341       "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
    342       "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
    343       "subs        %7,  #8                     \n"  // w     -= 8
    344       "bge         1b                          \n"
    345 
    346     // add 8 back to counter. if the result is 0 there are
    347     // no residuals.
    348     "adds        %7, #8                        \n"
    349     "beq         4f                            \n"
    350 
    351     // some residual, so between 1 and 7 lines left to transpose
    352     "cmp         %7, #2                        \n"
    353     "blt         3f                            \n"
    354 
    355     "cmp         %7, #4                        \n"
    356     "blt         2f                            \n"
    357 
    358     // TODO(frkoenig): Clean this up
    359     // 4x8 block
    360     "mov         %0, %1                        \n"
    361     MEMACCESS(0)
    362     "vld1.64     {d0}, [%0], %2                \n"
    363     MEMACCESS(0)
    364     "vld1.64     {d1}, [%0], %2                \n"
    365     MEMACCESS(0)
    366     "vld1.64     {d2}, [%0], %2                \n"
    367     MEMACCESS(0)
    368     "vld1.64     {d3}, [%0], %2                \n"
    369     MEMACCESS(0)
    370     "vld1.64     {d4}, [%0], %2                \n"
    371     MEMACCESS(0)
    372     "vld1.64     {d5}, [%0], %2                \n"
    373     MEMACCESS(0)
    374     "vld1.64     {d6}, [%0], %2                \n"
    375     MEMACCESS(0)
    376     "vld1.64     {d7}, [%0]                    \n"
    377 
    378     MEMACCESS(8)
    379     "vld1.8      {q15}, [%8]                   \n"
    380 
    381     "vtrn.8      q0, q1                        \n"
    382     "vtrn.8      q2, q3                        \n"
    383 
    384     "vtbl.8      d16, {d0, d1}, d30            \n"
    385     "vtbl.8      d17, {d0, d1}, d31            \n"
    386     "vtbl.8      d18, {d2, d3}, d30            \n"
    387     "vtbl.8      d19, {d2, d3}, d31            \n"
    388     "vtbl.8      d20, {d4, d5}, d30            \n"
    389     "vtbl.8      d21, {d4, d5}, d31            \n"
    390     "vtbl.8      d22, {d6, d7}, d30            \n"
    391     "vtbl.8      d23, {d6, d7}, d31            \n"
    392 
    393     "mov         %0, %3                        \n"
    394 
    395     MEMACCESS(0)
    396     "vst1.32     {d16[0]},  [%0], %4           \n"
    397     MEMACCESS(0)
    398     "vst1.32     {d16[1]},  [%0], %4           \n"
    399     MEMACCESS(0)
    400     "vst1.32     {d17[0]},  [%0], %4           \n"
    401     MEMACCESS(0)
    402     "vst1.32     {d17[1]},  [%0], %4           \n"
    403 
    404     "add         %0, %3, #4                    \n"
    405     MEMACCESS(0)
    406     "vst1.32     {d20[0]}, [%0], %4            \n"
    407     MEMACCESS(0)
    408     "vst1.32     {d20[1]}, [%0], %4            \n"
    409     MEMACCESS(0)
    410     "vst1.32     {d21[0]}, [%0], %4            \n"
    411     MEMACCESS(0)
    412     "vst1.32     {d21[1]}, [%0]                \n"
    413 
    414     "mov         %0, %5                        \n"
    415 
    416     MEMACCESS(0)
    417     "vst1.32     {d18[0]}, [%0], %6            \n"
    418     MEMACCESS(0)
    419     "vst1.32     {d18[1]}, [%0], %6            \n"
    420     MEMACCESS(0)
    421     "vst1.32     {d19[0]}, [%0], %6            \n"
    422     MEMACCESS(0)
    423     "vst1.32     {d19[1]}, [%0], %6            \n"
    424 
    425     "add         %0, %5, #4                    \n"
    426     MEMACCESS(0)
    427     "vst1.32     {d22[0]},  [%0], %6           \n"
    428     MEMACCESS(0)
    429     "vst1.32     {d22[1]},  [%0], %6           \n"
    430     MEMACCESS(0)
    431     "vst1.32     {d23[0]},  [%0], %6           \n"
    432     MEMACCESS(0)
    433     "vst1.32     {d23[1]},  [%0]               \n"
    434 
    435     "add         %1, #4*2                      \n"  // src   += 4 * 2
    436     "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    437     "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    438     "subs        %7,  #4                       \n"  // w     -= 4
    439     "beq         4f                            \n"
    440 
    441     // some residual, check to see if it includes a 2x8 block,
    442     // or less
    443     "cmp         %7, #2                        \n"
    444     "blt         3f                            \n"
    445 
    446     // 2x8 block
    447     "2:                                        \n"
    448     "mov         %0, %1                        \n"
    449     MEMACCESS(0)
    450     "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
    451     MEMACCESS(0)
    452     "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
    453     MEMACCESS(0)
    454     "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
    455     MEMACCESS(0)
    456     "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
    457     MEMACCESS(0)
    458     "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
    459     MEMACCESS(0)
    460     "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
    461     MEMACCESS(0)
    462     "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
    463     MEMACCESS(0)
    464     "vld2.16     {d1[3], d3[3]}, [%0]          \n"
    465 
    466     "vtrn.8      d0, d1                        \n"
    467     "vtrn.8      d2, d3                        \n"
    468 
    469     "mov         %0, %3                        \n"
    470 
    471     MEMACCESS(0)
    472     "vst1.64     {d0}, [%0], %4                \n"
    473     MEMACCESS(0)
    474     "vst1.64     {d2}, [%0]                    \n"
    475 
    476     "mov         %0, %5                        \n"
    477 
    478     MEMACCESS(0)
    479     "vst1.64     {d1}, [%0], %6                \n"
    480     MEMACCESS(0)
    481     "vst1.64     {d3}, [%0]                    \n"
    482 
    483     "add         %1, #2*2                      \n"  // src   += 2 * 2
    484     "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    485     "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    486     "subs        %7,  #2                       \n"  // w     -= 2
    487     "beq         4f                            \n"
    488 
    489     // 1x8 block
    490     "3:                                        \n"
    491     MEMACCESS(1)
    492     "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
    493     MEMACCESS(1)
    494     "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
    495     MEMACCESS(1)
    496     "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
    497     MEMACCESS(1)
    498     "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
    499     MEMACCESS(1)
    500     "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
    501     MEMACCESS(1)
    502     "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
    503     MEMACCESS(1)
    504     "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
    505     MEMACCESS(1)
    506     "vld2.8      {d0[7], d1[7]}, [%1]          \n"
    507 
    508     MEMACCESS(3)
    509     "vst1.64     {d0}, [%3]                    \n"
    510     MEMACCESS(5)
    511     "vst1.64     {d1}, [%5]                    \n"
    512 
    513     "4:                                        \n"
    514 
    515     : "+r"(src_temp),            // %0
    516       "+r"(src),                 // %1
    517       "+r"(src_stride),          // %2
    518       "+r"(dst_a),               // %3
    519       "+r"(dst_stride_a),        // %4
    520       "+r"(dst_b),               // %5
    521       "+r"(dst_stride_b),        // %6
    522       "+r"(width)                // %7
    523     : "r"(&kVTbl4x4TransposeDi)  // %8
    524     : "memory", "cc",
    525       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
    526   );
    527 }
    528 #endif
    529 
    530 #ifdef __cplusplus
    531 }  // extern "C"
    532 }  // namespace libyuv
    533 #endif
    534