Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/row.h"
     12 
     13 #include "libyuv/basic_types.h"
     14 
     15 #ifdef __cplusplus
     16 namespace libyuv {
     17 extern "C" {
     18 #endif
     19 
     20 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
     21 
// vtbl shuffle indices: picks bytes 0,4,8,12, then 1,5,9,13, etc. — i.e.
// gathers every 4th byte, transposing a 4x4 byte matrix. Used by the 4x8
// residual path of TransposeWx8_NEON below.
     22 static const uvec8 kVTbl4x4Transpose =
     23   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
     24 
// Transposes an 8-row strip of a byte plane: reads 8 rows of |width| bytes
// from src (rows src_stride apart) and writes them as |width| rows of
// 8 bytes to dst (rows dst_stride apart). Works in 8x8 tiles; residual
// widths of 4, 2 and 1 fall through to the numbered sections after the
// main loop. r9 is a scratch row pointer and is declared in the clobbers.
     25 void TransposeWx8_NEON(const uint8* src, int src_stride,
     26                        uint8* dst, int dst_stride,
     27                        int width) {
     28   asm volatile (
     29     // loops are on blocks of 8. loop will stop when
     30     // counter gets to or below 0. starting the counter
     31     // at w-8 allow for this
     32     "sub         %4, #8                        \n"
     33 
     34     // handle 8x8 blocks. this should be the majority of the plane
     35     ".p2align  4                               \n"
     36     "1:                                        \n"
     37       "mov         r9, %0                      \n"
     38 
     39       "vld1.8      {d0}, [r9], %1              \n"
     40       "vld1.8      {d1}, [r9], %1              \n"
     41       "vld1.8      {d2}, [r9], %1              \n"
     42       "vld1.8      {d3}, [r9], %1              \n"
     43       "vld1.8      {d4}, [r9], %1              \n"
     44       "vld1.8      {d5}, [r9], %1              \n"
     45       "vld1.8      {d6}, [r9], %1              \n"
     46       "vld1.8      {d7}, [r9]                  \n"
     47 
      // In-register 8x8 byte transpose: vtrn.8/.16/.32 swap 1-, 2- and
      // 4-byte sub-blocks between register pairs, then vrev16.8 restores
      // the byte order within each 16-bit lane.
     48       "vtrn.8      d1, d0                      \n"
     49       "vtrn.8      d3, d2                      \n"
     50       "vtrn.8      d5, d4                      \n"
     51       "vtrn.8      d7, d6                      \n"
     52 
     53       "vtrn.16     d1, d3                      \n"
     54       "vtrn.16     d0, d2                      \n"
     55       "vtrn.16     d5, d7                      \n"
     56       "vtrn.16     d4, d6                      \n"
     57 
     58       "vtrn.32     d1, d5                      \n"
     59       "vtrn.32     d0, d4                      \n"
     60       "vtrn.32     d3, d7                      \n"
     61       "vtrn.32     d2, d6                      \n"
     62 
     63       "vrev16.8    q0, q0                      \n"
     64       "vrev16.8    q1, q1                      \n"
     65       "vrev16.8    q2, q2                      \n"
     66       "vrev16.8    q3, q3                      \n"
     67 
     68       "mov         r9, %2                      \n"
     69 
     70       "vst1.8      {d1}, [r9], %3              \n"
     71       "vst1.8      {d0}, [r9], %3              \n"
     72       "vst1.8      {d3}, [r9], %3              \n"
     73       "vst1.8      {d2}, [r9], %3              \n"
     74       "vst1.8      {d5}, [r9], %3              \n"
     75       "vst1.8      {d4}, [r9], %3              \n"
     76       "vst1.8      {d7}, [r9], %3              \n"
     77       "vst1.8      {d6}, [r9]                  \n"
     78 
     79       "add         %0, #8                      \n"  // src += 8
     80       "add         %2, %2, %3, lsl #3          \n"  // dst += 8 * dst_stride
     81       "subs        %4,  #8                     \n"  // w   -= 8
     82       "bge         1b                          \n"
     83 
     84     // add 8 back to counter. if the result is 0 there are
     85     // no residuals.
     86     "adds        %4, #8                        \n"
     87     "beq         4f                            \n"
     88 
     89     // some residual, so between 1 and 7 lines left to transpose
     90     "cmp         %4, #2                        \n"
     91     "blt         3f                            \n"
     92 
     93     "cmp         %4, #4                        \n"
     94     "blt         2f                            \n"
     95 
     96     // 4x8 block
     97     "mov         r9, %0                        \n"
     98     "vld1.32     {d0[0]}, [r9], %1             \n"
     99     "vld1.32     {d0[1]}, [r9], %1             \n"
    100     "vld1.32     {d1[0]}, [r9], %1             \n"
    101     "vld1.32     {d1[1]}, [r9], %1             \n"
    102     "vld1.32     {d2[0]}, [r9], %1             \n"
    103     "vld1.32     {d2[1]}, [r9], %1             \n"
    104     "vld1.32     {d3[0]}, [r9], %1             \n"
    105     "vld1.32     {d3[1]}, [r9]                 \n"
    106 
    107     "mov         r9, %2                        \n"
    108 
    // d6/d7 now hold kVTbl4x4Transpose (%5); the vtbl gathers every 4th
    // byte, transposing the two 4x4 halves loaded above.
    109     "vld1.8      {q3}, [%5]                    \n"
    110 
    111     "vtbl.8      d4, {d0, d1}, d6              \n"
    112     "vtbl.8      d5, {d0, d1}, d7              \n"
    113     "vtbl.8      d0, {d2, d3}, d6              \n"
    114     "vtbl.8      d1, {d2, d3}, d7              \n"
    115 
    116     // TODO: rework shuffle above to write
    117     //       out with 4 instead of 8 writes
    118     "vst1.32     {d4[0]}, [r9], %3             \n"
    119     "vst1.32     {d4[1]}, [r9], %3             \n"
    120     "vst1.32     {d5[0]}, [r9], %3             \n"
    121     "vst1.32     {d5[1]}, [r9]                 \n"
    122 
    123     "add         r9, %2, #4                    \n"
    124     "vst1.32     {d0[0]}, [r9], %3             \n"
    125     "vst1.32     {d0[1]}, [r9], %3             \n"
    126     "vst1.32     {d1[0]}, [r9], %3             \n"
    127     "vst1.32     {d1[1]}, [r9]                 \n"
    128 
    129     "add         %0, #4                        \n"  // src += 4
    130     "add         %2, %2, %3, lsl #2            \n"  // dst += 4 * dst_stride
    131     "subs        %4,  #4                       \n"  // w   -= 4
    132     "beq         4f                            \n"
    133 
    134     // some residual, check to see if it includes a 2x8 block,
    135     // or less
    136     "cmp         %4, #2                        \n"
    137     "blt         3f                            \n"
    138 
    139     // 2x8 block
    140     "2:                                        \n"
    141     "mov         r9, %0                        \n"
    142     "vld1.16     {d0[0]}, [r9], %1             \n"
    143     "vld1.16     {d1[0]}, [r9], %1             \n"
    144     "vld1.16     {d0[1]}, [r9], %1             \n"
    145     "vld1.16     {d1[1]}, [r9], %1             \n"
    146     "vld1.16     {d0[2]}, [r9], %1             \n"
    147     "vld1.16     {d1[2]}, [r9], %1             \n"
    148     "vld1.16     {d0[3]}, [r9], %1             \n"
    149     "vld1.16     {d1[3]}, [r9]                 \n"
    150 
    151     "vtrn.8      d0, d1                        \n"
    152 
    153     "mov         r9, %2                        \n"
    154 
    155     "vst1.64     {d0}, [r9], %3                \n"
    156     "vst1.64     {d1}, [r9]                    \n"
    157 
    158     "add         %0, #2                        \n"  // src += 2
    159     "add         %2, %2, %3, lsl #1            \n"  // dst += 2 * dst_stride
    160     "subs        %4,  #2                       \n"  // w   -= 2
    161     "beq         4f                            \n"
    162 
    163     // 1x8 block
    164     "3:                                        \n"
    165     "vld1.8      {d0[0]}, [%0], %1             \n"
    166     "vld1.8      {d0[1]}, [%0], %1             \n"
    167     "vld1.8      {d0[2]}, [%0], %1             \n"
    168     "vld1.8      {d0[3]}, [%0], %1             \n"
    169     "vld1.8      {d0[4]}, [%0], %1             \n"
    170     "vld1.8      {d0[5]}, [%0], %1             \n"
    171     "vld1.8      {d0[6]}, [%0], %1             \n"
    172     "vld1.8      {d0[7]}, [%0]                 \n"
    173 
    174     "vst1.64     {d0}, [%2]                    \n"
    175 
    176     "4:                                        \n"
    177 
    178     : "+r"(src),               // %0
    179       "+r"(src_stride),        // %1
    180       "+r"(dst),               // %2
    181       "+r"(dst_stride),        // %3
    182       "+r"(width)              // %4
    183     : "r"(&kVTbl4x4Transpose)  // %5
    184     : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
    185   );
    186 }
    187 
// vtbl shuffle indices: zips (interleaves) the bytes of the low and high
// source registers — 0,8,1,9,2,10,... Used by the 4x8 residual path of
// TransposeUVWx8_NEON below.
    188 static const uvec8 kVTbl4x4TransposeDi =
    189   { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
    190 
    191 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    192                          uint8* dst_a, int dst_stride_a,
    193                          uint8* dst_b, int dst_stride_b,
    194                          int width) {
    195   asm volatile (
    196     // loops are on blocks of 8. loop will stop when
    197     // counter gets to or below 0. starting the counter
    198     // at w-8 allow for this
    199     "sub         %6, #8                        \n"
    200 
    201     // handle 8x8 blocks. this should be the majority of the plane
    202     ".p2align  4                               \n"
    203     "1:                                        \n"
    204       "mov         r9, %0                      \n"
    205 
    206       "vld2.8      {d0,  d1},  [r9], %1        \n"
    207       "vld2.8      {d2,  d3},  [r9], %1        \n"
    208       "vld2.8      {d4,  d5},  [r9], %1        \n"
    209       "vld2.8      {d6,  d7},  [r9], %1        \n"
    210       "vld2.8      {d16, d17}, [r9], %1        \n"
    211       "vld2.8      {d18, d19}, [r9], %1        \n"
    212       "vld2.8      {d20, d21}, [r9], %1        \n"
    213       "vld2.8      {d22, d23}, [r9]            \n"
    214 
    215       "vtrn.8      q1, q0                      \n"
    216       "vtrn.8      q3, q2                      \n"
    217       "vtrn.8      q9, q8                      \n"
    218       "vtrn.8      q11, q10                    \n"
    219 
    220       "vtrn.16     q1, q3                      \n"
    221       "vtrn.16     q0, q2                      \n"
    222       "vtrn.16     q9, q11                     \n"
    223       "vtrn.16     q8, q10                     \n"
    224 
    225       "vtrn.32     q1, q9                      \n"
    226       "vtrn.32     q0, q8                      \n"
    227       "vtrn.32     q3, q11                     \n"
    228       "vtrn.32     q2, q10                     \n"
    229 
    230       "vrev16.8    q0, q0                      \n"
    231       "vrev16.8    q1, q1                      \n"
    232       "vrev16.8    q2, q2                      \n"
    233       "vrev16.8    q3, q3                      \n"
    234       "vrev16.8    q8, q8                      \n"
    235       "vrev16.8    q9, q9                      \n"
    236       "vrev16.8    q10, q10                    \n"
    237       "vrev16.8    q11, q11                    \n"
    238 
    239       "mov         r9, %2                      \n"
    240 
    241       "vst1.8      {d2},  [r9], %3             \n"
    242       "vst1.8      {d0},  [r9], %3             \n"
    243       "vst1.8      {d6},  [r9], %3             \n"
    244       "vst1.8      {d4},  [r9], %3             \n"
    245       "vst1.8      {d18}, [r9], %3             \n"
    246       "vst1.8      {d16}, [r9], %3             \n"
    247       "vst1.8      {d22}, [r9], %3             \n"
    248       "vst1.8      {d20}, [r9]                 \n"
    249 
    250       "mov         r9, %4                      \n"
    251 
    252       "vst1.8      {d3},  [r9], %5             \n"
    253       "vst1.8      {d1},  [r9], %5             \n"
    254       "vst1.8      {d7},  [r9], %5             \n"
    255       "vst1.8      {d5},  [r9], %5             \n"
    256       "vst1.8      {d19}, [r9], %5             \n"
    257       "vst1.8      {d17}, [r9], %5             \n"
    258       "vst1.8      {d23}, [r9], %5             \n"
    259       "vst1.8      {d21}, [r9]                 \n"
    260 
    261       "add         %0, #8*2                    \n"  // src   += 8*2
    262       "add         %2, %2, %3, lsl #3          \n"  // dst_a += 8 * dst_stride_a
    263       "add         %4, %4, %5, lsl #3          \n"  // dst_b += 8 * dst_stride_b
    264       "subs        %6,  #8                     \n"  // w     -= 8
    265       "bge         1b                          \n"
    266 
    267     // add 8 back to counter. if the result is 0 there are
    268     // no residuals.
    269     "adds        %6, #8                        \n"
    270     "beq         4f                            \n"
    271 
    272     // some residual, so between 1 and 7 lines left to transpose
    273     "cmp         %6, #2                        \n"
    274     "blt         3f                            \n"
    275 
    276     "cmp         %6, #4                        \n"
    277     "blt         2f                            \n"
    278 
    279     //TODO(frkoenig) : clean this up
    280     // 4x8 block
    281     "mov         r9, %0                        \n"
    282     "vld1.64     {d0}, [r9], %1                \n"
    283     "vld1.64     {d1}, [r9], %1                \n"
    284     "vld1.64     {d2}, [r9], %1                \n"
    285     "vld1.64     {d3}, [r9], %1                \n"
    286     "vld1.64     {d4}, [r9], %1                \n"
    287     "vld1.64     {d5}, [r9], %1                \n"
    288     "vld1.64     {d6}, [r9], %1                \n"
    289     "vld1.64     {d7}, [r9]                    \n"
    290 
    291     "vld1.8      {q15}, [%7]                   \n"
    292 
    293     "vtrn.8      q0, q1                        \n"
    294     "vtrn.8      q2, q3                        \n"
    295 
    296     "vtbl.8      d16, {d0, d1}, d30            \n"
    297     "vtbl.8      d17, {d0, d1}, d31            \n"
    298     "vtbl.8      d18, {d2, d3}, d30            \n"
    299     "vtbl.8      d19, {d2, d3}, d31            \n"
    300     "vtbl.8      d20, {d4, d5}, d30            \n"
    301     "vtbl.8      d21, {d4, d5}, d31            \n"
    302     "vtbl.8      d22, {d6, d7}, d30            \n"
    303     "vtbl.8      d23, {d6, d7}, d31            \n"
    304 
    305     "mov         r9, %2                        \n"
    306 
    307     "vst1.32     {d16[0]},  [r9], %3           \n"
    308     "vst1.32     {d16[1]},  [r9], %3           \n"
    309     "vst1.32     {d17[0]},  [r9], %3           \n"
    310     "vst1.32     {d17[1]},  [r9], %3           \n"
    311 
    312     "add         r9, %2, #4                    \n"
    313     "vst1.32     {d20[0]}, [r9], %3            \n"
    314     "vst1.32     {d20[1]}, [r9], %3            \n"
    315     "vst1.32     {d21[0]}, [r9], %3            \n"
    316     "vst1.32     {d21[1]}, [r9]                \n"
    317 
    318     "mov         r9, %4                        \n"
    319 
    320     "vst1.32     {d18[0]}, [r9], %5            \n"
    321     "vst1.32     {d18[1]}, [r9], %5            \n"
    322     "vst1.32     {d19[0]}, [r9], %5            \n"
    323     "vst1.32     {d19[1]}, [r9], %5            \n"
    324 
    325     "add         r9, %4, #4                    \n"
    326     "vst1.32     {d22[0]},  [r9], %5           \n"
    327     "vst1.32     {d22[1]},  [r9], %5           \n"
    328     "vst1.32     {d23[0]},  [r9], %5           \n"
    329     "vst1.32     {d23[1]},  [r9]               \n"
    330 
    331     "add         %0, #4*2                      \n"  // src   += 4 * 2
    332     "add         %2, %2, %3, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    333     "add         %4, %4, %5, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    334     "subs        %6,  #4                       \n"  // w     -= 4
    335     "beq         4f                            \n"
    336 
    337     // some residual, check to see if it includes a 2x8 block,
    338     // or less
    339     "cmp         %6, #2                        \n"
    340     "blt         3f                            \n"
    341 
    342     // 2x8 block
    343     "2:                                        \n"
    344     "mov         r9, %0                        \n"
    345     "vld2.16     {d0[0], d2[0]}, [r9], %1      \n"
    346     "vld2.16     {d1[0], d3[0]}, [r9], %1      \n"
    347     "vld2.16     {d0[1], d2[1]}, [r9], %1      \n"
    348     "vld2.16     {d1[1], d3[1]}, [r9], %1      \n"
    349     "vld2.16     {d0[2], d2[2]}, [r9], %1      \n"
    350     "vld2.16     {d1[2], d3[2]}, [r9], %1      \n"
    351     "vld2.16     {d0[3], d2[3]}, [r9], %1      \n"
    352     "vld2.16     {d1[3], d3[3]}, [r9]          \n"
    353 
    354     "vtrn.8      d0, d1                        \n"
    355     "vtrn.8      d2, d3                        \n"
    356 
    357     "mov         r9, %2                        \n"
    358 
    359     "vst1.64     {d0}, [r9], %3                \n"
    360     "vst1.64     {d2}, [r9]                    \n"
    361 
    362     "mov         r9, %4                        \n"
    363 
    364     "vst1.64     {d1}, [r9], %5                \n"
    365     "vst1.64     {d3}, [r9]                    \n"
    366 
    367     "add         %0, #2*2                      \n"  // src   += 2 * 2
    368     "add         %2, %2, %3, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    369     "add         %4, %4, %5, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    370     "subs        %6,  #2                       \n"  // w     -= 2
    371     "beq         4f                            \n"
    372 
    373     // 1x8 block
    374     "3:                                        \n"
    375     "vld2.8      {d0[0], d1[0]}, [%0], %1      \n"
    376     "vld2.8      {d0[1], d1[1]}, [%0], %1      \n"
    377     "vld2.8      {d0[2], d1[2]}, [%0], %1      \n"
    378     "vld2.8      {d0[3], d1[3]}, [%0], %1      \n"
    379     "vld2.8      {d0[4], d1[4]}, [%0], %1      \n"
    380     "vld2.8      {d0[5], d1[5]}, [%0], %1      \n"
    381     "vld2.8      {d0[6], d1[6]}, [%0], %1      \n"
    382     "vld2.8      {d0[7], d1[7]}, [%0]          \n"
    383 
    384     "vst1.64     {d0}, [%2]                    \n"
    385     "vst1.64     {d1}, [%4]                    \n"
    386 
    387     "4:                                        \n"
    388 
    389     : "+r"(src),                 // %0
    390       "+r"(src_stride),          // %1
    391       "+r"(dst_a),               // %2
    392       "+r"(dst_stride_a),        // %3
    393       "+r"(dst_b),               // %4
    394       "+r"(dst_stride_b),        // %5
    395       "+r"(width)                // %6
    396     : "r"(&kVTbl4x4TransposeDi)  // %7
    397     : "memory", "cc", "r9",
    398       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
    399   );
    400 }
    401 #endif
    402 
    403 #ifdef __cplusplus
    404 }  // extern "C"
    405 }  // namespace libyuv
    406 #endif
    407