/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"

// Transpose 64 bit elements as follows:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
//
// b0.val[0]: 00 01 02 03 10 11 12 13
// b0.val[1]: 04 05 06 07 14 15 16 17
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
  int64x2x2_t b0;
  b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
                           vreinterpret_s64_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
                           vreinterpret_s64_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
  uint8x16x2_t b0;
  b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                          vreinterpret_u8_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
                          vreinterpret_u8_u32(vget_high_u32(a1)));
  return b0;
}

static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}
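
// Viewing each 128-bit input as a pair of 64-bit halves, the helpers above
// perform a 2x2 transpose of those halves: val[0] gathers the low halves of
// a0 and a1, val[1] the high halves. E.g. with 32-bit lanes a0 = {A0 A1 A2 A3}
// and a1 = {B0 B1 B2 B3}, vpx_vtrnq_s64_to_s32() returns
// val[0] = {A0 A1 B0 B1} and val[1] = {A2 A3 B2 B3}.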

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21  02 03 22 23
  // c0.val[1]: 10 11 30 31  12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}
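
// Illustrative usage sketch (not part of the original header): transpose a
// 4x4 byte block stored contiguously (row stride of 4) in `buf`, a
// hypothetical buffer name. Note that transpose_u8_4x4() leaves output rows
// 0 and 2 in a0 and rows 1 and 3 in a1, so the halves are re-interleaved
// before storing.
static INLINE void transpose_u8_4x4_example(uint8_t *buf) {
  uint8x8_t a0 = vld1_u8(buf);      // input rows 0 and 1
  uint8x8_t a1 = vld1_u8(buf + 8);  // input rows 2 and 3
  uint32x2x2_t z;
  transpose_u8_4x4(&a0, &a1);
  // a0 = output rows 0 and 2, a1 = output rows 1 and 3.
  z = vzip_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a1));
  vst1_u8(buf, vreinterpret_u8_u32(z.val[0]));      // output rows 0 and 1
  vst1_u8(buf + 8, vreinterpret_u8_u32(z.val[1]));  // output rows 2 and 3
}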

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}
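
// Illustrative usage sketch (not part of the original header): transpose a
// 4x4 block of int16_t in place, with rows `stride` elements apart. `buf`
// and `stride` are hypothetical names.
static INLINE void transpose_s16_4x4_example(int16_t *buf, int stride) {
  int16x4_t r0 = vld1_s16(buf + 0 * stride);
  int16x4_t r1 = vld1_s16(buf + 1 * stride);
  int16x4_t r2 = vld1_s16(buf + 2 * stride);
  int16x4_t r3 = vld1_s16(buf + 3 * stride);
  transpose_s16_4x4d(&r0, &r1, &r2, &r3);
  vst1_s16(buf + 0 * stride, r0);
  vst1_s16(buf + 1 * stride, r1);
  vst1_s16(buf + 2 * stride, r2);
  vst1_s16(buf + 3 * stride, r3);
}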

static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const int32x4x2_t b0 =
      vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));

  // Swap 64 bit elements resulting in:
  // c0: 00 01 20 21  02 03 22 23
  // c1: 10 11 30 31  12 13 32 33

  const int32x4_t c0 =
      vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1]));
  const int32x4_t c1 =
      vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1]));

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const int16x8x2_t d0 =
      vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const uint32x4x2_t b0 =
      vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));

  // Swap 64 bit elements resulting in:
  // c0: 00 01 20 21  02 03 22 23
  // c1: 10 11 30 31  12 13 32 33

  const uint32x4_t c0 =
      vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1]));
  const uint32x4_t c1 =
      vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1]));

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const uint16x8x2_t d0 =
      vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}
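
// Illustrative usage sketch (not part of the original header): in-place
// transpose of a contiguous, row-major 4x4 int32_t block (`buf` is a
// hypothetical name).
static INLINE void transpose_s32_4x4_example(int32_t *buf) {
  int32x4_t r0 = vld1q_s32(buf + 0);
  int32x4_t r1 = vld1q_s32(buf + 4);
  int32x4_t r2 = vld1q_s32(buf + 8);
  int32x4_t r3 = vld1q_s32(buf + 12);
  transpose_s32_4x4(&r0, &r1, &r2, &r3);
  vst1q_s32(buf + 0, r0);
  vst1q_s32(buf + 4, r1);
  vst1q_s32(buf + 8, r2);
  vst1q_s32(buf + 12, r3);
}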

static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                     const int16x4_t a2, const int16x4_t a3,
                                     const int16x4_t a4, const int16x4_t a5,
                                     const int16x4_t a6, const int16x4_t a7,
                                     int16x8_t *const o0, int16x8_t *const o1,
                                     int16x8_t *const o2, int16x8_t *const o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  const int16x4x2_t b0 = vtrn_s16(a0, a1);
  const int16x4x2_t b1 = vtrn_s16(a2, a3);
  const int16x4x2_t b2 = vtrn_s16(a4, a5);
  const int16x4x2_t b3 = vtrn_s16(a6, a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));
  const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                                  vreinterpret_s32_s16(b3.val[0]));
  const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                                  vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}
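
// Illustrative usage sketch (not part of the original header): gather eight
// 4-wide rows (rows `stride` int16_t apart) into four 8-wide transposed
// rows. `src` and `stride` are hypothetical names.
static INLINE void transpose_s16_4x8_example(const int16_t *src, int stride,
                                             int16x8_t *t0, int16x8_t *t1,
                                             int16x8_t *t2, int16x8_t *t3) {
  const int16x4_t a0 = vld1_s16(src + 0 * stride);
  const int16x4_t a1 = vld1_s16(src + 1 * stride);
  const int16x4_t a2 = vld1_s16(src + 2 * stride);
  const int16x4_t a3 = vld1_s16(src + 3 * stride);
  const int16x4_t a4 = vld1_s16(src + 4 * stride);
  const int16x4_t a5 = vld1_s16(src + 5 * stride);
  const int16x4_t a6 = vld1_s16(src + 6 * stride);
  const int16x4_t a7 = vld1_s16(src + 7 * stride);
  transpose_s16_4x8(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t2, t3);
}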

static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
  const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c2.val[0]);
  *a2 = vreinterpretq_s32_s64(c1.val[0]);
  *a3 = vreinterpretq_s32_s64(c3.val[0]);
  *a4 = vreinterpretq_s32_s64(c0.val[1]);
  *a5 = vreinterpretq_s32_s64(c2.val[1]);
  *a6 = vreinterpretq_s32_s64(c1.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  *a0 = vreinterpretq_u16_u32(c0.val[0]);
  *a1 = vreinterpretq_u16_u32(c1.val[0]);
  *a2 = vreinterpretq_u16_u32(c0.val[1]);
  *a3 = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 04 05 06 07
  // a2: 10 11 12 13
  // a3: 14 15 16 17
  // a4: 20 21 22 23
  // a5: 24 25 26 27
  // a6: 30 31 32 33
  // a7: 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 04 14 06 16
  // b1.val[1]: 05 15 07 17
  // b2.val[0]: 20 30 22 32
  // b2.val[1]: 21 31 23 33
  // b3.val[0]: 24 34 26 36
  // b3.val[1]: 25 35 27 37

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
  const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
  const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 04 14 24 34
  // c2.val[1]: 06 16 26 36
  // c3.val[0]: 05 15 25 35
  // c3.val[1]: 07 17 27 37

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c1.val[0]);
  *a2 = vreinterpretq_s32_s64(c0.val[1]);
  *a3 = vreinterpretq_s32_s64(c1.val[1]);
  *a4 = vreinterpretq_s32_s64(c2.val[0]);
  *a5 = vreinterpretq_s32_s64(c3.val[0]);
  *a6 = vreinterpretq_s32_s64(c2.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
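
// Illustrative usage sketch (not part of the original header): transpose an
// 8x8 byte block from `src` to `dst`, with rows `src_stride`/`dst_stride`
// bytes apart (all four names are hypothetical).
static INLINE void transpose_u8_8x8_example(const uint8_t *src, int src_stride,
                                            uint8_t *dst, int dst_stride) {
  uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
  uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
  uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
  uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
  uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
  uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
  uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
  uint8x8_t r7 = vld1_u8(src + 7 * src_stride);
  transpose_u8_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  vst1_u8(dst + 0 * dst_stride, r0);
  vst1_u8(dst + 1 * dst_stride, r1);
  vst1_u8(dst + 2 * dst_stride, r2);
  vst1_u8(dst + 3 * dst_stride, r3);
  vst1_u8(dst + 4 * dst_stride, r4);
  vst1_u8(dst + 5 * dst_stride, r5);
  vst1_u8(dst + 6 * dst_stride, r6);
  vst1_u8(dst + 7 * dst_stride, r7);
}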

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}
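
// Illustrative usage sketch (not part of the original header): in-place
// transpose of an 8x8 block of int16_t coefficients with rows `stride`
// elements apart (`buf` and `stride` are hypothetical names).
static INLINE void transpose_s16_8x8_example(int16_t *buf, int stride) {
  int16x8_t r0 = vld1q_s16(buf + 0 * stride);
  int16x8_t r1 = vld1q_s16(buf + 1 * stride);
  int16x8_t r2 = vld1q_s16(buf + 2 * stride);
  int16x8_t r3 = vld1q_s16(buf + 3 * stride);
  int16x8_t r4 = vld1q_s16(buf + 4 * stride);
  int16x8_t r5 = vld1q_s16(buf + 5 * stride);
  int16x8_t r6 = vld1q_s16(buf + 6 * stride);
  int16x8_t r7 = vld1q_s16(buf + 7 * stride);
  transpose_s16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  vst1q_s16(buf + 0 * stride, r0);
  vst1q_s16(buf + 1 * stride, r1);
  vst1q_s16(buf + 2 * stride, r2);
  vst1q_s16(buf + 3 * stride, r3);
  vst1q_s16(buf + 4 * stride, r4);
  vst1q_s16(buf + 5 * stride, r5);
  vst1q_s16(buf + 6 * stride, r6);
  vst1q_s16(buf + 7 * stride, r7);
}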

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
                                     int32x4x2_t *a2, int32x4x2_t *a3,
                                     int32x4x2_t *a4, int32x4x2_t *a5,
                                     int32x4x2_t *a6, int32x4x2_t *a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0: 00 10 02 12 01 11 03 13
  // b1: 20 30 22 32 21 31 23 33
  // b2: 40 50 42 52 41 51 43 53
  // b3: 60 70 62 72 61 71 63 73
  // b4: 04 14 06 16 05 15 07 17
  // b5: 24 34 26 36 25 35 27 37
  // b6: 44 54 46 56 45 55 47 57
  // b7: 64 74 66 76 65 75 67 77

  const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
  const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
  const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
  const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
  const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
  const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
  const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
  const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);

  // Swap 64 bit elements resulting in:
  // c0: 00 10 20 30 02 12 22 32
  // c1: 01 11 21 31 03 13 23 33
  // c2: 40 50 60 70 42 52 62 72
  // c3: 41 51 61 71 43 53 63 73
  // c4: 04 14 24 34 06 16 26 36
  // c5: 05 15 25 35 07 17 27 37
  // c6: 44 54 64 74 46 56 66 76
  // c7: 45 55 65 75 47 57 67 77
  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
  const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
  const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
  const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
  const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
  const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
  const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);

  // Swap 128 bit elements resulting in:
  // a0: 00 10 20 30 40 50 60 70
  // a1: 01 11 21 31 41 51 61 71
  // a2: 02 12 22 32 42 52 62 72
  // a3: 03 13 23 33 43 53 63 73
  // a4: 04 14 24 34 44 54 64 74
  // a5: 05 15 25 35 45 55 65 75
  // a6: 06 16 26 36 46 56 66 76
  // a7: 07 17 27 37 47 57 67 77
  a0->val[0] = c0.val[0];
  a0->val[1] = c2.val[0];
  a1->val[0] = c1.val[0];
  a1->val[1] = c3.val[0];
  a2->val[0] = c0.val[1];
  a2->val[1] = c2.val[1];
  a3->val[0] = c1.val[1];
  a3->val[1] = c3.val[1];
  a4->val[0] = c4.val[0];
  a4->val[1] = c6.val[0];
  a5->val[0] = c5.val[0];
  a5->val[1] = c7.val[0];
  a6->val[0] = c4.val[1];
  a6->val[1] = c6.val[1];
  a7->val[0] = c5.val[1];
  a7->val[1] = c7.val[1];
}
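
// Illustrative usage sketch (not part of the original header): each
// int32x4x2_t above holds one 8-lane row as a low/high register pair. This
// transposes a contiguous, row-major 8x8 int32_t block in place (`buf` is a
// hypothetical name).
static INLINE void transpose_s32_8x8_example(int32_t *buf) {
  int32x4x2_t r[8];
  int i;
  for (i = 0; i < 8; ++i) {
    r[i].val[0] = vld1q_s32(buf + i * 8);
    r[i].val[1] = vld1q_s32(buf + i * 8 + 4);
  }
  transpose_s32_8x8(&r[0], &r[1], &r[2], &r[3], &r[4], &r[5], &r[6], &r[7]);
  for (i = 0; i < 8; ++i) {
    vst1q_s32(buf + i * 8, r[i].val[0]);
    vst1q_s32(buf + i * 8 + 4, r[i].val[1]);
  }
}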

static INLINE void transpose_u8_16x8(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
    uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
  // Swap 8 bit elements. Goes from:
  // i0: 00 01 02 03 04 05 06 07  08 09 0A 0B 0C 0D 0E 0F
  // i1: 10 11 12 13 14 15 16 17  18 19 1A 1B 1C 1D 1E 1F
  // i2: 20 21 22 23 24 25 26 27  28 29 2A 2B 2C 2D 2E 2F
  // i3: 30 31 32 33 34 35 36 37  38 39 3A 3B 3C 3D 3E 3F
  // i4: 40 41 42 43 44 45 46 47  48 49 4A 4B 4C 4D 4E 4F
  // i5: 50 51 52 53 54 55 56 57  58 59 5A 5B 5C 5D 5E 5F
  // i6: 60 61 62 63 64 65 66 67  68 69 6A 6B 6C 6D 6E 6F
  // i7: 70 71 72 73 74 75 76 77  78 79 7A 7B 7C 7D 7E 7F
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  08 18 0A 1A 0C 1C 0E 1E
  // b0.val[1]: 01 11 03 13 05 15 07 17  09 19 0B 1B 0D 1D 0F 1F
  // b1.val[0]: 20 30 22 32 24 34 26 36  28 38 2A 3A 2C 3C 2E 3E
  // b1.val[1]: 21 31 23 33 25 35 27 37  29 39 2B 3B 2D 3D 2F 3F
  // b2.val[0]: 40 50 42 52 44 54 46 56  48 58 4A 5A 4C 5C 4E 5E
  // b2.val[1]: 41 51 43 53 45 55 47 57  49 59 4B 5B 4D 5D 4F 5F
  // b3.val[0]: 60 70 62 72 64 74 66 76  68 78 6A 7A 6C 7C 6E 7E
  // b3.val[1]: 61 71 63 73 65 75 67 77  69 79 6B 7B 6D 7D 6F 7F
  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  08 18 28 38 0C 1C 2C 3C
  // c0.val[1]: 02 12 22 32 06 16 26 36  0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35  09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37  0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74  48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76  4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75  49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77  4B 5B 6B 7B 4F 5F 6F 7F
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74  0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72  0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76  0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75  0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73  0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77  0F 1F 2F 3F 4F 5F 6F 7F
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  // Output:
  // o0 : 00 10 20 30 40 50 60 70
  // o1 : 01 11 21 31 41 51 61 71
  // o2 : 02 12 22 32 42 52 62 72
  // o3 : 03 13 23 33 43 53 63 73
  // o4 : 04 14 24 34 44 54 64 74
  // o5 : 05 15 25 35 45 55 65 75
  // o6 : 06 16 26 36 46 56 66 76
  // o7 : 07 17 27 37 47 57 67 77
  // o8 : 08 18 28 38 48 58 68 78
  // o9 : 09 19 29 39 49 59 69 79
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F
  *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
  *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
}

static INLINE void transpose_u8_8x16(
    const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
    const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
    const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
    const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
    const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
    const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7) {
  // Combine 8 bit elements. Goes from:
  // i0 : 00 01 02 03 04 05 06 07
  // i1 : 10 11 12 13 14 15 16 17
  // i2 : 20 21 22 23 24 25 26 27
  // i3 : 30 31 32 33 34 35 36 37
  // i4 : 40 41 42 43 44 45 46 47
  // i5 : 50 51 52 53 54 55 56 57
  // i6 : 60 61 62 63 64 65 66 67
  // i7 : 70 71 72 73 74 75 76 77
  // i8 : 80 81 82 83 84 85 86 87
  // i9 : 90 91 92 93 94 95 96 97
  // i10: A0 A1 A2 A3 A4 A5 A6 A7
  // i11: B0 B1 B2 B3 B4 B5 B6 B7
  // i12: C0 C1 C2 C3 C4 C5 C6 C7
  // i13: D0 D1 D2 D3 D4 D5 D6 D7
  // i14: E0 E1 E2 E3 E4 E5 E6 E7
  // i15: F0 F1 F2 F3 F4 F5 F6 F7
  // to:
  // a0: 00 01 02 03 04 05 06 07  80 81 82 83 84 85 86 87
  // a1: 10 11 12 13 14 15 16 17  90 91 92 93 94 95 96 97
  // a2: 20 21 22 23 24 25 26 27  A0 A1 A2 A3 A4 A5 A6 A7
  // a3: 30 31 32 33 34 35 36 37  B0 B1 B2 B3 B4 B5 B6 B7
  // a4: 40 41 42 43 44 45 46 47  C0 C1 C2 C3 C4 C5 C6 C7
  // a5: 50 51 52 53 54 55 56 57  D0 D1 D2 D3 D4 D5 D6 D7
  // a6: 60 61 62 63 64 65 66 67  E0 E1 E2 E3 E4 E5 E6 E7
  // a7: 70 71 72 73 74 75 76 77  F0 F1 F2 F3 F4 F5 F6 F7
  const uint8x16_t a0 = vcombine_u8(i0, i8);
  const uint8x16_t a1 = vcombine_u8(i1, i9);
  const uint8x16_t a2 = vcombine_u8(i2, i10);
  const uint8x16_t a3 = vcombine_u8(i3, i11);
  const uint8x16_t a4 = vcombine_u8(i4, i12);
  const uint8x16_t a5 = vcombine_u8(i5, i13);
  const uint8x16_t a6 = vcombine_u8(i6, i14);
  const uint8x16_t a7 = vcombine_u8(i7, i15);

  // Swap 8 bit elements resulting in:
  // b0.val[0]: 00 10 02 12 04 14 06 16  80 90 82 92 84 94 86 96
  // b0.val[1]: 01 11 03 13 05 15 07 17  81 91 83 93 85 95 87 97
  // b1.val[0]: 20 30 22 32 24 34 26 36  A0 B0 A2 B2 A4 B4 A6 B6
  // b1.val[1]: 21 31 23 33 25 35 27 37  A1 B1 A3 B3 A5 B5 A7 B7
  // b2.val[0]: 40 50 42 52 44 54 46 56  C0 D0 C2 D2 C4 D4 C6 D6
  // b2.val[1]: 41 51 43 53 45 55 47 57  C1 D1 C3 D3 C5 D5 C7 D7
  // b3.val[0]: 60 70 62 72 64 74 66 76  E0 F0 E2 F2 E4 F4 E6 F6
  // b3.val[1]: 61 71 63 73 65 75 67 77  E1 F1 E3 F3 E5 F5 E7 F7
  const uint8x16x2_t b0 = vtrnq_u8(a0, a1);
  const uint8x16x2_t b1 = vtrnq_u8(a2, a3);
  const uint8x16x2_t b2 = vtrnq_u8(a4, a5);
  const uint8x16x2_t b3 = vtrnq_u8(a6, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  80 90 A0 B0 84 94 A4 B4
  // c0.val[1]: 02 12 22 32 06 16 26 36  82 92 A2 B2 86 96 A6 B6
  // c1.val[0]: 01 11 21 31 05 15 25 35  81 91 A1 B1 85 95 A5 B5
  // c1.val[1]: 03 13 23 33 07 17 27 37  83 93 A3 B3 87 97 A7 B7
  // c2.val[0]: 40 50 60 70 44 54 64 74  C0 D0 E0 F0 C4 D4 E4 F4
  // c2.val[1]: 42 52 62 72 46 56 66 76  C2 D2 E2 F2 C6 D6 E6 F6
  // c3.val[0]: 41 51 61 71 45 55 65 75  C1 D1 E1 F1 C5 D5 E5 F5
  // c3.val[1]: 43 53 63 73 47 57 67 77  C3 D3 E3 F3 C7 D7 E7 F7
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  80 90 A0 B0 C0 D0 E0 F0
  // d0.val[1]: 04 14 24 34 44 54 64 74  84 94 A4 B4 C4 D4 E4 F4
  // d1.val[0]: 02 12 22 32 42 52 62 72  82 92 A2 B2 C2 D2 E2 F2
  // d1.val[1]: 06 16 26 36 46 56 66 76  86 96 A6 B6 C6 D6 E6 F6
  // d2.val[0]: 01 11 21 31 41 51 61 71  81 91 A1 B1 C1 D1 E1 F1
  // d2.val[1]: 05 15 25 35 45 55 65 75  85 95 A5 B5 C5 D5 E5 F5
  // d3.val[0]: 03 13 23 33 43 53 63 73  83 93 A3 B3 C3 D3 E3 F3
  // d3.val[1]: 07 17 27 37 47 57 67 77  87 97 A7 B7 C7 D7 E7 F7
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  // Output:
  // o0: 00 10 20 30 40 50 60 70  80 90 A0 B0 C0 D0 E0 F0
  // o1: 01 11 21 31 41 51 61 71  81 91 A1 B1 C1 D1 E1 F1
  // o2: 02 12 22 32 42 52 62 72  82 92 A2 B2 C2 D2 E2 F2
  // o3: 03 13 23 33 43 53 63 73  83 93 A3 B3 C3 D3 E3 F3
  // o4: 04 14 24 34 44 54 64 74  84 94 A4 B4 C4 D4 E4 F4
  // o5: 05 15 25 35 45 55 65 75  85 95 A5 B5 C5 D5 E5 F5
  // o6: 06 16 26 36 46 56 66 76  86 96 A6 B6 C6 D6 E6 F6
  // o7: 07 17 27 37 47 57 67 77  87 97 A7 B7 C7 D7 E7 F7
  *o0 = vreinterpretq_u8_u32(d0.val[0]);
  *o1 = vreinterpretq_u8_u32(d2.val[0]);
  *o2 = vreinterpretq_u8_u32(d1.val[0]);
  *o3 = vreinterpretq_u8_u32(d3.val[0]);
  *o4 = vreinterpretq_u8_u32(d0.val[1]);
  *o5 = vreinterpretq_u8_u32(d2.val[1]);
  *o6 = vreinterpretq_u8_u32(d1.val[1]);
  *o7 = vreinterpretq_u8_u32(d3.val[1]);
}
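
// Illustrative usage sketch (not part of the original header): gather sixteen
// 8-byte rows (rows `src_stride` apart) into eight 16-wide transposed rows.
// `src`, `src_stride` and `out` are hypothetical names.
static INLINE void transpose_u8_8x16_example(const uint8_t *src,
                                             int src_stride,
                                             uint8x16_t out[8]) {
  uint8x8_t in[16];
  int i;
  for (i = 0; i < 16; ++i) in[i] = vld1_u8(src + i * src_stride);
  transpose_u8_8x16(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                    in[8], in[9], in[10], in[11], in[12], in[13], in[14],
                    in[15], &out[0], &out[1], &out[2], &out[3], &out[4],
                    &out[5], &out[6], &out[7]);
}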
   1003 
   1004 static INLINE void transpose_u8_16x16(
   1005     const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
   1006     const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
   1007     const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
   1008     const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
   1009     const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
   1010     const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
   1011     uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
   1012     uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
   1013     uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
   1014     uint8x16_t *o15) {
   1015   // Swap 8 bit elements. Goes from:
   1016   // i0:  00 01 02 03 04 05 06 07  08 09 0A 0B 0C 0D 0E 0F
   1017   // i1:  10 11 12 13 14 15 16 17  18 19 1A 1B 1C 1D 1E 1F
   1018   // i2:  20 21 22 23 24 25 26 27  28 29 2A 2B 2C 2D 2E 2F
   1019   // i3:  30 31 32 33 34 35 36 37  38 39 3A 3B 3C 3D 3E 3F
   1020   // i4:  40 41 42 43 44 45 46 47  48 49 4A 4B 4C 4D 4E 4F
   1021   // i5:  50 51 52 53 54 55 56 57  58 59 5A 5B 5C 5D 5E 5F
   1022   // i6:  60 61 62 63 64 65 66 67  68 69 6A 6B 6C 6D 6E 6F
   1023   // i7:  70 71 72 73 74 75 76 77  78 79 7A 7B 7C 7D 7E 7F
   1024   // i8:  80 81 82 83 84 85 86 87  88 89 8A 8B 8C 8D 8E 8F
   1025   // i9:  90 91 92 93 94 95 96 97  98 99 9A 9B 9C 9D 9E 9F
   1026   // i10: A0 A1 A2 A3 A4 A5 A6 A7  A8 A9 AA AB AC AD AE AF
   1027   // i11: B0 B1 B2 B3 B4 B5 B6 B7  B8 B9 BA BB BC BD BE BF
   1028   // i12: C0 C1 C2 C3 C4 C5 C6 C7  C8 C9 CA CB CC CD CE CF
   1029   // i13: D0 D1 D2 D3 D4 D5 D6 D7  D8 D9 DA DB DC DD DE DF
   1030   // i14: E0 E1 E2 E3 E4 E5 E6 E7  E8 E9 EA EB EC ED EE EF
   1031   // i15: F0 F1 F2 F3 F4 F5 F6 F7  F8 F9 FA FB FC FD FE FF
   1032   // to:
   1033   // b0.val[0]: 00 10 02 12 04 14 06 16  08 18 0A 1A 0C 1C 0E 1E
   1034   // b0.val[1]: 01 11 03 13 05 15 07 17  09 19 0B 1B 0D 1D 0F 1F
   1035   // b1.val[0]: 20 30 22 32 24 34 26 36  28 38 2A 3A 2C 3C 2E 3E
   1036   // b1.val[1]: 21 31 23 33 25 35 27 37  29 39 2B 3B 2D 3D 2F 3F
   1037   // b2.val[0]: 40 50 42 52 44 54 46 56  48 58 4A 5A 4C 5C 4E 5E
   1038   // b2.val[1]: 41 51 43 53 45 55 47 57  49 59 4B 5B 4D 5D 4F 5F
   1039   // b3.val[0]: 60 70 62 72 64 74 66 76  68 78 6A 7A 6C 7C 6E 7E
   1040   // b3.val[1]: 61 71 63 73 65 75 67 77  69 79 6B 7B 6D 7D 6F 7F
   1041   // b4.val[0]: 80 90 82 92 84 94 86 96  88 98 8A 9A 8C 9C 8E 9E
   1042   // b4.val[1]: 81 91 83 93 85 95 87 97  89 99 8B 9B 8D 9D 8F 9F
   1043   // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6  A8 B8 AA BA AC BC AE BE
   1044   // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7  A9 B9 AB BB AD BD AF BF
   1045   // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6  C8 D8 CA DA CC DC CE DE
   1046   // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7  C9 D9 CB DB CD DD CF DF
   1047   // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6  E8 F8 EA FA EC FC EE FE
   1048   // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7  E9 F9 EB FB ED FD EF FF
   1049   const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
   1050   const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
   1051   const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
   1052   const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
   1053   const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
   1054   const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
   1055   const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
   1056   const uint8x16x2_t b7 = vtrnq_u8(i14, i15);
   1057 
   1058   // Swap 16 bit elements resulting in:
   1059   // c0.val[0]: 00 10 20 30 04 14 24 34  08 18 28 38 0C 1C 2C 3C
   1060   // c0.val[1]: 02 12 22 32 06 16 26 36  0A 1A 2A 3A 0E 1E 2E 3E
   1061   // c1.val[0]: 01 11 21 31 05 15 25 35  09 19 29 39 0D 1D 2D 3D
   1062   // c1.val[1]: 03 13 23 33 07 17 27 37  0B 1B 2B 3B 0F 1F 2F 3F
   1063   // c2.val[0]: 40 50 60 70 44 54 64 74  48 58 68 78 4C 5C 6C 7C
   1064   // c2.val[1]: 42 52 62 72 46 56 66 76  4A 5A 6A 7A 4E 5E 6E 7E
   1065   // c3.val[0]: 41 51 61 71 45 55 65 75  49 59 69 79 4D 5D 6D 7D
   1066   // c3.val[1]: 43 53 63 73 47 57 67 77  4B 5B 6B 7B 4F 5F 6F 7F
   1067   // c4.val[0]: 80 90 A0 B0 84 94 A4 B4  88 98 A8 B8 8C 9C AC BC
   1068   // c4.val[1]: 82 92 A2 B2 86 96 A6 B6  8A 9A AA BA 8E 9E AE BE
   1069   // c5.val[0]: 81 91 A1 B1 85 95 A5 B5  89 99 A9 B9 8D 9D AD BD
   1070   // c5.val[1]: 83 93 A3 B3 87 97 A7 B7  8B 9B AB BB 8F 9F AF BF
   1071   // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4  C8 D8 E8 F8 CC DC EC FC
   1072   // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6  CA DA EA FA CE DE EE FE
   1073   // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5  C9 D9 E9 F9 CD DD ED FD
   1074   // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7  CB DB EB FB CF DF EF FF
   1075   const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
   1076                                     vreinterpretq_u16_u8(b1.val[0]));
   1077   const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
   1078                                     vreinterpretq_u16_u8(b1.val[1]));
   1079   const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
   1080                                     vreinterpretq_u16_u8(b3.val[0]));
   1081   const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
   1082                                     vreinterpretq_u16_u8(b3.val[1]));
   1083   const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
   1084                                     vreinterpretq_u16_u8(b5.val[0]));
   1085   const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
   1086                                     vreinterpretq_u16_u8(b5.val[1]));
   1087   const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
   1088                                     vreinterpretq_u16_u8(b7.val[0]));
   1089   const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
   1090                                     vreinterpretq_u16_u8(b7.val[1]));
   1091 
   1092   // Swap 32 bit elements resulting in:
   1093   // d0.val[0]: 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
   1094   // d0.val[1]: 04 14 24 34 44 54 64 74  0C 1C 2C 3C 4C 5C 6C 7C
   1095   // d1.val[0]: 02 12 22 32 42 52 62 72  0A 1A 2A 3A 4A 5A 6A 7A
   1096   // d1.val[1]: 06 16 26 36 46 56 66 76  0E 1E 2E 3E 4E 5E 6E 7E
   1097   // d2.val[0]: 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
   1098   // d2.val[1]: 05 15 25 35 45 55 65 75  0D 1D 2D 3D 4D 5D 6D 7D
   1099   // d3.val[0]: 03 13 23 33 43 53 63 73  0B 1B 2B 3B 4B 5B 6B 7B
   1100   // d3.val[1]: 07 17 27 37 47 57 67 77  0F 1F 2F 3F 4F 5F 6F 7F
   1101   // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0  88 98 A8 B8 C8 D8 E8 F8
   1102   // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4  8C 9C AC BC CC DC EC FC
   1103   // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2  8A 9A AA BA CA DA EA FA
   1104   // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6  8E 9E AE BE CE DE EE FE
   1105   // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1  89 99 A9 B9 C9 D9 E9 F9
   1106   // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5  8D 9D AD BD CD DD ED FD
   1107   // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3  8B 9B AB BB CB DB EB FB
   1108   // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7  8F 9F AF BF CF DF EF FF
   1109   const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
   1110                                     vreinterpretq_u32_u16(c2.val[0]));
   1111   const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
   1112                                     vreinterpretq_u32_u16(c2.val[1]));
   1113   const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
   1114                                     vreinterpretq_u32_u16(c3.val[0]));
   1115   const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
   1116                                     vreinterpretq_u32_u16(c3.val[1]));
   1117   const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
   1118                                     vreinterpretq_u32_u16(c6.val[0]));
   1119   const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
   1120                                     vreinterpretq_u32_u16(c6.val[1]));
   1121   const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
   1122                                     vreinterpretq_u32_u16(c7.val[0]));
   1123   const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
   1124                                     vreinterpretq_u32_u16(c7.val[1]));
   1125 
  // Swap 64 bit elements resulting in:
  // e0.val[0]: 00 10 20 30 40 50 60 70  80 90 A0 B0 C0 D0 E0 F0
  // e0.val[1]: 08 18 28 38 48 58 68 78  88 98 A8 B8 C8 D8 E8 F8
  // e1.val[0]: 01 11 21 31 41 51 61 71  81 91 A1 B1 C1 D1 E1 F1
  // e1.val[1]: 09 19 29 39 49 59 69 79  89 99 A9 B9 C9 D9 E9 F9
  // e2.val[0]: 02 12 22 32 42 52 62 72  82 92 A2 B2 C2 D2 E2 F2
  // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A  8A 9A AA BA CA DA EA FA
  // e3.val[0]: 03 13 23 33 43 53 63 73  83 93 A3 B3 C3 D3 E3 F3
  // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B  8B 9B AB BB CB DB EB FB
  // e4.val[0]: 04 14 24 34 44 54 64 74  84 94 A4 B4 C4 D4 E4 F4
  // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C  8C 9C AC BC CC DC EC FC
  // e5.val[0]: 05 15 25 35 45 55 65 75  85 95 A5 B5 C5 D5 E5 F5
  // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D  8D 9D AD BD CD DD ED FD
  // e6.val[0]: 06 16 26 36 46 56 66 76  86 96 A6 B6 C6 D6 E6 F6
  // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E  8E 9E AE BE CE DE EE FE
  // e7.val[0]: 07 17 27 37 47 57 67 77  87 97 A7 B7 C7 D7 E7 F7
  // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F  8F 9F AF BF CF DF EF FF
  const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
  const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
  const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
  const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
  const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
  const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
  const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
  const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);

  // Output:
  // o0 : 00 10 20 30 40 50 60 70  80 90 A0 B0 C0 D0 E0 F0
  // o1 : 01 11 21 31 41 51 61 71  81 91 A1 B1 C1 D1 E1 F1
  // o2 : 02 12 22 32 42 52 62 72  82 92 A2 B2 C2 D2 E2 F2
  // o3 : 03 13 23 33 43 53 63 73  83 93 A3 B3 C3 D3 E3 F3
  // o4 : 04 14 24 34 44 54 64 74  84 94 A4 B4 C4 D4 E4 F4
  // o5 : 05 15 25 35 45 55 65 75  85 95 A5 B5 C5 D5 E5 F5
  // o6 : 06 16 26 36 46 56 66 76  86 96 A6 B6 C6 D6 E6 F6
  // o7 : 07 17 27 37 47 57 67 77  87 97 A7 B7 C7 D7 E7 F7
  // o8 : 08 18 28 38 48 58 68 78  88 98 A8 B8 C8 D8 E8 F8
  // o9 : 09 19 29 39 49 59 69 79  89 99 A9 B9 C9 D9 E9 F9
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A  8A 9A AA BA CA DA EA FA
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B  8B 9B AB BB CB DB EB FB
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C  8C 9C AC BC CC DC EC FC
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D  8D 9D AD BD CD DD ED FD
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E  8E 9E AE BE CE DE EE FE
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F  8F 9F AF BF CF DF EF FF
  *o0 = e0.val[0];
  *o1 = e1.val[0];
  *o2 = e2.val[0];
  *o3 = e3.val[0];
  *o4 = e4.val[0];
  *o5 = e5.val[0];
  *o6 = e6.val[0];
  *o7 = e7.val[0];
  *o8 = e0.val[1];
  *o9 = e1.val[1];
  *o10 = e2.val[1];
  *o11 = e3.val[1];
  *o12 = e4.val[1];
  *o13 = e5.val[1];
  *o14 = e6.val[1];
  *o15 = e7.val[1];
}

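// Note: the 16x16 transpose above follows the same butterfly pattern as the
// smaller variants: log2(16) = 4 rounds of pairwise swaps at successively
// wider element widths (8, 16, 32 and finally 64 bits), with the 64-bit
// round supplied by the vpx_vtrnq_u64_to_u8() helper because NEON has no
// vtrnq variant for 64-bit lanes.
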
static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3) {
  uint8x8_t a4, a5, a6, a7;
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  a4 = vld1_u8(a);
  a += a_stride;
  a5 = vld1_u8(a);
  a += a_stride;
  a6 = vld1_u8(a);
  a += a_stride;
  a7 = vld1_u8(a);

  transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
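
// Usage sketch (illustrative only; `src` and `src_stride` are not part of
// this header). Each vld1_u8() above reads a full 8 bytes per row even
// though only the first four columns are returned, so every row must have
// at least 8 readable bytes:
//   uint8x8_t c0, c1, c2, c3;
//   load_and_transpose_u8_4x8(src, src_stride, &c0, &c1, &c2, &c3);
//   // cN[r] == src[r * src_stride + N] for N in 0..3 and r in 0..7.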

static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3, uint8x8_t *a4,
                                             uint8x8_t *a5, uint8x8_t *a6,
                                             uint8x8_t *a7) {
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  *a4 = vld1_u8(a);
  a += a_stride;
  *a5 = vld1_u8(a);
  a += a_stride;
  *a6 = vld1_u8(a);
  a += a_stride;
  *a7 = vld1_u8(a);

  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
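
// Usage sketch (illustrative only; `src` and `src_stride` are assumptions,
// not part of this header):
//   uint8x8_t c0, c1, c2, c3, c4, c5, c6, c7;
//   load_and_transpose_u8_8x8(src, src_stride, &c0, &c1, &c2, &c3, &c4, &c5,
//                             &c6, &c7);
//   // cN[r] == src[r * src_stride + N], i.e. cN holds column N of the 8x8
//   // block.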

static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
                                              uint8x8_t a0, uint8x8_t a1,
                                              uint8x8_t a2, uint8x8_t a3,
                                              uint8x8_t a4, uint8x8_t a5,
                                              uint8x8_t a6, uint8x8_t a7) {
  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  vst1_u8(a, a0);
  a += a_stride;
  vst1_u8(a, a1);
  a += a_stride;
  vst1_u8(a, a2);
  a += a_stride;
  vst1_u8(a, a3);
  a += a_stride;
  vst1_u8(a, a4);
  a += a_stride;
  vst1_u8(a, a5);
  a += a_stride;
  vst1_u8(a, a6);
  a += a_stride;
  vst1_u8(a, a7);
}
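
// Usage sketch: copy an 8x8 block transposed (illustrative only; `src`,
// `dst` and the two strides are assumptions, not part of this header). The
// rows are passed by value, so the source memory may safely be overwritten:
//   const uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
//   const uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
//   const uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
//   const uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
//   const uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
//   const uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
//   const uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
//   const uint8x8_t r7 = vld1_u8(src + 7 * src_stride);
//   transpose_and_store_u8_8x8(dst, dst_stride, r0, r1, r2, r3, r4, r5, r6,
//                              r7);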

static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
                                              const int a_stride, int16x8_t *a0,
                                              int16x8_t *a1, int16x8_t *a2,
                                              int16x8_t *a3, int16x8_t *a4,
                                              int16x8_t *a5, int16x8_t *a6,
                                              int16x8_t *a7) {
  *a0 = vld1q_s16(a);
  a += a_stride;
  *a1 = vld1q_s16(a);
  a += a_stride;
  *a2 = vld1q_s16(a);
  a += a_stride;
  *a3 = vld1q_s16(a);
  a += a_stride;
  *a4 = vld1q_s16(a);
  a += a_stride;
  *a5 = vld1q_s16(a);
  a += a_stride;
  *a6 = vld1q_s16(a);
  a += a_stride;
  *a7 = vld1q_s16(a);

  transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
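
// Usage sketch (illustrative only; `coeff` is an assumed contiguous 8x8
// int16_t buffer). Note that `a_stride` is measured in int16_t elements,
// not bytes, since the pointer arithmetic is done on an int16_t pointer:
//   int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
//   load_and_transpose_s16_8x8(coeff, 8, &c0, &c1, &c2, &c3, &c4, &c5, &c6,
//                              &c7);
//   // cN[r] == coeff[r * 8 + N]; handy between the row and column passes of
//   // a 2-D transform.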

static INLINE void load_and_transpose_s32_8x8(
    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
    int32x4x2_t *const a7) {
  a0->val[0] = vld1q_s32(a);
  a0->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a1->val[0] = vld1q_s32(a);
  a1->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a2->val[0] = vld1q_s32(a);
  a2->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a3->val[0] = vld1q_s32(a);
  a3->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a4->val[0] = vld1q_s32(a);
  a4->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a5->val[0] = vld1q_s32(a);
  a5->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a6->val[0] = vld1q_s32(a);
  a6->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a7->val[0] = vld1q_s32(a);
  a7->val[1] = vld1q_s32(a + 4);

  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
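
// Usage sketch (illustrative only; `buf` is an assumed contiguous 8x8
// int32_t buffer, and the stride of 8 is again in elements). Each 8-lane
// row spans two q registers, so results come back as int32x4x2_t pairs:
//   int32x4x2_t c0, c1, c2, c3, c4, c5, c6, c7;
//   load_and_transpose_s32_8x8(buf, 8, &c0, &c1, &c2, &c3, &c4, &c5, &c6,
//                              &c7);
//   // c0.val[0] holds rows 0..3 of column 0; c0.val[1] holds rows 4..7.
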
#endif  // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_