/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"

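// Load an 8x8 block of 32-bit coefficients starting at |input|, which points
// into a 32-wide coefficient buffer (hence the stride of 32 elements between
// rows).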
static INLINE void load_8x8_s32_dual(
    const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1,
    int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4,
    int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) {
  in0->val[0] = vld1q_s32(input);
  in0->val[1] = vld1q_s32(input + 4);
  input += 32;
  in1->val[0] = vld1q_s32(input);
  in1->val[1] = vld1q_s32(input + 4);
  input += 32;
  in2->val[0] = vld1q_s32(input);
  in2->val[1] = vld1q_s32(input + 4);
  input += 32;
  in3->val[0] = vld1q_s32(input);
  in3->val[1] = vld1q_s32(input + 4);
  input += 32;
  in4->val[0] = vld1q_s32(input);
  in4->val[1] = vld1q_s32(input + 4);
  input += 32;
  in5->val[0] = vld1q_s32(input);
  in5->val[1] = vld1q_s32(input + 4);
  input += 32;
  in6->val[0] = vld1q_s32(input);
  in6->val[1] = vld1q_s32(input + 4);
  input += 32;
  in7->val[0] = vld1q_s32(input);
  in7->val[1] = vld1q_s32(input + 4);
}

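// Load the first 4 coefficients of 8 consecutive rows (stride 32). Used for
// columns 8-11 of the top-left block, where a single 4-wide vector per row is
// enough.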
static INLINE void load_4x8_s32_dual(const tran_low_t *input,
                                     int32x4_t *const in0, int32x4_t *const in1,
                                     int32x4_t *const in2, int32x4_t *const in3,
                                     int32x4_t *const in4, int32x4_t *const in5,
                                     int32x4_t *const in6,
                                     int32x4_t *const in7) {
  *in0 = vld1q_s32(input);
  input += 32;
  *in1 = vld1q_s32(input);
  input += 32;
  *in2 = vld1q_s32(input);
  input += 32;
  *in3 = vld1q_s32(input);
  input += 32;
  *in4 = vld1q_s32(input);
  input += 32;
  *in5 = vld1q_s32(input);
  input += 32;
  *in6 = vld1q_s32(input);
  input += 32;
  *in7 = vld1q_s32(input);
}

// Only for the first pass of the _135_ variant. Since it only uses values
// from the top left 16x16 it can safely assume all the remaining values are 0
// and skip an awful lot of calculations. In fact, only the first 12 columns
// make the cut. None of the elements in the 13th, 14th, 15th or 16th columns
// are used, so it skips any loads of input[12|13|14|15] too.
// In C this does a single row of 32 for each call. Here it transposes the top
// left 12x8 to allow using SIMD.

// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135
// non-zero coefficients as follows:
//      0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
//  0   0   2   5  10  17  25  38  47  62  83 101 121
//  1   1   4   8  15  22  30  45  58  74  92 112 133
//  2   3   7  12  18  28  36  52  64  82 102 118
//  3   6  11  16  23  31  43  60  73  90 109 126
//  4   9  14  19  29  37  50  65  78  98 116 134
//  5  13  20  26  35  44  54  72  85 105 123
//  6  21  27  33  42  53  63  80  94 113 132
//  7  24  32  39  48  57  71  88 104 120
//  8  34  40  46  56  68  81  96 111 130
//  9  41  49  55  67  77  91 107 124
// 10  51  59  66  76  89  99 119 131
// 11  61  69  75  87 100 114 129
// 12  70  79  86  97 108 122
// 13  84  93 103 110 125
// 14  95 106 115 127
// 15 117 128
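// For reference, a rough scalar model of the idct_neon.h helpers used below.
// This is a sketch only, assuming DCT_CONST_BITS == 14 as defined in
// vpx_dsp/txfm_common.h; the actual vector implementations differ in detail:
//   multiply_shift_and_narrow(a, c)
//       ~ ROUND_POWER_OF_TWO((int64_t)a * c, 14)
//   multiply_accumulate_shift_and_narrow(a, c0, b, c1)
//       ~ ROUND_POWER_OF_TWO((int64_t)a * c0 + (int64_t)b * c1, 14)
//   add_multiply_shift_and_narrow(a, b, c)
//       ~ ROUND_POWER_OF_TWO((int64_t)(a + b) * c, 14)
//   sub_multiply_shift_and_narrow(a, b, c)
//       ~ ROUND_POWER_OF_TWO((int64_t)(a - b) * c, 14)
// The _s32_dual variants apply the same operation to both halves of an
// int32x4x2_t, i.e. one full 8-wide row at a time.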
static void vpx_highbd_idct32_12_neon(const tran_low_t *const input,
                                      int32_t *output) {
  int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
      s8[32];

  load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
                    &in[6], &in[7]);
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0],
                    &in[9].val[1], &in[10].val[0], &in[10].val[1],
                    &in[11].val[0], &in[11].val[1]);
  transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1],
                    &in[10].val[0], &in[10].val[1], &in[11].val[0],
                    &in[11].val[1]);

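  // Stages 1-3 below omit every term fed by in[12..15] (s1[17]/s1[30],
  // s1[22]/s1[25], s2[9]/s2[14], s3[5]/s3[6]); in this variant those inputs
  // are known to be zero, so the corresponding butterflies collapse and the
  // surviving terms (e.g. s1[16] in place of s2[16]) are used directly.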
  // stage 1
  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);

  s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
  s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);

  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);

  s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
  s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);

  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);

  s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
  s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);

  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);

  s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
  s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
  s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
  s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
  s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
  s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
  s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
  s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);

  // stage 3
  s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
  s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);

  s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
  s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
  s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
  s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);

  s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
                                                         s1[31], cospi_28_64);
  s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
                                                         s1[31], cospi_4_64);

  s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
                                                         s2[29], -cospi_4_64);
  s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
                                                         s2[29], cospi_28_64);

  s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
                                                         s2[26], cospi_12_64);
  s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
                                                         s2[26], cospi_20_64);

  s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
                                                         s1[24], -cospi_20_64);
  s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
                                                         s1[24], cospi_12_64);

  // stage 4
  s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
  s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
  s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);

  s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
                                                        s2[15], cospi_24_64);
  s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
                                                         s2[15], cospi_8_64);

  s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
                                                         s3[13], -cospi_8_64);
  s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
                                                         s3[13], cospi_24_64);

  s4[16] = highbd_idct_add_dual(s1[16], s2[19]);
  s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
  s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
  s4[19] = highbd_idct_sub_dual(s1[16], s2[19]);
  s4[20] = highbd_idct_sub_dual(s1[23], s2[20]);
  s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
  s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
  s4[23] = highbd_idct_add_dual(s2[20], s1[23]);
  s4[24] = highbd_idct_add_dual(s1[24], s2[27]);
  s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
  s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
  s4[27] = highbd_idct_sub_dual(s1[24], s2[27]);
  s4[28] = highbd_idct_sub_dual(s1[31], s2[28]);
  s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
  s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
  s4[31] = highbd_idct_add_dual(s2[28], s1[31]);

  // stage 5
  s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
  s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
  s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
  s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);

  s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64);
  s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64);

  s5[8] = highbd_idct_add_dual(s2[8], s3[11]);
  s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
  s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
  s5[11] = highbd_idct_sub_dual(s2[8], s3[11]);
  s5[12] = highbd_idct_sub_dual(s2[15], s3[12]);
  s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
  s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
  s5[15] = highbd_idct_add_dual(s2[15], s3[12]);

  s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
                                                         s4[29], cospi_24_64);
  s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
                                                         s4[29], cospi_8_64);

  s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
                                                         s4[28], cospi_24_64);
  s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
                                                         s4[28], cospi_8_64);

  s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
                                                         s4[27], -cospi_8_64);
  s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
                                                         s4[27], cospi_24_64);

  s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
                                                         s4[26], -cospi_8_64);
  s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
                                                         s4[26], cospi_24_64);

  // stage 6
  s6[0] = highbd_idct_add_dual(s5[0], s3[7]);
  s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
  s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
  s6[3] = highbd_idct_add_dual(s5[3], s3[4]);
  s6[4] = highbd_idct_sub_dual(s5[3], s3[4]);
  s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
  s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
  s6[7] = highbd_idct_sub_dual(s5[0], s3[7]);

  s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
  s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);

  s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
  s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);

  s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
  s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
  s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
  s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
  s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
  s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
  s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
  s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);

  s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
  s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
  s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
  s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
  s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
  s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
  s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
  s6[31] = highbd_idct_add_dual(s4[24], s4[31]);

  // stage 7
  s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
  s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
  s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
  s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
  s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
  s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
  s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
  s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
  s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
  s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
  s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
  s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
  s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
  s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
  s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
  s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);

  s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
  s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);

  s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
  s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);

  s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
  s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);

  s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
  s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);

  // final stage
  s8[0] = highbd_idct_add_dual(s7[0], s6[31]);
  s8[1] = highbd_idct_add_dual(s7[1], s6[30]);
  s8[2] = highbd_idct_add_dual(s7[2], s6[29]);
  s8[3] = highbd_idct_add_dual(s7[3], s6[28]);
  s8[4] = highbd_idct_add_dual(s7[4], s7[27]);
  s8[5] = highbd_idct_add_dual(s7[5], s7[26]);
  s8[6] = highbd_idct_add_dual(s7[6], s7[25]);
  s8[7] = highbd_idct_add_dual(s7[7], s7[24]);
  s8[8] = highbd_idct_add_dual(s7[8], s7[23]);
  s8[9] = highbd_idct_add_dual(s7[9], s7[22]);
  s8[10] = highbd_idct_add_dual(s7[10], s7[21]);
  s8[11] = highbd_idct_add_dual(s7[11], s7[20]);
  s8[12] = highbd_idct_add_dual(s7[12], s6[19]);
  s8[13] = highbd_idct_add_dual(s7[13], s6[18]);
  s8[14] = highbd_idct_add_dual(s7[14], s6[17]);
  s8[15] = highbd_idct_add_dual(s7[15], s6[16]);
  s8[16] = highbd_idct_sub_dual(s7[15], s6[16]);
  s8[17] = highbd_idct_sub_dual(s7[14], s6[17]);
  s8[18] = highbd_idct_sub_dual(s7[13], s6[18]);
  s8[19] = highbd_idct_sub_dual(s7[12], s6[19]);
  s8[20] = highbd_idct_sub_dual(s7[11], s7[20]);
  s8[21] = highbd_idct_sub_dual(s7[10], s7[21]);
  s8[22] = highbd_idct_sub_dual(s7[9], s7[22]);
  s8[23] = highbd_idct_sub_dual(s7[8], s7[23]);
  s8[24] = highbd_idct_sub_dual(s7[7], s7[24]);
  s8[25] = highbd_idct_sub_dual(s7[6], s7[25]);
  s8[26] = highbd_idct_sub_dual(s7[5], s7[26]);
  s8[27] = highbd_idct_sub_dual(s7[4], s7[27]);
  s8[28] = highbd_idct_sub_dual(s7[3], s6[28]);
  s8[29] = highbd_idct_sub_dual(s7[2], s6[29]);
  s8[30] = highbd_idct_sub_dual(s7[1], s6[30]);
  s8[31] = highbd_idct_sub_dual(s7[0], s6[31]);

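  // Write the 32 transformed rows into the 32x16 intermediate buffer, 8
  // values per row (stride 16). The other 8 columns are filled by the second
  // vpx_highbd_idct32_12_neon() call, which writes at output + 8.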
  vst1q_s32(output + 0, s8[0].val[0]);
  vst1q_s32(output + 4, s8[0].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[1].val[0]);
  vst1q_s32(output + 4, s8[1].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[2].val[0]);
  vst1q_s32(output + 4, s8[2].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[3].val[0]);
  vst1q_s32(output + 4, s8[3].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[4].val[0]);
  vst1q_s32(output + 4, s8[4].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[5].val[0]);
  vst1q_s32(output + 4, s8[5].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[6].val[0]);
  vst1q_s32(output + 4, s8[6].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[7].val[0]);
  vst1q_s32(output + 4, s8[7].val[1]);
  output += 16;

  vst1q_s32(output + 0, s8[8].val[0]);
  vst1q_s32(output + 4, s8[8].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[9].val[0]);
  vst1q_s32(output + 4, s8[9].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[10].val[0]);
  vst1q_s32(output + 4, s8[10].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[11].val[0]);
  vst1q_s32(output + 4, s8[11].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[12].val[0]);
  vst1q_s32(output + 4, s8[12].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[13].val[0]);
  vst1q_s32(output + 4, s8[13].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[14].val[0]);
  vst1q_s32(output + 4, s8[14].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[15].val[0]);
  vst1q_s32(output + 4, s8[15].val[1]);
  output += 16;

  vst1q_s32(output + 0, s8[16].val[0]);
  vst1q_s32(output + 4, s8[16].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[17].val[0]);
  vst1q_s32(output + 4, s8[17].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[18].val[0]);
  vst1q_s32(output + 4, s8[18].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[19].val[0]);
  vst1q_s32(output + 4, s8[19].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[20].val[0]);
  vst1q_s32(output + 4, s8[20].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[21].val[0]);
  vst1q_s32(output + 4, s8[21].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[22].val[0]);
  vst1q_s32(output + 4, s8[22].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[23].val[0]);
  vst1q_s32(output + 4, s8[23].val[1]);
  output += 16;

  vst1q_s32(output + 0, s8[24].val[0]);
  vst1q_s32(output + 4, s8[24].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[25].val[0]);
  vst1q_s32(output + 4, s8[25].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[26].val[0]);
  vst1q_s32(output + 4, s8[26].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[27].val[0]);
  vst1q_s32(output + 4, s8[27].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[28].val[0]);
  vst1q_s32(output + 4, s8[28].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[29].val[0]);
  vst1q_s32(output + 4, s8[29].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[30].val[0]);
  vst1q_s32(output + 4, s8[30].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[31].val[0]);
  vst1q_s32(output + 4, s8[31].val[1]);
}

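// Second pass. Reads 16 rows of 8 values from the 32x16 intermediate buffer
// (stride 16), runs the full 32-point IDCT on those eight columns, and adds
// the clipped result to an 8-pixel-wide strip of |output| at bit depth |bd|.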
static void vpx_highbd_idct32_16_neon(const int32_t *const input,
                                      uint16_t *const output, const int stride,
                                      const int bd) {
  int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
      out[32];

  load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
                             &in[5], &in[6], &in[7]);

  load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
                             &in[12], &in[13], &in[14], &in[15]);

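  // in[0..15] now hold the 16 intermediate rows transposed into column order,
  // so each vector lane corresponds to one output column of the strip.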
  // stage 1
  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);

  s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64);
  s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64);

  s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
  s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);

  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);

  s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
  s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);

  s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64);
  s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64);

  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);

  s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64);
  s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64);

  s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
  s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);

  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);

  s2[16] = highbd_idct_add_dual(s1[16], s1[17]);
  s2[17] = highbd_idct_sub_dual(s1[16], s1[17]);
  s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
  s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
  s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
  s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
  s2[22] = highbd_idct_sub_dual(s1[23], s1[22]);
  s2[23] = highbd_idct_add_dual(s1[22], s1[23]);
  s2[24] = highbd_idct_add_dual(s1[24], s1[25]);
  s2[25] = highbd_idct_sub_dual(s1[24], s1[25]);
  s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
  s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
  s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
  s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
  s2[30] = highbd_idct_sub_dual(s1[31], s1[30]);
  s2[31] = highbd_idct_add_dual(s1[30], s1[31]);

  // stage 3
  s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
  s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);

  s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64);
  s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64);

  s3[8] = highbd_idct_add_dual(s2[8], s2[9]);
  s3[9] = highbd_idct_sub_dual(s2[8], s2[9]);
  s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
  s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
  s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
  s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
  s3[14] = highbd_idct_sub_dual(s2[15], s2[14]);
  s3[15] = highbd_idct_add_dual(s2[14], s2[15]);

  s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64,
                                                         s2[30], cospi_28_64);
  s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64,
                                                         s2[30], cospi_4_64);

  s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
                                                         s2[29], -cospi_4_64);
  s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
                                                         s2[29], cospi_28_64);

  s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
                                                         s2[26], cospi_12_64);
  s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
                                                         s2[26], cospi_20_64);

  s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64,
                                                         s2[25], -cospi_20_64);
  s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64,
                                                         s2[25], cospi_12_64);

  // stage 4
  s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
  s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
  s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);

  s4[4] = highbd_idct_add_dual(s3[4], s3[5]);
  s4[5] = highbd_idct_sub_dual(s3[4], s3[5]);
  s4[6] = highbd_idct_sub_dual(s3[7], s3[6]);
  s4[7] = highbd_idct_add_dual(s3[6], s3[7]);

  s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64,
                                                        s3[14], cospi_24_64);
  s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64,
                                                         s3[14], cospi_8_64);

  s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
                                                         s3[13], -cospi_8_64);
  s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
                                                         s3[13], cospi_24_64);

  s4[16] = highbd_idct_add_dual(s2[16], s2[19]);
  s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
  s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
  s4[19] = highbd_idct_sub_dual(s2[16], s2[19]);
  s4[20] = highbd_idct_sub_dual(s2[23], s2[20]);
  s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
  s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
  s4[23] = highbd_idct_add_dual(s2[20], s2[23]);
  s4[24] = highbd_idct_add_dual(s2[24], s2[27]);
  s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
  s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
  s4[27] = highbd_idct_sub_dual(s2[24], s2[27]);
  s4[28] = highbd_idct_sub_dual(s2[31], s2[28]);
  s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
  s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
  s4[31] = highbd_idct_add_dual(s2[28], s2[31]);

  // stage 5
  s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
  s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
  s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
  s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);

  s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64);
  s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64);

  s5[8] = highbd_idct_add_dual(s3[8], s3[11]);
  s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
  s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
  s5[11] = highbd_idct_sub_dual(s3[8], s3[11]);
  s5[12] = highbd_idct_sub_dual(s3[15], s3[12]);
  s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
  s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
  s5[15] = highbd_idct_add_dual(s3[15], s3[12]);

  s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
                                                         s4[29], cospi_24_64);
  s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
                                                         s4[29], cospi_8_64);

  s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
                                                         s4[28], cospi_24_64);
  s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
                                                         s4[28], cospi_8_64);

  s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
                                                         s4[27], -cospi_8_64);
  s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
                                                         s4[27], cospi_24_64);

  s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
                                                         s4[26], -cospi_8_64);
  s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
                                                         s4[26], cospi_24_64);

  // stage 6
  s6[0] = highbd_idct_add_dual(s5[0], s4[7]);
  s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
  s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
  s6[3] = highbd_idct_add_dual(s5[3], s4[4]);
  s6[4] = highbd_idct_sub_dual(s5[3], s4[4]);
  s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
  s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
  s6[7] = highbd_idct_sub_dual(s5[0], s4[7]);

  s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
  s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);

  s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
  s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);

  s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
  s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
  s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
  s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
  s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
  s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
  s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
  s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
  s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
  s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
  s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
  s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
  s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
  s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
  s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
  s6[31] = highbd_idct_add_dual(s4[24], s4[31]);

  // stage 7
  s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
  s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
  s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
  s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
  s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
  s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
  s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
  s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
  s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
  s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
  s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
  s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
  s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
  s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
  s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
  s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);

  s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
  s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);

  s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
  s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);

  s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
  s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);

  s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
  s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);

  // final stage
  out[0] = highbd_idct_add_dual(s7[0], s6[31]);
  out[1] = highbd_idct_add_dual(s7[1], s6[30]);
  out[2] = highbd_idct_add_dual(s7[2], s6[29]);
  out[3] = highbd_idct_add_dual(s7[3], s6[28]);
  out[4] = highbd_idct_add_dual(s7[4], s7[27]);
  out[5] = highbd_idct_add_dual(s7[5], s7[26]);
  out[6] = highbd_idct_add_dual(s7[6], s7[25]);
  out[7] = highbd_idct_add_dual(s7[7], s7[24]);
  out[8] = highbd_idct_add_dual(s7[8], s7[23]);
  out[9] = highbd_idct_add_dual(s7[9], s7[22]);
  out[10] = highbd_idct_add_dual(s7[10], s7[21]);
  out[11] = highbd_idct_add_dual(s7[11], s7[20]);
  out[12] = highbd_idct_add_dual(s7[12], s6[19]);
  out[13] = highbd_idct_add_dual(s7[13], s6[18]);
  out[14] = highbd_idct_add_dual(s7[14], s6[17]);
  out[15] = highbd_idct_add_dual(s7[15], s6[16]);
  out[16] = highbd_idct_sub_dual(s7[15], s6[16]);
  out[17] = highbd_idct_sub_dual(s7[14], s6[17]);
  out[18] = highbd_idct_sub_dual(s7[13], s6[18]);
  out[19] = highbd_idct_sub_dual(s7[12], s6[19]);
  out[20] = highbd_idct_sub_dual(s7[11], s7[20]);
  out[21] = highbd_idct_sub_dual(s7[10], s7[21]);
  out[22] = highbd_idct_sub_dual(s7[9], s7[22]);
  out[23] = highbd_idct_sub_dual(s7[8], s7[23]);
  out[24] = highbd_idct_sub_dual(s7[7], s7[24]);
  out[25] = highbd_idct_sub_dual(s7[6], s7[25]);
  out[26] = highbd_idct_sub_dual(s7[5], s7[26]);
  out[27] = highbd_idct_sub_dual(s7[4], s7[27]);
  out[28] = highbd_idct_sub_dual(s7[3], s6[28]);
  out[29] = highbd_idct_sub_dual(s7[2], s6[29]);
  out[30] = highbd_idct_sub_dual(s7[1], s6[30]);
  out[31] = highbd_idct_sub_dual(s7[0], s6[31]);

  highbd_idct16x16_add_store(out, output, stride, bd);
  highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}

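// Entry point for the 135-coefficient case. Pass 1 transforms the 16
// potentially non-zero input rows in two batches of 8, writing the transposed
// results side by side into a 32x16 temp buffer. Pass 2 then transforms the
// temp buffer in four 8-row batches, each producing an 8-pixel-wide strip of
// the destination. When bd == 8 the cheaper int16 path shared with the
// standard idct is used instead.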
void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  int i;

  if (bd == 8) {
    int16_t temp[32 * 16];
    int16_t *t = temp;
    vpx_idct32_12_neon(input, temp);
    vpx_idct32_12_neon(input + 32 * 8, temp + 8);

    for (i = 0; i < 32; i += 8) {
      vpx_idct32_16_neon(t, dest, stride, 1);
      t += (16 * 8);
      dest += 8;
    }
  } else {
    int32_t temp[32 * 16];
    int32_t *t = temp;
    vpx_highbd_idct32_12_neon(input, temp);
    vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8);

    for (i = 0; i < 32; i += 8) {
      vpx_highbd_idct32_16_neon(t, dest, stride, bd);
      t += (16 * 8);
      dest += 8;
    }
  }
}
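
// A minimal usage sketch (hypothetical values; within libvpx this function is
// normally reached through the vpx_highbd_idct32x32_135_add RTCD dispatch
// rather than called directly; |frame_buf| and |frame_stride| below are
// illustrative names only):
//
//   DECLARE_ALIGNED(32, tran_low_t, coeffs[32 * 32]);   // dequantized, eob <= 135
//   uint16_t *dst = CONVERT_TO_SHORTPTR(frame_buf);     // high-bitdepth plane
//   vpx_highbd_idct32x32_135_add_neon(coeffs, dst, frame_stride, 10);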
    757