/* libvpx: NEON-optimized 16x16 inverse DCT (vpx_dsp/arm). */
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "./vpx_dsp_rtcd.h"
     14 #include "vpx_dsp/arm/idct_neon.h"
     15 #include "vpx_dsp/arm/mem_neon.h"
     16 #include "vpx_dsp/txfm_common.h"
     17 
     18 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
     19                                 int16x4_t *const d1) {
     20   *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
     21   *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
     22 }
     23 
     24 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
     25                                             const int16x4_t s1,
     26                                             const int16x4_t cospi_0_8_16_24,
     27                                             int32x4_t *const t32) {
     28   t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
     29   t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
     30   t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
     31   t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
     32 }
     33 
     34 static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
     35                                      const int16x4_t cospi_0_8_16_24,
     36                                      int16x4_t *const d0, int16x4_t *const d1) {
     37   int32x4_t t32[2];
     38 
     39   idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
     40   wrap_low_4x2(t32, d0, d1);
     41 }
     42 
     43 static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
     44                                          const int16x4_t cospi_0_8_16_24,
     45                                          int16x4_t *const d0,
     46                                          int16x4_t *const d1) {
     47   int32x4_t t32[2];
     48 
     49   idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
     50   t32[1] = vnegq_s32(t32[1]);
     51   wrap_low_4x2(t32, d0, d1);
     52 }
     53 
     54 static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
     55                                       const int16x4_t cospi_0_8_16_24,
     56                                       int16x4_t *const d0,
     57                                       int16x4_t *const d1) {
     58   int32x4_t t32[3];
     59 
     60   t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
     61   t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
     62   t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
     63   wrap_low_4x2(t32, d0, d1);
     64 }
     65 
     66 static INLINE void idct16x16_add_store(const int16x8_t *const out,
     67                                        uint8_t *dest, const int stride) {
     68   // Add the result to dest
     69   idct16x16_add8x1(out[0], &dest, stride);
     70   idct16x16_add8x1(out[1], &dest, stride);
     71   idct16x16_add8x1(out[2], &dest, stride);
     72   idct16x16_add8x1(out[3], &dest, stride);
     73   idct16x16_add8x1(out[4], &dest, stride);
     74   idct16x16_add8x1(out[5], &dest, stride);
     75   idct16x16_add8x1(out[6], &dest, stride);
     76   idct16x16_add8x1(out[7], &dest, stride);
     77   idct16x16_add8x1(out[8], &dest, stride);
     78   idct16x16_add8x1(out[9], &dest, stride);
     79   idct16x16_add8x1(out[10], &dest, stride);
     80   idct16x16_add8x1(out[11], &dest, stride);
     81   idct16x16_add8x1(out[12], &dest, stride);
     82   idct16x16_add8x1(out[13], &dest, stride);
     83   idct16x16_add8x1(out[14], &dest, stride);
     84   idct16x16_add8x1(out[15], &dest, stride);
     85 }
     86 
     87 static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest,
     88                                            const int stride) {
     89   // Add the result to dest
     90   const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
     91   out[0] = vrshrq_n_s16(out[0], 6);
     92   out[1] = vrshrq_n_s16(out[1], 6);
     93   out[2] = vrshrq_n_s16(out[2], 6);
     94   out[3] = vrshrq_n_s16(out[3], 6);
     95   out[4] = vrshrq_n_s16(out[4], 6);
     96   out[5] = vrshrq_n_s16(out[5], 6);
     97   out[6] = vrshrq_n_s16(out[6], 6);
     98   out[7] = vrshrq_n_s16(out[7], 6);
     99   out[8] = vrshrq_n_s16(out[8], 6);
    100   out[9] = vrshrq_n_s16(out[9], 6);
    101   out[10] = vrshrq_n_s16(out[10], 6);
    102   out[11] = vrshrq_n_s16(out[11], 6);
    103   out[12] = vrshrq_n_s16(out[12], 6);
    104   out[13] = vrshrq_n_s16(out[13], 6);
    105   out[14] = vrshrq_n_s16(out[14], 6);
    106   out[15] = vrshrq_n_s16(out[15], 6);
    107   highbd_idct16x16_add8x1(out[0], max, &dest, stride);
    108   highbd_idct16x16_add8x1(out[1], max, &dest, stride);
    109   highbd_idct16x16_add8x1(out[2], max, &dest, stride);
    110   highbd_idct16x16_add8x1(out[3], max, &dest, stride);
    111   highbd_idct16x16_add8x1(out[4], max, &dest, stride);
    112   highbd_idct16x16_add8x1(out[5], max, &dest, stride);
    113   highbd_idct16x16_add8x1(out[6], max, &dest, stride);
    114   highbd_idct16x16_add8x1(out[7], max, &dest, stride);
    115   highbd_idct16x16_add8x1(out[8], max, &dest, stride);
    116   highbd_idct16x16_add8x1(out[9], max, &dest, stride);
    117   highbd_idct16x16_add8x1(out[10], max, &dest, stride);
    118   highbd_idct16x16_add8x1(out[11], max, &dest, stride);
    119   highbd_idct16x16_add8x1(out[12], max, &dest, stride);
    120   highbd_idct16x16_add8x1(out[13], max, &dest, stride);
    121   highbd_idct16x16_add8x1(out[14], max, &dest, stride);
    122   highbd_idct16x16_add8x1(out[15], max, &dest, stride);
    123 }
    124 
// One half (an 8-wide strip) of a single 1-D pass of the 16x16 idct.
// - input: 128 contiguous values; read as tran_low_t when output != NULL
//   (pass 1), as int16_t when output == NULL (pass 2).
// - output: when non-NULL (pass 1), the 1-D result is stored there via
//   idct16x16_store_pass1 and dest/stride/highbd_flag are ignored.
// - dest/stride: when output == NULL (pass 2), the rounded result is added
//   to dest — uint8_t * normally, uint16_t * when highbd_flag is nonzero.
void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                  void *const dest, const int stride,
                                  const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
  const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
  const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
  int16x8_t in[16], step1[16], step2[16], out[16];

  // Load input (16x8). Each pair of loads consumes one 16-wide row: the
  // left half goes to in[k], the right half to in[k + 8], so in[0..7] and
  // in[8..15] each form an 8x8 tile.
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[8] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[9] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[10] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[11] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[12] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[13] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[14] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[7] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[15] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 8;
    in[8] = vld1q_s16(inputT);
    inputT += 8;
    in[1] = vld1q_s16(inputT);
    inputT += 8;
    in[9] = vld1q_s16(inputT);
    inputT += 8;
    in[2] = vld1q_s16(inputT);
    inputT += 8;
    in[10] = vld1q_s16(inputT);
    inputT += 8;
    in[3] = vld1q_s16(inputT);
    inputT += 8;
    in[11] = vld1q_s16(inputT);
    inputT += 8;
    in[4] = vld1q_s16(inputT);
    inputT += 8;
    in[12] = vld1q_s16(inputT);
    inputT += 8;
    in[5] = vld1q_s16(inputT);
    inputT += 8;
    in[13] = vld1q_s16(inputT);
    inputT += 8;
    in[6] = vld1q_s16(inputT);
    inputT += 8;
    in[14] = vld1q_s16(inputT);
    inputT += 8;
    in[7] = vld1q_s16(inputT);
    inputT += 8;
    in[15] = vld1q_s16(inputT);
  }

  // Transpose each 8x8 tile so the 1-D transform runs along vector lanes.
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);
  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
                    &in[15]);

  // stage 1
  // Reorder inputs for the butterfly; the written-out "n / 2" indices keep
  // the same numbering as the reference C idct16, with in[] holding the
  // two 8-wide halves.
  step1[0] = in[0 / 2];
  step1[1] = in[16 / 2];
  step1[2] = in[8 / 2];
  step1[3] = in[24 / 2];
  step1[4] = in[4 / 2];
  step1[5] = in[20 / 2];
  step1[6] = in[12 / 2];
  step1[7] = in[28 / 2];
  step1[8] = in[2 / 2];
  step1[9] = in[18 / 2];
  step1[10] = in[10 / 2];
  step1[11] = in[26 / 2];
  step1[12] = in[6 / 2];
  step1[13] = in[22 / 2];
  step1[14] = in[14 / 2];
  step1[15] = in[30 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
  idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
                   &step2[14]);
  idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
                   &step2[13]);
  idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
                  &step2[12]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
  idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
  idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);  // pass 1: save for the column pass
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
    326 
// As vpx_idct16x16_256_add_half1d, for the case where only the upper-left
// 8x8 quadrant of coefficients is nonzero: only 8 vectors of 8 are loaded
// (stride 16 between loads) and the arithmetic on known-zero inputs is
// folded into single multiplies.
void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
                                 void *const dest, const int stride,
                                 const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  // cospisd* hold doubled constants for vqrdmulh, which computes
  // (2*a*b + rounding) >> 16 — with the doubled constant this matches the
  // full multiply + round + >> DCT_CONST_BITS path.
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x8_t in[8], step1[16], step2[16], out[16];

  // Load input (8x8): the left 8 columns of the first 8 rows.
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[7] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 16;
    in[1] = vld1q_s16(inputT);
    inputT += 16;
    in[2] = vld1q_s16(inputT);
    inputT += 16;
    in[3] = vld1q_s16(inputT);
    inputT += 16;
    in[4] = vld1q_s16(inputT);
    inputT += 16;
    in[5] = vld1q_s16(inputT);
    inputT += 16;
    in[6] = vld1q_s16(inputT);
    inputT += 16;
    in[7] = vld1q_s16(inputT);
  }

  // Transpose
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1 (odd step1 entries would all be zero and are skipped)
  step1[0] = in[0 / 2];
  step1[2] = in[8 / 2];
  step1[4] = in[4 / 2];
  step1[6] = in[12 / 2];
  step1[8] = in[2 / 2];
  step1[10] = in[10 / 2];
  step1[12] = in[6 / 2];
  step1[14] = in[14 / 2];  // 0 in pass 1

  // stage 2 (butterflies with one zero input collapse to single multiplies)
  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[4] = step1[4];
  step2[6] = step1[6];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
  step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
  step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
  step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
  step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);  // pass 1: save for the column pass
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
    487 
// Row pass of the sparse 16x16 idct where only the top-left 4x4 block of
// coefficients is nonzero: loads 4 rows of 4 (stride 16 between loads),
// runs the butterfly on 4-wide (d-register) vectors, and stores the 16x4
// intermediate result to output for pass 2.
void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                       int16_t *output) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  // Doubled constants for vqrdmulh: (2*a*b + rounding) >> 16 matches the
  // full multiply + round + >> DCT_CONST_BITS path.
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t in[4], step1[16], step2[16], out[16];

  // Load input (4x4)
  in[0] = load_tran_low_to_s16d(input);
  input += 16;
  in[1] = load_tran_low_to_s16d(input);
  input += 16;
  in[2] = load_tran_low_to_s16d(input);
  input += 16;
  in[3] = load_tran_low_to_s16d(input);

  // Transpose
  transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);

  // stage 1 (all other step1 entries would be zero and are skipped)
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2 (butterflies with a zero input collapse to single multiplies)
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  step1[0] = step2[0];
  step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  // Adds/subs against zero inputs reduce to plain copies.
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vadd_s16(step2[8], step2[11]);
  step1[9] = vadd_s16(step2[9], step2[10]);
  step1[10] = vsub_s16(step2[9], step2[10]);
  step1[11] = vsub_s16(step2[8], step2[11]);
  step1[12] = vsub_s16(step2[15], step2[12]);
  step1[13] = vsub_s16(step2[14], step2[13]);
  step1[14] = vadd_s16(step2[14], step2[13]);
  step1[15] = vadd_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vadd_s16(step1[0], step1[7]);
  step2[1] = vadd_s16(step1[1], step1[6]);
  step2[2] = vadd_s16(step1[2], step1[5]);
  step2[3] = vadd_s16(step1[3], step1[4]);
  step2[4] = vsub_s16(step1[3], step1[4]);
  step2[5] = vsub_s16(step1[2], step1[5]);
  step2[6] = vsub_s16(step1[1], step1[6]);
  step2[7] = vsub_s16(step1[0], step1[7]);
  idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7 (final butterfly: out[k] = step2[k] +/- step2[15 - k])
  out[0] = vadd_s16(step2[0], step2[15]);
  out[1] = vadd_s16(step2[1], step2[14]);
  out[2] = vadd_s16(step2[2], step2[13]);
  out[3] = vadd_s16(step2[3], step2[12]);
  out[4] = vadd_s16(step2[4], step2[11]);
  out[5] = vadd_s16(step2[5], step2[10]);
  out[6] = vadd_s16(step2[6], step2[9]);
  out[7] = vadd_s16(step2[7], step2[8]);
  out[8] = vsub_s16(step2[7], step2[8]);
  out[9] = vsub_s16(step2[6], step2[9]);
  out[10] = vsub_s16(step2[5], step2[10]);
  out[11] = vsub_s16(step2[4], step2[11]);
  out[12] = vsub_s16(step2[3], step2[12]);
  out[13] = vsub_s16(step2[2], step2[13]);
  out[14] = vsub_s16(step2[1], step2[14]);
  out[15] = vsub_s16(step2[0], step2[15]);

  // pass 1: save the result into output
  vst1_s16(output, out[0]);
  output += 4;
  vst1_s16(output, out[1]);
  output += 4;
  vst1_s16(output, out[2]);
  output += 4;
  vst1_s16(output, out[3]);
  output += 4;
  vst1_s16(output, out[4]);
  output += 4;
  vst1_s16(output, out[5]);
  output += 4;
  vst1_s16(output, out[6]);
  output += 4;
  vst1_s16(output, out[7]);
  output += 4;
  vst1_s16(output, out[8]);
  output += 4;
  vst1_s16(output, out[9]);
  output += 4;
  vst1_s16(output, out[10]);
  output += 4;
  vst1_s16(output, out[11]);
  output += 4;
  vst1_s16(output, out[12]);
  output += 4;
  vst1_s16(output, out[13]);
  output += 4;
  vst1_s16(output, out[14]);
  output += 4;
  vst1_s16(output, out[15]);
}
    641 
// Column pass of the sparse 16x16 idct: reads 8 vectors of 4 int16_t
// intermediate values produced by pass 1, performs the 1-D transform on
// 8-wide vectors, and either stores to output (when non-NULL) or adds the
// rounded result to dest (uint8_t *, or uint16_t * when highbd_flag is
// nonzero).
void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
                                       int16_t *const output, void *const dest,
                                       const int stride,
                                       const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  // Doubled constants for vqrdmulh: (2*a*b + rounding) >> 16 matches the
  // full multiply + round + >> DCT_CONST_BITS path.
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t ind[8];
  int16x8_t in[4], step1[16], step2[16], out[16];

  // Load input (4x8)
  ind[0] = vld1_s16(input);
  input += 4;
  ind[1] = vld1_s16(input);
  input += 4;
  ind[2] = vld1_s16(input);
  input += 4;
  ind[3] = vld1_s16(input);
  input += 4;
  ind[4] = vld1_s16(input);
  input += 4;
  ind[5] = vld1_s16(input);
  input += 4;
  ind[6] = vld1_s16(input);
  input += 4;
  ind[7] = vld1_s16(input);

  // Transpose the 4x8 tile into four 8-wide vectors.
  transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
                    ind[7], &in[0], &in[1], &in[2], &in[3]);

  // stage 1 (all other step1 entries would be zero and are skipped)
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2 (butterflies with a zero input collapse to single multiplies)
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  step1[0] = step2[0];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  // Adds/subs against zero inputs reduce to plain copies.
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
    769 
    770 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
    771                                 int stride) {
    772   int16_t row_idct_output[16 * 16];
    773 
    774   // pass 1
    775   // Parallel idct on the upper 8 rows
    776   vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
    777 
    778   // Parallel idct on the lower 8 rows
    779   vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
    780                                stride, 0);
    781 
    782   // pass 2
    783   // Parallel idct to get the left 8 columns
    784   vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
    785 
    786   // Parallel idct to get the right 8 columns
    787   vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
    788                                0);
    789 }
    790 
    791 void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
    792                                int stride) {
    793   int16_t row_idct_output[16 * 16];
    794 
    795   // pass 1
    796   // Parallel idct on the upper 8 rows
    797   vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);
    798 
    799   // pass 2
    800   // Parallel idct to get the left 8 columns
    801   vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
    802 
    803   // Parallel idct to get the right 8 columns
    804   vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
    805                               0);
    806 }
    807 
// 2-D 16x16 inverse DCT for the sparse case where only the top-left 4x4
// block of coefficients is nonzero.
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int16_t row_idct_output[4 * 16];

  // pass 1
  // Row transform on the top-left 4x4 block only, producing 16 columns of
  // 4 intermediate values (4 * 16 buffer).
  vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);

  // pass 2
  // Column idct to get the left 8 columns
  vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);

  // Column idct to get the right 8 columns
  vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
                                    stride, 0);
}
    824