/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_ARM_IDCT_NEON_H_
#define VPX_DSP_ARM_IDCT_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"

DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
  16384 /*  cospi_0_64  */, 15137 /*  cospi_8_64  */,
  11585 /*  cospi_16_64 */, 6270 /*  cospi_24_64 */,
  16069 /*  cospi_4_64  */, 13623 /*  cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /*  cospi_28_64 */,
  16305 /*  cospi_2_64  */, 1606 /*  cospi_30_64 */,
  14449 /*  cospi_10_64 */, 7723 /*  cospi_22_64 */,
  15679 /*  cospi_6_64  */, -4756 /* -cospi_26_64 */,
  12665 /*  cospi_14_64 */, -10394 /* -cospi_18_64 */
};

DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
  16384 /*  cospi_0_64  */, 15137 /*  cospi_8_64  */,
  11585 /*  cospi_16_64 */, 6270 /*  cospi_24_64 */,
  16069 /*  cospi_4_64  */, 13623 /*  cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /*  cospi_28_64 */,
  16305 /*  cospi_2_64  */, 1606 /*  cospi_30_64 */,
  14449 /*  cospi_10_64 */, 7723 /*  cospi_22_64 */,
  15679 /*  cospi_6_64  */, -4756 /* -cospi_26_64 */,
  12665 /*  cospi_14_64 */, -10394 /* -cospi_18_64 */
};
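
// Note (added for orientation, not in the original header): the cospi_N_64
// values in the two tables above follow vpx_dsp/txfm_common.h, where each
// constant is the cosine scaled by 2^DCT_CONST_BITS, i.e. roughly
//   cospi_N_64 == (int)(16384 * cos(N * M_PI / 64) + 0.5)
// e.g. 11585 ~ 16384 * cos(16 * M_PI / 64), 15137 ~ 16384 * cos(8 * M_PI / 64).
// The entries for cospi_20_64, cospi_26_64 and cospi_18_64 are stored
// pre-negated, presumably so the multiply-accumulate lane intrinsics below can
// apply the subtraction without an extra negate.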

//------------------------------------------------------------------------------
// Use saturating add/sub to avoid overflow in the 2nd pass in high bit-depth
static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vqaddq_s16(a, b);
#else
  return vaddq_s16(a, b);
#endif
}

static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vqsubq_s16(a, b);
#else
  return vsubq_s16(a, b);
#endif
}
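
// Illustrative example (not from the original source): the two variants only
// differ when a + b leaves the int16_t range, e.g.
//   vaddq_s16(vdupq_n_s16(32767), vdupq_n_s16(1))   // lanes wrap to -32768
//   vqaddq_s16(vdupq_n_s16(32767), vdupq_n_s16(1))  // lanes saturate at 32767
// which is why the high bit-depth configuration prefers the saturating forms.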

//------------------------------------------------------------------------------

static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0,
                                               const int32x4x2_t s1) {
  int32x4x2_t t;
  t.val[0] = vaddq_s32(s0.val[0], s1.val[0]);
  t.val[1] = vaddq_s32(s0.val[1], s1.val[1]);
  return t;
}

static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0,
                                               const int32x4x2_t s1) {
  int32x4x2_t t;
  t.val[0] = vsubq_s32(s0.val[0], s1.val[0]);
  t.val[1] = vsubq_s32(s0.val[1], s1.val[1]);
  return t;
}

//------------------------------------------------------------------------------

// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                      const int16_t a_const) {
  // Shifting by DCT_CONST_BITS with rounding stays within 16 bits for
  // well-formed streams. See WRAPLOW and dct_const_round_shift for details.
  // The vqrdmulh instruction doubles the result and returns the high half,
  // which is essentially a right shift by 15. Doubling the constant up front
  // turns that into a right shift by DCT_CONST_BITS.
  // The largest possible constant used here is
  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
  // within the range of int16_t (+32767 / -32768) even when negated.
  return vqrdmulhq_n_s16(a, a_const * 2);
}
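
// Rough per-lane equivalent of the intrinsic above (illustrative sketch,
// ignoring the saturating behavior of vqrdmulh):
//   int32_t prod = (int32_t)a[i] * (int32_t)(a_const * 2);
//   result[i] = (int16_t)((2 * prod + (1 << 15)) >> 16);
//             // == (a[i] * a_const + (1 << 13)) >> 14, i.e. the familiar
//             // dct_const_round_shift(a[i] * a_const) with DCT_CONST_BITS == 14.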

// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
  // In both add_ and its pair, sub_, the input for well-formed streams will be
  // well within 16 bits (the input to the idct is the difference between two
  // frames and will be within -255 to 255, or 9 bits).
  // However, for inputs over about 25,000 (valid for int16_t, but not for idct
  // input) this function cannot use vaddq_s16.
  // In order to match existing behavior and the intentionally out-of-range
  // tests, expand the addition up to 32 bits to prevent truncation.
  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
  temp_low = vmulq_n_s32(temp_low, ab_const);
  temp_high = vmulq_n_s32(temp_high, ab_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
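
// Per lane, the sequence above amounts to (illustrative sketch):
//   int32_t sum = (int32_t)a[i] + (int32_t)b[i];  // widened, no 16-bit wrap
//   result[i] = (int16_t)ROUND_POWER_OF_TWO(sum * ab_const, DCT_CONST_BITS);
// The widening add is what lets the deliberately out-of-range test inputs
// mentioned above pass through without truncation.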

// Subtract b from a, then multiply by ab_const. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
  int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
  int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
  temp_low = vmulq_n_s32(temp_low, ab_const);
  temp_high = vmulq_n_s32(temp_high, ab_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}

// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
    const int16x8_t a, const int16_t a_const, const int16x8_t b,
    const int16_t b_const) {
  int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const);
  int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
  temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
  temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}

//------------------------------------------------------------------------------

// Note: The following 4 functions could use 32-bit operations for bit-depth 10.
//       However, although it's 20% faster with gcc, it's 20% slower with clang.
//       Use 64-bit operations for now.

// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int32x4x2_t
multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) {
  int64x2_t b[4];
  int32x4x2_t c;
  b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
  b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
  b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
  b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
  c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS),
                          vrshrn_n_s64(b[1], DCT_CONST_BITS));
  c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS),
                          vrshrn_n_s64(b[3], DCT_CONST_BITS));
  return c;
}

// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
  const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]);
  const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]);
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const);
  c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const);
  c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const);
  c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Subtract b from a, then multiply by ab_const. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
  const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]);
  const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]);
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const);
  c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const);
  c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const);
  c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b,
    const int32_t b_const) {
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
  c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
  c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
  c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
  c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const);
  c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const);
  c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const);
  c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Shift the output down by 6 and add it to the destination buffer.
static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
                                        const int16x8_t a2, const int16x8_t a3,
                                        const int16x8_t a4, const int16x8_t a5,
                                        const int16x8_t a6, const int16x8_t a7,
                                        uint8_t *b, const int b_stride) {
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
  b0 = vld1_u8(b);
  b += b_stride;
  b1 = vld1_u8(b);
  b += b_stride;
  b2 = vld1_u8(b);
  b += b_stride;
  b3 = vld1_u8(b);
  b += b_stride;
  b4 = vld1_u8(b);
  b += b_stride;
  b5 = vld1_u8(b);
  b += b_stride;
  b6 = vld1_u8(b);
  b += b_stride;
  b7 = vld1_u8(b);
  b -= (7 * b_stride);

  // c = b + (a >> 6)
  c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6);
  c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6);
  c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6);
  c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6);
  c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6);
  c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6);
  c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6);
  c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6);

  b0 = vqmovun_s16(c0);
  b1 = vqmovun_s16(c1);
  b2 = vqmovun_s16(c2);
  b3 = vqmovun_s16(c3);
  b4 = vqmovun_s16(c4);
  b5 = vqmovun_s16(c5);
  b6 = vqmovun_s16(c6);
  b7 = vqmovun_s16(c7);

  vst1_u8(b, b0);
  b += b_stride;
  vst1_u8(b, b1);
  b += b_stride;
  vst1_u8(b, b2);
  b += b_stride;
  vst1_u8(b, b3);
  b += b_stride;
  vst1_u8(b, b4);
  b += b_stride;
  vst1_u8(b, b5);
  b += b_stride;
  vst1_u8(b, b6);
  b += b_stride;
  vst1_u8(b, b7);
}
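
// Per pixel, the routine above is roughly (illustrative sketch):
//   b[row][col] = clamp(b[row][col] + ROUND_POWER_OF_TWO(a[row][col], 6), 0, 255);
// vrsraq_n_s16 fuses the rounded shift by 6 with the accumulate, and
// vqmovun_s16 supplies the unsigned saturating narrow that performs the clamp.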

static INLINE uint8x16_t create_dcq(const int16_t dc) {
  // Clip on both sides so that gcc may compile this down to a 'usat'
  // (unsigned saturate) instruction.
  const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
  return vdupq_n_u8((uint8_t)t);
}
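
// Hypothetical usage sketch (names and flow invented for illustration): a
// DC-only inverse transform broadcasts the clipped DC value and applies it to
// a row of pixels with a saturating add; a negative DC would instead use
// create_dcq(-dc) together with vqsubq_u8.
//   const uint8x16_t dc = create_dcq(dc_value);  // assumes dc_value >= 0
//   uint8x16_t row = vld1q_u8(dest);
//   row = vqaddq_u8(row, dc);
//   vst1q_u8(dest, row);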

static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
                                         int16x8_t *const a0,
                                         int16x8_t *const a1) {
  int16x4_t b0, b1, b2, b3;
  int32x4_t c0, c1, c2, c3;
  int16x8_t d0, d1;

  transpose_s16_4x4q(a0, a1);
  b0 = vget_low_s16(*a0);
  b1 = vget_high_s16(*a0);
  b2 = vget_low_s16(*a1);
  b3 = vget_high_s16(*a1);
  c0 = vmull_lane_s16(b0, cospis, 2);
  c2 = vmull_lane_s16(b1, cospis, 2);
  c1 = vsubq_s32(c0, c2);
  c0 = vaddq_s32(c0, c2);
  c2 = vmull_lane_s16(b2, cospis, 3);
  c3 = vmull_lane_s16(b2, cospis, 1);
  c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
  c3 = vmlal_lane_s16(c3, b3, cospis, 3);
  b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
  b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
  b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
  b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
  d0 = vcombine_s16(b0, b1);
  d1 = vcombine_s16(b3, b2);
  *a0 = vaddq_s16(d0, d1);
  *a1 = vsubq_s16(d0, d1);
}
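
// For orientation (sketch added here; lane assignments assume cospis was
// loaded from kCospi[0..3] = { cospi_0_64, cospi_8_64, cospi_16_64,
// cospi_24_64 }), the lane arithmetic above computes, per column:
//   c0 = (b0 + b1) * cospi_16_64                // even half
//   c1 = (b0 - b1) * cospi_16_64
//   c2 = b2 * cospi_24_64 - b3 * cospi_8_64     // odd half
//   c3 = b2 * cospi_8_64  + b3 * cospi_24_64
// followed by the DCT_CONST_BITS rounding shift and the final butterfly
// add/sub, matching the usual 4-point IDCT flow graph.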

static INLINE void idct8x8_12_pass1_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
    int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2,
    int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5,
    int16x4_t *const io6, int16x4_t *const io7) {
  int16x4_t step1[8], step2[8];
  int32x4_t t32[2];

  transpose_s16_4x4d(io0, io1, io2, io3);

  // stage 1
  step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3);
  step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2);
  step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1);
  step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2);
  step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3);
  step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1);

  step2[4] = vadd_s16(step1[4], step1[5]);
  step2[5] = vsub_s16(step1[4], step1[5]);
  step2[6] = vsub_s16(step1[7], step1[6]);
  step2[7] = vadd_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vadd_s16(step2[1], step2[3]);
  step1[1] = vadd_s16(step2[1], step2[2]);
  step1[2] = vsub_s16(step2[1], step2[2]);
  step1[3] = vsub_s16(step2[1], step2[3]);

  t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);

  // stage 4
  *io0 = vadd_s16(step1[0], step2[7]);
  *io1 = vadd_s16(step1[1], step1[6]);
  *io2 = vadd_s16(step1[2], step1[5]);
  *io3 = vadd_s16(step1[3], step2[4]);
  *io4 = vsub_s16(step1[3], step2[4]);
  *io5 = vsub_s16(step1[2], step1[5]);
  *io6 = vsub_s16(step1[1], step1[6]);
  *io7 = vsub_s16(step1[0], step2[7]);
}

static INLINE void idct8x8_12_pass2_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
    const int16x4_t input0, const int16x4_t input1, const int16x4_t input2,
    const int16x4_t input3, const int16x4_t input4, const int16x4_t input5,
    const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0,
    int16x8_t *const output1, int16x8_t *const output2,
    int16x8_t *const output3, int16x8_t *const output4,
    int16x8_t *const output5, int16x8_t *const output6,
    int16x8_t *const output7) {
  int16x8_t in[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6,
                    input7, &in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
  step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
  step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
  step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
  step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
  step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);

  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[1], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[1], step2[3]);

  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *output0 = vaddq_s16(step1[0], step2[7]);
  *output1 = vaddq_s16(step1[1], step1[6]);
  *output2 = vaddq_s16(step1[2], step1[5]);
  *output3 = vaddq_s16(step1[3], step2[4]);
  *output4 = vsubq_s16(step1[3], step2[4]);
  *output5 = vsubq_s16(step1[2], step1[5]);
  *output6 = vsubq_s16(step1[1], step1[6]);
  *output7 = vsubq_s16(step1[0], step2[7]);
}

static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
                                     const int16x4_t cospis1,
                                     int16x8_t *const io0, int16x8_t *const io1,
                                     int16x8_t *const io2, int16x8_t *const io3,
                                     int16x8_t *const io4, int16x8_t *const io5,
                                     int16x8_t *const io6,
                                     int16x8_t *const io7) {
  int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
      input_7l, input_7h;
  int16x4_t step1l[4], step1h[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  input_1l = vget_low_s16(*io1);
  input_1h = vget_high_s16(*io1);
  input_3l = vget_low_s16(*io3);
  input_3h = vget_high_s16(*io3);
  input_5l = vget_low_s16(*io5);
  input_5h = vget_high_s16(*io5);
  input_7l = vget_low_s16(*io7);
  input_7h = vget_high_s16(*io7);
  step1l[0] = vget_low_s16(*io0);
  step1h[0] = vget_high_s16(*io0);
  step1l[1] = vget_low_s16(*io2);
  step1h[1] = vget_high_s16(*io2);
  step1l[2] = vget_low_s16(*io4);
  step1h[2] = vget_high_s16(*io4);
  step1l[3] = vget_low_s16(*io6);
  step1h[3] = vget_high_s16(*io6);

  t32[0] = vmull_lane_s16(input_1l, cospis1, 3);
  t32[1] = vmull_lane_s16(input_1h, cospis1, 3);
  t32[2] = vmull_lane_s16(input_3l, cospis1, 2);
  t32[3] = vmull_lane_s16(input_3h, cospis1, 2);
  t32[4] = vmull_lane_s16(input_3l, cospis1, 1);
  t32[5] = vmull_lane_s16(input_3h, cospis1, 1);
  t32[6] = vmull_lane_s16(input_1l, cospis1, 0);
  t32[7] = vmull_lane_s16(input_1h, cospis1, 0);
  t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0);
  t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0);
  t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1);
  t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1);
  t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2);
  t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
  t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
  t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
  step1[4] = vcombine_s16(t16[0], t16[1]);
  step1[5] = vcombine_s16(t16[2], t16[3]);
  step1[6] = vcombine_s16(t16[4], t16[5]);
  step1[7] = vcombine_s16(t16[6], t16[7]);

  // stage 2
  t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
  t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
  t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
  t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
  t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
  t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
  t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
  t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
  t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
  t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
  step2[0] = vcombine_s16(t16[0], t16[1]);
  step2[1] = vcombine_s16(t16[2], t16[3]);
  step2[2] = vcombine_s16(t16[4], t16[5]);
  step2[3] = vcombine_s16(t16[6], t16[7]);

  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);

  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *io0 = vaddq_s16(step1[0], step2[7]);
  *io1 = vaddq_s16(step1[1], step1[6]);
  *io2 = vaddq_s16(step1[2], step1[5]);
  *io3 = vaddq_s16(step1[3], step2[4]);
  *io4 = vsubq_s16(step1[3], step2[4]);
  *io5 = vsubq_s16(step1[2], step1[5]);
  *io6 = vsubq_s16(step1[1], step1[6]);
  *io7 = vsubq_s16(step1[0], step2[7]);
}

static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
                                              int16x8_t *const d0,
                                              int16x8_t *const d1) {
  int16x4_t t16[4];

  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  *d0 = vcombine_s16(t16[0], t16[1]);
  *d1 = vcombine_s16(t16[2], t16[3]);
}

static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
                                            const int16x8_t s1,
                                            const int16x4_t cospi_0_8_16_24,
                                            int32x4_t *const t32) {
  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
}

static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
                                     const int16x4_t cospi_0_8_16_24,
                                     int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}
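
// Per lane (illustrative; assumes cospi_0_8_16_24 holds
// { cospi_0_64, cospi_8_64, cospi_16_64, cospi_24_64 } as in kCospi above):
//   d0[i] = dct_const_round_shift(s0[i] * cospi_24_64 - s1[i] * cospi_8_64);
//   d1[i] = dct_const_round_shift(s1[i] * cospi_24_64 + s0[i] * cospi_8_64);
// idct_cospi_8_24_neg_q below computes the same pair but negates d1.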

static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
                                         const int16x4_t cospi_0_8_16_24,
                                         int16x8_t *const d0,
                                         int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  t32[2] = vnegq_s32(t32[2]);
  t32[3] = vnegq_s32(t32[3]);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
                                      const int16x4_t cospi_0_8_16_24,
                                      int16x8_t *const d0,
                                      int16x8_t *const d1) {
  int32x4_t t32[6];

  t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
  t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
  t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_2_30_10_22,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_4_12_20N_28,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_6_26N_14_18N,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_2_30_10_22,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_4_12_20N_28,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_6_26N_14_18N,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct16x16_add_stage7(const int16x8_t *const step2,
                                        int16x8_t *const out) {
#if CONFIG_VP9_HIGHBITDEPTH
  // Use saturating add/sub to avoid overflow in 2nd pass
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
#else
  out[0] = vaddq_s16(step2[0], step2[15]);
  out[1] = vaddq_s16(step2[1], step2[14]);
  out[2] = vaddq_s16(step2[2], step2[13]);
  out[3] = vaddq_s16(step2[3], step2[12]);
  out[4] = vaddq_s16(step2[4], step2[11]);
  out[5] = vaddq_s16(step2[5], step2[10]);
  out[6] = vaddq_s16(step2[6], step2[9]);
  out[7] = vaddq_s16(step2[7], step2[8]);
  out[8] = vsubq_s16(step2[7], step2[8]);
  out[9] = vsubq_s16(step2[6], step2[9]);
  out[10] = vsubq_s16(step2[5], step2[10]);
  out[11] = vsubq_s16(step2[4], step2[11]);
  out[12] = vsubq_s16(step2[3], step2[12]);
  out[13] = vsubq_s16(step2[2], step2[13]);
  out[14] = vsubq_s16(step2[1], step2[14]);
  out[15] = vsubq_s16(step2[0], step2[15]);
#endif
}

static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
                                         int16_t *output) {
  // Save the result into output
  vst1q_s16(output, out[0]);
  output += 16;
  vst1q_s16(output, out[1]);
  output += 16;
  vst1q_s16(output, out[2]);
  output += 16;
  vst1q_s16(output, out[3]);
  output += 16;
  vst1q_s16(output, out[4]);
  output += 16;
  vst1q_s16(output, out[5]);
  output += 16;
  vst1q_s16(output, out[6]);
  output += 16;
  vst1q_s16(output, out[7]);
  output += 16;
  vst1q_s16(output, out[8]);
  output += 16;
  vst1q_s16(output, out[9]);
  output += 16;
  vst1q_s16(output, out[10]);
  output += 16;
  vst1q_s16(output, out[11]);
  output += 16;
  vst1q_s16(output, out[12]);
  output += 16;
  vst1q_s16(output, out[13]);
  output += 16;
  vst1q_s16(output, out[14]);
  output += 16;
  vst1q_s16(output, out[15]);
}

static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
                                    const int stride) {
  uint8x8_t d = vld1_u8(*dest);
  uint16x8_t q;

  res = vrshrq_n_s16(res, 6);
  q = vaddw_u8(vreinterpretq_u16_s16(res), d);
  d = vqmovun_s16(vreinterpretq_s16_u16(q));
  vst1_u8(*dest, d);
  *dest += stride;
}

static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max,
                                           uint16_t **dest, const int stride) {
  uint16x8_t d = vld1q_u16(*dest);

  res = vqaddq_s16(res, vreinterpretq_s16_u16(d));
  res = vminq_s16(res, max);
  d = vqshluq_n_s16(res, 0);
  vst1q_u16(*dest, d);
  *dest += stride;
}

static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest,
                                               const int stride) {
  uint16x8_t d = vld1q_u16(*dest);

  res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6);
  d = vmovl_u8(vqmovun_s16(res));
  vst1q_u16(*dest, d);
  *dest += stride;
}
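
// Per pixel (illustrative sketch): with bd == 8 the destination holds 8-bit
// samples widened to uint16_t lanes, so this reduces to
//   dest[i] = clamp(dest[i] + ROUND_POWER_OF_TWO(res[i], 6), 0, 255);
// vrsraq_n_s16 does the rounded shift-and-accumulate, vqmovun_s16 clamps to
// [0, 255], and vmovl_u8 widens the result back to uint16_t lanes for storage.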

static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a,
                                            uint16_t *out, const int b_stride) {
  highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride);
}

static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
                                              uint16_t *dest, const int stride,
                                              const int bd) {
  // Add the result to dest
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  int16x8_t o[16];
  o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
                      vrshrn_n_s32(out[0].val[1], 6));
  o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
                      vrshrn_n_s32(out[1].val[1], 6));
  o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
                      vrshrn_n_s32(out[2].val[1], 6));
  o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
                      vrshrn_n_s32(out[3].val[1], 6));
  o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
                      vrshrn_n_s32(out[4].val[1], 6));
  o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
                      vrshrn_n_s32(out[5].val[1], 6));
  o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
                      vrshrn_n_s32(out[6].val[1], 6));
  o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
                      vrshrn_n_s32(out[7].val[1], 6));
  o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
                      vrshrn_n_s32(out[8].val[1], 6));
  o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
                      vrshrn_n_s32(out[9].val[1], 6));
  o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
                       vrshrn_n_s32(out[10].val[1], 6));
  o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
                       vrshrn_n_s32(out[11].val[1], 6));
  o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
                       vrshrn_n_s32(out[12].val[1], 6));
  o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
                       vrshrn_n_s32(out[13].val[1], 6));
  o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
                       vrshrn_n_s32(out[14].val[1], 6));
  o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
                       vrshrn_n_s32(out[15].val[1], 6));
  highbd_idct16x16_add8x1(o[0], max, &dest, stride);
  highbd_idct16x16_add8x1(o[1], max, &dest, stride);
  highbd_idct16x16_add8x1(o[2], max, &dest, stride);
  highbd_idct16x16_add8x1(o[3], max, &dest, stride);
  highbd_idct16x16_add8x1(o[4], max, &dest, stride);
  highbd_idct16x16_add8x1(o[5], max, &dest, stride);
  highbd_idct16x16_add8x1(o[6], max, &dest, stride);
  highbd_idct16x16_add8x1(o[7], max, &dest, stride);
  highbd_idct16x16_add8x1(o[8], max, &dest, stride);
  highbd_idct16x16_add8x1(o[9], max, &dest, stride);
  highbd_idct16x16_add8x1(o[10], max, &dest, stride);
  highbd_idct16x16_add8x1(o[11], max, &dest, stride);
  highbd_idct16x16_add8x1(o[12], max, &dest, stride);
  highbd_idct16x16_add8x1(o[13], max, &dest, stride);
  highbd_idct16x16_add8x1(o[14], max, &dest, stride);
  highbd_idct16x16_add8x1(o[15], max, &dest, stride);
}

void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                  void *const dest, const int stride,
                                  const int highbd_flag);

void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
                                 void *const dest, const int stride,
                                 const int highbd_flag);

void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                       int16_t *output);

void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
                                       int16_t *const output, void *const dest,
                                       const int stride, const int highbd_flag);

void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
                        const int stride, const int highbd_flag);

void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);
void vpx_idct32_16_neon(const int16_t *const input, void *const output,
                        const int stride, const int highbd_flag);

void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);
void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
                       const int highbd_flag);

#endif  // VPX_DSP_ARM_IDCT_NEON_H_