/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_ARM_IDCT_NEON_H_
#define VPX_DSP_ARM_IDCT_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"

DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
  16384 /* cospi_0_64 */,  15137 /* cospi_8_64 */,
  11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
  16069 /* cospi_4_64 */,  13623 /* cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
  16305 /* cospi_2_64 */,  1606 /* cospi_30_64 */,
  14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
  15679 /* cospi_6_64 */,  -4756 /* -cospi_26_64 */,
  12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};

DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
  16384 /* cospi_0_64 */,  15137 /* cospi_8_64 */,
  11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
  16069 /* cospi_4_64 */,  13623 /* cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
  16305 /* cospi_2_64 */,  1606 /* cospi_30_64 */,
  14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
  15679 /* cospi_6_64 */,  -4756 /* -cospi_26_64 */,
  12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};

//------------------------------------------------------------------------------
// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth.
static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vqaddq_s16(a, b);
#else
  return vaddq_s16(a, b);
#endif
}

static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vqsubq_s16(a, b);
#else
  return vsubq_s16(a, b);
#endif
}

//------------------------------------------------------------------------------

static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0,
                                               const int32x4x2_t s1) {
  int32x4x2_t t;
  t.val[0] = vaddq_s32(s0.val[0], s1.val[0]);
  t.val[1] = vaddq_s32(s0.val[1], s1.val[1]);
  return t;
}

static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0,
                                               const int32x4x2_t s1) {
  int32x4x2_t t;
  t.val[0] = vsubq_s32(s0.val[0], s1.val[0]);
  t.val[1] = vsubq_s32(s0.val[1], s1.val[1]);
  return t;
}

//------------------------------------------------------------------------------

// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                      const int16_t a_const) {
  // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well-formed
  // streams. See WRAPLOW and dct_const_round_shift for details.
  // This instruction doubles the result and returns the high half, essentially
  // resulting in a right shift by 15. By multiplying the constant first, that
  // becomes a right shift by DCT_CONST_BITS.
  // The largest possible value used here is
  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
  // within the range of int16_t (+32767 / -32768) even when negated.
  return vqrdmulhq_n_s16(a, a_const * 2);
}
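// A minimal worked example of the doubling trick above (lane values chosen
// for illustration, not taken from a real stream): with a lane value of 1000
// and a_const = cospi_16_64 = 11585, dct_const_round_shift computes
// (1000 * 11585 + (1 << 13)) >> 14 = 707, while vqrdmulhq_n_s16(a, 11585 * 2)
// computes (2 * 1000 * 23170 + (1 << 15)) >> 16 = 707, the same result.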
// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
  // In both add_ and its pair, sub_, the input for well-formed streams will be
  // well within 16 bits (input to the idct is the difference between two
  // frames and will be within -255 to 255, or 9 bits).
  // However, for inputs over about 25,000 (valid for int16_t, but not for idct
  // input) this function cannot use vaddq_s16.
  // In order to match existing behavior and intentionally out of range tests,
  // expand the addition up to 32 bits to prevent truncation.
  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
  temp_low = vmulq_n_s32(temp_low, ab_const);
  temp_high = vmulq_n_s32(temp_high, ab_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
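// Hypothetical illustration of the widening above (legal int16_t lanes, not
// possible for conforming idct input): with a = b = 25000, vaddq_s16 would
// wrap 50000 to -15536, while vaddl_s16 keeps 50000 intact in 32 bits so the
// multiply and rounding shift still match the C reference.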
// Subtract b from a, then multiply by ab_const. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
  int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
  int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
  temp_low = vmulq_n_s32(temp_low, ab_const);
  temp_high = vmulq_n_s32(temp_high, ab_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}

// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
    const int16x8_t a, const int16_t a_const, const int16x8_t b,
    const int16_t b_const) {
  int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const);
  int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
  temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
  temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}

//------------------------------------------------------------------------------

// Note: The following 4 functions could use 32-bit operations for bit-depth
// 10. However, although it's 20% faster with gcc, it's 20% slower with clang.
// Use 64-bit operations for now.
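// Rough headroom argument (an assumption about worst-case 12-bit inputs, not
// a proven bound): the 32-bit lanes feeding these helpers can carry values
// whose product with a 14-bit cospi constant may no longer fit in int32, so
// each multiply is widened to 64 bits (vmull/vmlal) before rounding back
// down; the note above explains why the narrower 10-bit shortcut is not
// taken.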
// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int32x4x2_t
multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) {
  int64x2_t b[4];
  int32x4x2_t c;
  b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
  b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
  b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
  b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
  c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS),
                          vrshrn_n_s64(b[1], DCT_CONST_BITS));
  c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS),
                          vrshrn_n_s64(b[3], DCT_CONST_BITS));
  return c;
}

// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
  const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]);
  const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]);
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const);
  c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const);
  c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const);
  c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Subtract b from a, then multiply by ab_const. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
  const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]);
  const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]);
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const);
  c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const);
  c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const);
  c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
// DCT_CONST_BITS.
static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual(
    const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b,
    const int32_t b_const) {
  int64x2_t c[4];
  int32x4x2_t d;
  c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
  c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
  c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
  c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
  c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const);
  c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const);
  c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const);
  c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const);
  d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                          vrshrn_n_s64(c[1], DCT_CONST_BITS));
  d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                          vrshrn_n_s64(c[3], DCT_CONST_BITS));
  return d;
}

// Shift the output down by 6 and add it to the destination buffer.
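// Scalar sketch of the per-pixel operation below (clip_pixel and
// ROUND_POWER_OF_TWO as in the C idct code):
//   b[i] = clip_pixel(b[i] + ROUND_POWER_OF_TWO(a[i], 6));
// vrsraq_n_s16 performs the rounding shift and the accumulate in one
// instruction, and vqmovun_s16 supplies the final clamp to [0, 255].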
static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
                                        const int16x8_t a2, const int16x8_t a3,
                                        const int16x8_t a4, const int16x8_t a5,
                                        const int16x8_t a6, const int16x8_t a7,
                                        uint8_t *b, const int b_stride) {
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
  b0 = vld1_u8(b);
  b += b_stride;
  b1 = vld1_u8(b);
  b += b_stride;
  b2 = vld1_u8(b);
  b += b_stride;
  b3 = vld1_u8(b);
  b += b_stride;
  b4 = vld1_u8(b);
  b += b_stride;
  b5 = vld1_u8(b);
  b += b_stride;
  b6 = vld1_u8(b);
  b += b_stride;
  b7 = vld1_u8(b);
  b -= (7 * b_stride);

  // c = b + (a >> 6)
  c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6);
  c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6);
  c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6);
  c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6);
  c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6);
  c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6);
  c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6);
  c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6);

  b0 = vqmovun_s16(c0);
  b1 = vqmovun_s16(c1);
  b2 = vqmovun_s16(c2);
  b3 = vqmovun_s16(c3);
  b4 = vqmovun_s16(c4);
  b5 = vqmovun_s16(c5);
  b6 = vqmovun_s16(c6);
  b7 = vqmovun_s16(c7);

  vst1_u8(b, b0);
  b += b_stride;
  vst1_u8(b, b1);
  b += b_stride;
  vst1_u8(b, b2);
  b += b_stride;
  vst1_u8(b, b3);
  b += b_stride;
  vst1_u8(b, b4);
  b += b_stride;
  vst1_u8(b, b5);
  b += b_stride;
  vst1_u8(b, b6);
  b += b_stride;
  vst1_u8(b, b7);
}

static INLINE uint8x16_t create_dcq(const int16_t dc) {
  // Clip to [0, 255]; gcc may compile this clamp to a single 'usat'
  // instruction.
  const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
  return vdupq_n_u8((uint8_t)t);
}

static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
                                         int16x8_t *const a0,
                                         int16x8_t *const a1) {
  int16x4_t b0, b1, b2, b3;
  int32x4_t c0, c1, c2, c3;
  int16x8_t d0, d1;

  transpose_s16_4x4q(a0, a1);
  b0 = vget_low_s16(*a0);
  b1 = vget_high_s16(*a0);
  b2 = vget_low_s16(*a1);
  b3 = vget_high_s16(*a1);
  c0 = vmull_lane_s16(b0, cospis, 2);
  c2 = vmull_lane_s16(b1, cospis, 2);
  c1 = vsubq_s32(c0, c2);
  c0 = vaddq_s32(c0, c2);
  c2 = vmull_lane_s16(b2, cospis, 3);
  c3 = vmull_lane_s16(b2, cospis, 1);
  c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
  c3 = vmlal_lane_s16(c3, b3, cospis, 3);
  b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
  b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
  b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
  b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
  d0 = vcombine_s16(b0, b1);
  d1 = vcombine_s16(b3, b2);
  *a0 = vaddq_s16(d0, d1);
  *a1 = vsubq_s16(d0, d1);
}
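// For reference, the kernel above follows the scalar idct4_c butterfly from
// vpx_dsp/inv_txfm.c (cospis lanes: 1 = cospi_8_64, 2 = cospi_16_64,
// 3 = cospi_24_64):
//   step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
//   step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
//   step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
//   step[3] = dct_const_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
//   out[0] = step[0] + step[3];
//   out[1] = step[1] + step[2];
//   out[2] = step[1] - step[2];
//   out[3] = step[0] - step[3];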
static INLINE void idct8x8_12_pass1_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
    int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2,
    int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5,
    int16x4_t *const io6, int16x4_t *const io7) {
  int16x4_t step1[8], step2[8];
  int32x4_t t32[2];

  transpose_s16_4x4d(io0, io1, io2, io3);

  // stage 1
  step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3);
  step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2);
  step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1);
  step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2);
  step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3);
  step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1);

  step2[4] = vadd_s16(step1[4], step1[5]);
  step2[5] = vsub_s16(step1[4], step1[5]);
  step2[6] = vsub_s16(step1[7], step1[6]);
  step2[7] = vadd_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vadd_s16(step2[1], step2[3]);
  step1[1] = vadd_s16(step2[1], step2[2]);
  step1[2] = vsub_s16(step2[1], step2[2]);
  step1[3] = vsub_s16(step2[1], step2[3]);

  t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);

  // stage 4
  *io0 = vadd_s16(step1[0], step2[7]);
  *io1 = vadd_s16(step1[1], step1[6]);
  *io2 = vadd_s16(step1[2], step1[5]);
  *io3 = vadd_s16(step1[3], step2[4]);
  *io4 = vsub_s16(step1[3], step2[4]);
  *io5 = vsub_s16(step1[2], step1[5]);
  *io6 = vsub_s16(step1[1], step1[6]);
  *io7 = vsub_s16(step1[0], step2[7]);
}

static INLINE void idct8x8_12_pass2_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
    const int16x4_t input0, const int16x4_t input1, const int16x4_t input2,
    const int16x4_t input3, const int16x4_t input4, const int16x4_t input5,
    const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0,
    int16x8_t *const output1, int16x8_t *const output2,
    int16x8_t *const output3, int16x8_t *const output4,
    int16x8_t *const output5, int16x8_t *const output6,
    int16x8_t *const output7) {
  int16x8_t in[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6,
                    input7, &in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
  step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
  step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
  step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
  step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
  step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);

  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[1], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[1], step2[3]);

  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *output0 = vaddq_s16(step1[0], step2[7]);
  *output1 = vaddq_s16(step1[1], step1[6]);
  *output2 = vaddq_s16(step1[2], step1[5]);
  *output3 = vaddq_s16(step1[3], step2[4]);
  *output4 = vsubq_s16(step1[3], step2[4]);
  *output5 = vsubq_s16(step1[2], step1[5]);
  *output6 = vsubq_s16(step1[1], step1[6]);
  *output7 = vsubq_s16(step1[0], step2[7]);
}
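// Note on the two 8x8 variants: the 12-coefficient passes above use
// vqrdmulh_lane with constants assumed to be pre-doubled by the caller (the
// same trick as multiply_shift_and_narrow_s16), while idct8x8_64_1d_bd8 below
// keeps the exact vmull/vmlal + vrshrn form of dct_const_round_shift.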
static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
                                     const int16x4_t cospis1,
                                     int16x8_t *const io0, int16x8_t *const io1,
                                     int16x8_t *const io2, int16x8_t *const io3,
                                     int16x8_t *const io4, int16x8_t *const io5,
                                     int16x8_t *const io6,
                                     int16x8_t *const io7) {
  int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
      input_7l, input_7h;
  int16x4_t step1l[4], step1h[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  input_1l = vget_low_s16(*io1);
  input_1h = vget_high_s16(*io1);
  input_3l = vget_low_s16(*io3);
  input_3h = vget_high_s16(*io3);
  input_5l = vget_low_s16(*io5);
  input_5h = vget_high_s16(*io5);
  input_7l = vget_low_s16(*io7);
  input_7h = vget_high_s16(*io7);
  step1l[0] = vget_low_s16(*io0);
  step1h[0] = vget_high_s16(*io0);
  step1l[1] = vget_low_s16(*io2);
  step1h[1] = vget_high_s16(*io2);
  step1l[2] = vget_low_s16(*io4);
  step1h[2] = vget_high_s16(*io4);
  step1l[3] = vget_low_s16(*io6);
  step1h[3] = vget_high_s16(*io6);

  t32[0] = vmull_lane_s16(input_1l, cospis1, 3);
  t32[1] = vmull_lane_s16(input_1h, cospis1, 3);
  t32[2] = vmull_lane_s16(input_3l, cospis1, 2);
  t32[3] = vmull_lane_s16(input_3h, cospis1, 2);
  t32[4] = vmull_lane_s16(input_3l, cospis1, 1);
  t32[5] = vmull_lane_s16(input_3h, cospis1, 1);
  t32[6] = vmull_lane_s16(input_1l, cospis1, 0);
  t32[7] = vmull_lane_s16(input_1h, cospis1, 0);
  t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0);
  t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0);
  t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1);
  t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1);
  t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2);
  t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
  t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
  t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
  step1[4] = vcombine_s16(t16[0], t16[1]);
  step1[5] = vcombine_s16(t16[2], t16[3]);
  step1[6] = vcombine_s16(t16[4], t16[5]);
  step1[7] = vcombine_s16(t16[6], t16[7]);

  // stage 2
  t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
  t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
  t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
  t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
  t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
  t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
  t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
  t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
  t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
  t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
  step2[0] = vcombine_s16(t16[0], t16[1]);
  step2[1] = vcombine_s16(t16[2], t16[3]);
  step2[2] = vcombine_s16(t16[4], t16[5]);
  step2[3] = vcombine_s16(t16[6], t16[7]);

  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);

  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *io0 = vaddq_s16(step1[0], step2[7]);
  *io1 = vaddq_s16(step1[1], step1[6]);
  *io2 = vaddq_s16(step1[2], step1[5]);
  *io3 = vaddq_s16(step1[3], step2[4]);
  *io4 = vsubq_s16(step1[3], step2[4]);
  *io5 = vsubq_s16(step1[2], step1[5]);
  *io6 = vsubq_s16(step1[1], step1[6]);
  *io7 = vsubq_s16(step1[0], step2[7]);
}

static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
                                              int16x8_t *const d0,
                                              int16x8_t *const d1) {
  int16x4_t t16[4];

  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
  *d0 = vcombine_s16(t16[0], t16[1]);
  *d1 = vcombine_s16(t16[2], t16[3]);
}
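// The idct_cospi_* helpers below are all variations, up to lane choice and
// sign, of the 2-point rotation
//   d0 = dct_const_round_shift(s0 * cospi_a - s1 * cospi_b);
//   d1 = dct_const_round_shift(s0 * cospi_b + s1 * cospi_a);
// Where kCospi stores a negated constant (e.g. -cospi_26_64), the
// vmlal/vmlsl roles are swapped so the effective sign is unchanged.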
static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
                                            const int16x8_t s1,
                                            const int16x4_t cospi_0_8_16_24,
                                            int32x4_t *const t32) {
  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
}

static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
                                     const int16x4_t cospi_0_8_16_24,
                                     int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
                                         const int16x4_t cospi_0_8_16_24,
                                         int16x8_t *const d0,
                                         int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  t32[2] = vnegq_s32(t32[2]);
  t32[3] = vnegq_s32(t32[3]);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
                                      const int16x4_t cospi_0_8_16_24,
                                      int16x8_t *const d0,
                                      int16x8_t *const d1) {
  int32x4_t t32[6];

  t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
  t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
  t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_2_30_10_22,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}
static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_4_12_20N_28,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_6_26N_14_18N,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_2_30_10_22,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_4_12_20N_28,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}
static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_6_26N_14_18N,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3);
  idct16x16_add_wrap_low_8x2(t32, d0, d1);
}

static INLINE void idct16x16_add_stage7(const int16x8_t *const step2,
                                        int16x8_t *const out) {
#if CONFIG_VP9_HIGHBITDEPTH
  // Use saturating add/sub to avoid overflow in 2nd pass
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
#else
  out[0] = vaddq_s16(step2[0], step2[15]);
  out[1] = vaddq_s16(step2[1], step2[14]);
  out[2] = vaddq_s16(step2[2], step2[13]);
  out[3] = vaddq_s16(step2[3], step2[12]);
  out[4] = vaddq_s16(step2[4], step2[11]);
  out[5] = vaddq_s16(step2[5], step2[10]);
  out[6] = vaddq_s16(step2[6], step2[9]);
  out[7] = vaddq_s16(step2[7], step2[8]);
  out[8] = vsubq_s16(step2[7], step2[8]);
  out[9] = vsubq_s16(step2[6], step2[9]);
  out[10] = vsubq_s16(step2[5], step2[10]);
  out[11] = vsubq_s16(step2[4], step2[11]);
  out[12] = vsubq_s16(step2[3], step2[12]);
  out[13] = vsubq_s16(step2[2], step2[13]);
  out[14] = vsubq_s16(step2[1], step2[14]);
  out[15] = vsubq_s16(step2[0], step2[15]);
#endif
}

static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
                                         int16_t *output) {
  // Save the result into output
  vst1q_s16(output, out[0]);
  output += 16;
  vst1q_s16(output, out[1]);
  output += 16;
  vst1q_s16(output, out[2]);
  output += 16;
  vst1q_s16(output, out[3]);
  output += 16;
  vst1q_s16(output, out[4]);
  output += 16;
  vst1q_s16(output, out[5]);
  output += 16;
  vst1q_s16(output, out[6]);
  output += 16;
  vst1q_s16(output, out[7]);
  output += 16;
  vst1q_s16(output, out[8]);
  output += 16;
  vst1q_s16(output, out[9]);
  output += 16;
  vst1q_s16(output, out[10]);
  output += 16;
  vst1q_s16(output, out[11]);
  output += 16;
  vst1q_s16(output, out[12]);
  output += 16;
  vst1q_s16(output, out[13]);
  output += 16;
  vst1q_s16(output, out[14]);
  output += 16;
  vst1q_s16(output, out[15]);
}
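// Scalar sketch of idct16x16_add8x1 below, one row of 8 pixels per call
// (clip_pixel as in the C idct code):
//   dest[i] = clip_pixel(dest[i] + ROUND_POWER_OF_TWO(res[i], 6));
// vaddw_u8 widens the pixels to 16 bits before the add, and vqmovun_s16
// provides the final clamp to [0, 255].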
static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
                                    const int stride) {
  uint8x8_t d = vld1_u8(*dest);
  uint16x8_t q;

  res = vrshrq_n_s16(res, 6);
  q = vaddw_u8(vreinterpretq_u16_s16(res), d);
  d = vqmovun_s16(vreinterpretq_s16_u16(q));
  vst1_u8(*dest, d);
  *dest += stride;
}

static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max,
                                           uint16_t **dest, const int stride) {
  uint16x8_t d = vld1q_u16(*dest);

  res = vqaddq_s16(res, vreinterpretq_s16_u16(d));
  res = vminq_s16(res, max);
  d = vqshluq_n_s16(res, 0);
  vst1q_u16(*dest, d);
  *dest += stride;
}

static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest,
                                               const int stride) {
  uint16x8_t d = vld1q_u16(*dest);

  res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6);
  d = vmovl_u8(vqmovun_s16(res));
  vst1q_u16(*dest, d);
  *dest += stride;
}

static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a,
                                            uint16_t *out, const int b_stride) {
  highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride);
  highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride);
}
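// The clamp in highbd_idct16x16_add_store below is derived from the bit
// depth: max = (1 << bd) - 1, i.e. 1023 for 10-bit and 4095 for 12-bit
// streams, so one routine serves every supported depth.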
static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
                                              uint16_t *dest, const int stride,
                                              const int bd) {
  // Add the result to dest
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  int16x8_t o[16];
  o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
                      vrshrn_n_s32(out[0].val[1], 6));
  o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
                      vrshrn_n_s32(out[1].val[1], 6));
  o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
                      vrshrn_n_s32(out[2].val[1], 6));
  o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
                      vrshrn_n_s32(out[3].val[1], 6));
  o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
                      vrshrn_n_s32(out[4].val[1], 6));
  o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
                      vrshrn_n_s32(out[5].val[1], 6));
  o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
                      vrshrn_n_s32(out[6].val[1], 6));
  o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
                      vrshrn_n_s32(out[7].val[1], 6));
  o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
                      vrshrn_n_s32(out[8].val[1], 6));
  o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
                      vrshrn_n_s32(out[9].val[1], 6));
  o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
                       vrshrn_n_s32(out[10].val[1], 6));
  o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
                       vrshrn_n_s32(out[11].val[1], 6));
  o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
                       vrshrn_n_s32(out[12].val[1], 6));
  o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
                       vrshrn_n_s32(out[13].val[1], 6));
  o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
                       vrshrn_n_s32(out[14].val[1], 6));
  o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
                       vrshrn_n_s32(out[15].val[1], 6));
  highbd_idct16x16_add8x1(o[0], max, &dest, stride);
  highbd_idct16x16_add8x1(o[1], max, &dest, stride);
  highbd_idct16x16_add8x1(o[2], max, &dest, stride);
  highbd_idct16x16_add8x1(o[3], max, &dest, stride);
  highbd_idct16x16_add8x1(o[4], max, &dest, stride);
  highbd_idct16x16_add8x1(o[5], max, &dest, stride);
  highbd_idct16x16_add8x1(o[6], max, &dest, stride);
  highbd_idct16x16_add8x1(o[7], max, &dest, stride);
  highbd_idct16x16_add8x1(o[8], max, &dest, stride);
  highbd_idct16x16_add8x1(o[9], max, &dest, stride);
  highbd_idct16x16_add8x1(o[10], max, &dest, stride);
  highbd_idct16x16_add8x1(o[11], max, &dest, stride);
  highbd_idct16x16_add8x1(o[12], max, &dest, stride);
  highbd_idct16x16_add8x1(o[13], max, &dest, stride);
  highbd_idct16x16_add8x1(o[14], max, &dest, stride);
  highbd_idct16x16_add8x1(o[15], max, &dest, stride);
}

void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                  void *const dest, const int stride,
                                  const int highbd_flag);

void vpx_idct16x16_38_add_half1d(const void *const input,
                                 int16_t *const output, void *const dest,
                                 const int stride, const int highbd_flag);

void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                       int16_t *output);

void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
                                       int16_t *const output, void *const dest,
                                       const int stride, const int highbd_flag);

void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
                        const int stride, const int highbd_flag);

void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);
void vpx_idct32_16_neon(const int16_t *const input, void *const output,
                        const int stride, const int highbd_flag);

void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);
void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
                       const int highbd_flag);

#endif  // VPX_DSP_ARM_IDCT_NEON_H_