/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"

// Narrow two 32-bit accumulator vectors back to 16 bits with a rounding
// right shift by DCT_CONST_BITS (the fixed-point scaling of the cospi
// constants).
static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                int16x4_t *const d1) {
  *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
}

// Widening butterfly using lanes 3 and 1 of cospi_0_8_16_24:
//   t32[0] = s0 * lane3 - s1 * lane1
//   t32[1] = s1 * lane3 + s0 * lane1
// Results are left in 32-bit form; callers narrow via wrap_low_4x2().
static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
                                            const int16x4_t s1,
                                            const int16x4_t cospi_0_8_16_24,
                                            int32x4_t *const t32) {
  t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
  t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
  t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
  t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
}

// 64-bit-vector (4-lane) cospi_8/cospi_24 butterfly: kernel followed by the
// rounding narrow.
static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t cospi_0_8_16_24,
                                     int16x4_t *const d0, int16x4_t *const d1) {
  int32x4_t t32[2];

  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
  wrap_low_4x2(t32, d0, d1);
}

// Same as idct_cospi_8_24_d() but the second output is negated before
// narrowing (used where the transform flow graph requires the sign flip).
static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t cospi_0_8_16_24,
                                         int16x4_t *const d0,
                                         int16x4_t *const d1) {
  int32x4_t t32[2];

  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
  t32[1] = vnegq_s32(t32[1]);
  wrap_low_4x2(t32, d0, d1);
}

// 4-lane cospi_16 butterfly using lane 2 of cospi_0_8_16_24:
//   d0 = round((s1 - s0) * lane2 >> DCT_CONST_BITS)
//   d1 = round((s1 + s0) * lane2 >> DCT_CONST_BITS)
// The shared product s1 * lane2 is computed once in t32[2].
static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t cospi_0_8_16_24,
                                      int16x4_t *const d0,
                                      int16x4_t *const d1) {
  int32x4_t t32[3];

  t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
  t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
  t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
  wrap_low_4x2(t32, d0, d1);
}

// Accumulate 16 rows of 8 output samples each into an 8-bit destination.
// idct16x16_add8x1() (idct_neon.h) takes &dest, so it presumably advances
// dest by stride after each row -- see idct_neon.h for the exact contract.
static INLINE void idct16x16_add_store(const int16x8_t *const out,
                                       uint8_t *dest, const int stride) {
  // Add the result to dest
  idct16x16_add8x1(out[0], &dest, stride);
  idct16x16_add8x1(out[1], &dest, stride);
  idct16x16_add8x1(out[2], &dest, stride);
  idct16x16_add8x1(out[3], &dest, stride);
  idct16x16_add8x1(out[4], &dest, stride);
  idct16x16_add8x1(out[5], &dest, stride);
  idct16x16_add8x1(out[6], &dest, stride);
  idct16x16_add8x1(out[7], &dest, stride);
  idct16x16_add8x1(out[8], &dest, stride);
  idct16x16_add8x1(out[9], &dest, stride);
  idct16x16_add8x1(out[10], &dest, stride);
  idct16x16_add8x1(out[11], &dest, stride);
  idct16x16_add8x1(out[12], &dest, stride);
  idct16x16_add8x1(out[13], &dest, stride);
  idct16x16_add8x1(out[14], &dest, stride);
  idct16x16_add8x1(out[15], &dest, stride);
}

// High-bitdepth (16-bit buffer) variant for 8-bit content: rounds each row
// right by 6 (the idct output scaling) and accumulates into dest, clamped
// to (1 << 8) - 1 = 255 by the helper via `max`. Note this mutates `out`
// in place.
static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest,
                                           const int stride) {
  // Add the result to dest
  const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
  out[0] = vrshrq_n_s16(out[0], 6);
  out[1] = vrshrq_n_s16(out[1], 6);
  out[2] = vrshrq_n_s16(out[2], 6);
  out[3] = vrshrq_n_s16(out[3], 6);
  out[4] = vrshrq_n_s16(out[4], 6);
  out[5] = vrshrq_n_s16(out[5], 6);
  out[6] = vrshrq_n_s16(out[6], 6);
  out[7] = vrshrq_n_s16(out[7], 6);
  out[8] = vrshrq_n_s16(out[8], 6);
  out[9] = vrshrq_n_s16(out[9], 6);
  out[10] = vrshrq_n_s16(out[10], 6);
  out[11] = vrshrq_n_s16(out[11], 6);
  out[12] = vrshrq_n_s16(out[12], 6);
  out[13] = vrshrq_n_s16(out[13], 6);
  out[14] = vrshrq_n_s16(out[14], 6);
  out[15] = vrshrq_n_s16(out[15], 6);
  highbd_idct16x16_add8x1(out[0], max, &dest, stride);
  highbd_idct16x16_add8x1(out[1], max, &dest, stride);
  highbd_idct16x16_add8x1(out[2], max, &dest, stride);
  highbd_idct16x16_add8x1(out[3], max, &dest, stride);
  highbd_idct16x16_add8x1(out[4], max, &dest, stride);
  highbd_idct16x16_add8x1(out[5], max, &dest, stride);
  highbd_idct16x16_add8x1(out[6], max, &dest, stride);
  highbd_idct16x16_add8x1(out[7], max, &dest, stride);
  highbd_idct16x16_add8x1(out[8], max, &dest, stride);
  highbd_idct16x16_add8x1(out[9], max, &dest, stride);
  highbd_idct16x16_add8x1(out[10], max, &dest, stride);
  highbd_idct16x16_add8x1(out[11], max, &dest, stride);
  highbd_idct16x16_add8x1(out[12], max, &dest, stride);
  highbd_idct16x16_add8x1(out[13], max, &dest, stride);
  highbd_idct16x16_add8x1(out[14], max, &dest, stride);
  highbd_idct16x16_add8x1(out[15], max, &dest, stride);
}

// One half (8 of 16 vectors) of a 1-D pass of the full 16x16 inverse DCT.
// If `output` is non-NULL this is pass 1: `input` is tran_low_t coefficients
// and the transform result is stored to `output` via idct16x16_store_pass1().
// If `output` is NULL this is pass 2: `input` is the int16_t intermediate
// from pass 1 and the result is added into `dest` (8-bit or 16-bit buffer
// depending on `highbd_flag`).
void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                  void *const dest, const int stride,
                                  const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
  const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
  const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
  int16x8_t in[16], step1[16], step2[16], out[16];

  // Load input (16x8)
  // Rows are loaded in the order 0,8,1,9,... so that in[0..7] holds the top
  // half and in[8..15] the bottom half for the two 8x8 transposes below.
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[8] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[9] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[10] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[11] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[12] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[13] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[14] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[7] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[15] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 8;
    in[8] = vld1q_s16(inputT);
    inputT += 8;
    in[1] = vld1q_s16(inputT);
    inputT += 8;
    in[9] = vld1q_s16(inputT);
    inputT += 8;
    in[2] = vld1q_s16(inputT);
    inputT += 8;
    in[10] = vld1q_s16(inputT);
    inputT += 8;
    in[3] = vld1q_s16(inputT);
    inputT += 8;
    in[11] = vld1q_s16(inputT);
    inputT += 8;
    in[4] = vld1q_s16(inputT);
    inputT += 8;
    in[12] = vld1q_s16(inputT);
    inputT += 8;
    in[5] = vld1q_s16(inputT);
    inputT += 8;
    in[13] = vld1q_s16(inputT);
    inputT += 8;
    in[6] = vld1q_s16(inputT);
    inputT += 8;
    in[14] = vld1q_s16(inputT);
    inputT += 8;
    in[7] = vld1q_s16(inputT);
    inputT += 8;
    in[15] = vld1q_s16(inputT);
  }

  // Transpose
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);
  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
                    &in[15]);

  // stage 1
  // Bit-reversal-style reordering of the inputs; indices are written as
  // "row / 2" to mirror the reference 16-point idct row numbering.
  step1[0] = in[0 / 2];
  step1[1] = in[16 / 2];
  step1[2] = in[8 / 2];
  step1[3] = in[24 / 2];
  step1[4] = in[4 / 2];
  step1[5] = in[20 / 2];
  step1[6] = in[12 / 2];
  step1[7] = in[28 / 2];
  step1[8] = in[2 / 2];
  step1[9] = in[18 / 2];
  step1[10] = in[10 / 2];
  step1[11] = in[26 / 2];
  step1[12] = in[6 / 2];
  step1[13] = in[22 / 2];
  step1[14] = in[14 / 2];
  step1[15] = in[30 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
  idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
                   &step2[14]);
  idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
                   &step2[13]);
  idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
                  &step2[12]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
  idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
  idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}

// Same as vpx_idct16x16_256_add_half1d() but specialized for blocks with at
// most 38 nonzero coefficients: only the top-left 8x8 quadrant of the input
// is read, so the odd stages collapse to single vqrdmulhq_lane_s16 multiplies
// (the cospisd* vectors are the doubled constants needed by the saturating
// rounding doubling multiply).
void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
                                 void *const dest, const int stride,
                                 const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x8_t in[8], step1[16], step2[16], out[16];

  // Load input (8x8)
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[7] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 16;
    in[1] = vld1q_s16(inputT);
    inputT += 16;
    in[2] = vld1q_s16(inputT);
    inputT += 16;
    in[3] = vld1q_s16(inputT);
    inputT += 16;
    in[4] = vld1q_s16(inputT);
    inputT += 16;
    in[5] = vld1q_s16(inputT);
    inputT += 16;
    in[6] = vld1q_s16(inputT);
    inputT += 16;
    in[7] = vld1q_s16(inputT);
  }

  // Transpose
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1
  step1[0] = in[0 / 2];
  step1[2] = in[8 / 2];
  step1[4] = in[4 / 2];
  step1[6] = in[12 / 2];
  step1[8] = in[2 / 2];
  step1[10] = in[10 / 2];
  step1[12] = in[6 / 2];
  step1[14] = in[14 / 2];  // 0 in pass 1

  // stage 2
  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[4] = step1[4];
  step2[6] = step1[6];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
  step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
  step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
  step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
  step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}

// Pass 1 for blocks with at most 10 nonzero coefficients: only the top-left
// 4x4 of the input is read, processed with 64-bit (4-lane) vectors, and the
// 16x4 intermediate is stored to `output`.
void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                       int16_t *output) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t in[4], step1[16], step2[16], out[16];

  // Load input (4x4)
  in[0] = load_tran_low_to_s16d(input);
  input += 16;
  in[1] = load_tran_low_to_s16d(input);
  input += 16;
  in[2] = load_tran_low_to_s16d(input);
  input += 16;
  in[3] = load_tran_low_to_s16d(input);

  // Transpose
  transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  step1[0] = step2[0];
  step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vadd_s16(step2[8], step2[11]);
  step1[9] = vadd_s16(step2[9], step2[10]);
  step1[10] = vsub_s16(step2[9], step2[10]);
  step1[11] = vsub_s16(step2[8], step2[11]);
  step1[12] = vsub_s16(step2[15], step2[12]);
  step1[13] = vsub_s16(step2[14], step2[13]);
  step1[14] = vadd_s16(step2[14], step2[13]);
  step1[15] = vadd_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vadd_s16(step1[0], step1[7]);
  step2[1] = vadd_s16(step1[1], step1[6]);
  step2[2] = vadd_s16(step1[2], step1[5]);
  step2[3] = vadd_s16(step1[3], step1[4]);
  step2[4] = vsub_s16(step1[3], step1[4]);
  step2[5] = vsub_s16(step1[2], step1[5]);
  step2[6] = vsub_s16(step1[1], step1[6]);
  step2[7] = vsub_s16(step1[0], step1[7]);
  idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  out[0] = vadd_s16(step2[0], step2[15]);
  out[1] = vadd_s16(step2[1], step2[14]);
  out[2] = vadd_s16(step2[2], step2[13]);
  out[3] = vadd_s16(step2[3], step2[12]);
  out[4] = vadd_s16(step2[4], step2[11]);
  out[5] = vadd_s16(step2[5], step2[10]);
  out[6] = vadd_s16(step2[6], step2[9]);
  out[7] = vadd_s16(step2[7], step2[8]);
  out[8] = vsub_s16(step2[7], step2[8]);
  out[9] = vsub_s16(step2[6], step2[9]);
  out[10] = vsub_s16(step2[5], step2[10]);
  out[11] = vsub_s16(step2[4], step2[11]);
  out[12] = vsub_s16(step2[3], step2[12]);
  out[13] = vsub_s16(step2[2], step2[13]);
  out[14] = vsub_s16(step2[1], step2[14]);
  out[15] = vsub_s16(step2[0], step2[15]);

  // pass 1: save the result into output
  vst1_s16(output, out[0]);
  output += 4;
  vst1_s16(output, out[1]);
  output += 4;
  vst1_s16(output, out[2]);
  output += 4;
  vst1_s16(output, out[3]);
  output += 4;
  vst1_s16(output, out[4]);
  output += 4;
  vst1_s16(output, out[5]);
  output += 4;
  vst1_s16(output, out[6]);
  output += 4;
  vst1_s16(output, out[7]);
  output += 4;
  vst1_s16(output, out[8]);
  output += 4;
  vst1_s16(output, out[9]);
  output += 4;
  vst1_s16(output, out[10]);
  output += 4;
  vst1_s16(output, out[11]);
  output += 4;
  vst1_s16(output, out[12]);
  output += 4;
  vst1_s16(output, out[13]);
  output += 4;
  vst1_s16(output, out[14]);
  output += 4;
  vst1_s16(output, out[15]);
}

// Pass 2 for the <=10 coefficient case: reads the 16x4 int16_t intermediate
// from pass 1 (transposing 4x8 chunks into 128-bit vectors) and either stores
// to `output` or adds the final result into `dest`, as in the other half1d
// functions.
void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
                                       int16_t *const output, void *const dest,
                                       const int stride,
                                       const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t ind[8];
  int16x8_t in[4], step1[16], step2[16], out[16];

  // Load input (4x8)
  ind[0] = vld1_s16(input);
  input += 4;
  ind[1] = vld1_s16(input);
  input += 4;
  ind[2] = vld1_s16(input);
  input += 4;
  ind[3] = vld1_s16(input);
  input += 4;
  ind[4] = vld1_s16(input);
  input += 4;
  ind[5] = vld1_s16(input);
  input += 4;
  ind[6] = vld1_s16(input);
  input += 4;
  ind[7] = vld1_s16(input);

  // Transpose
  transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
                    ind[7], &in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  step1[0] = step2[0];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}

// Full 16x16 inverse DCT + add for the general (up to 256 coefficient) case:
// two half-width row passes into a 16x16 intermediate, then two half-width
// column passes adding into dest.
void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  int16_t row_idct_output[16 * 16];

  // pass 1
  // Parallel idct on the upper 8 rows
  vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);

  // Parallel idct on the lower 8 rows
  vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
                               stride, 0);

  // pass 2
  // Parallel idct to get the left 8 columns
  vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);

  // Parallel idct to get the right 8 columns
  vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
                               0);
}

// 16x16 inverse DCT + add for <=38 nonzero coefficients: only one row pass is
// needed since the lower 8 rows of coefficients are all zero.
void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int16_t row_idct_output[16 * 16];

  // pass 1
  // Parallel idct on the upper 8 rows
  vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);

  // pass 2
  // Parallel idct to get the left 8 columns
  vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);

  // Parallel idct to get the right 8 columns
  vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
                              0);
}

// 16x16 inverse DCT + add for <=10 nonzero coefficients: a single 4-row pass
// into a 16x4 intermediate, then two column passes adding into dest.
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int16_t row_idct_output[4 * 16];

  // pass 1
  // Parallel idct on the upper 8 rows
  vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);

  // pass 2
  // Parallel idct to get the left 8 columns
  vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);

  // Parallel idct to get the right 8 columns
  vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
                                    stride, 0);
}